[MATH-1242] Improve performance of KolmogorovSmirnov two-sample test via monte carlo simulation. Thanks to Otmar Ertl.

2015-06-28 10:45:23 +02:00 · 2015-06-28 10:45:23 +02:00 · 6d7ee38cee
parent 471e6b078a
commit 6d7ee38cee
3 changed files with 75 additions and 43 deletions
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@ -54,9 +54,14 @@ If the output is not quite correct, check for invisible trailing spaces!
    </release>

    <release version="4.0" date="XXXX-XX-XX" description="">
+      <action dev="tn" type="fix" issue="MATH-1242" due-to="Otmar Ertl"> <!-- backported to 3.6 -->
+        Improved performance to calculate the two-sample Kolmogorov-Smirnov test
+        via monte carlo simulation ("KolmogorovSmirnovTets#monteCarloP(...)").
+      </action>
      <action dev="tn" type="fix" issue="MATH-1241" due-to="Aleksei Dievskii"> <!-- backported to 3.6 -->
-        A "StackOverflowException" was thrown when passing Double.NaN or infinity to "Gamma#digamma(double)"
-        or "Gamma#trigamma(double)". Now the input value is propagated to the output if it is not a real number.
+        A "StackOverflowException" was thrown when passing Double.NaN or
+        infinity to "Gamma#digamma(double)" or "Gamma#trigamma(double)".
+        Now the input value is propagated to the output if it is not a real number.
      </action>
      <action dev="tn" type="fix" issue="MATH-1232" due-to="Otmar Ertl"> <!-- backported to 3.6 -->
        Improved performance of calculating the two-sample Kolmogorov-Smirnov
--- a/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java
+++ b/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java
@ -951,51 +951,47 @@ public class KolmogorovSmirnovTest {
     * @return proportion of randomly generated m-n partitions of m + n that result in \(D_{n,m}\)
     *         greater than (resp. greater than or equal to) {@code d}
     */
-    public double monteCarloP(double d, int n, int m, boolean strict, int iterations) {
-        final int[] nPlusMSet = MathArrays.natural(m + n);
-        final double[] nSet = new double[n];
-        final double[] mSet = new double[m];
+    public double monteCarloP(final double d, final int n, final int m, final boolean strict,
+                              final int iterations) {
+
+        // ensure that nn is always the max of (n, m) to require fewer random numbers
+        final int nn = FastMath.max(n, m);
+        final int mm = FastMath.min(n, m);
+        final int sum = nn + mm;
+
        int tail = 0;
+        final boolean b[] = new boolean[sum];
        for (int i = 0; i < iterations; i++) {
-            copyPartition(nSet, mSet, nPlusMSet, n, m);
-            final double curD = kolmogorovSmirnovStatistic(nSet, mSet);
-            if (curD > d) {
-                tail++;
-            } else if (curD == d && !strict) {
-                tail++;
+            // Shuffle n true values and m false values using Fisher-Yates shuffle algorithm.
+            // By processing first the n true values followed by the m false values the
+            // shuffle algorithm can be simplified annd requires fewer random numbers.
+            Arrays.fill(b, true);
+            for (int k = nn; k < sum; k++) {
+                int r = rng.nextInt(k);
+                b[(b[r]) ? r : k] = false;
+            }
+
+            int rankN = b[0] ? 1 : 0;
+            int rankM = b[0] ? 0 : 1;
+            boolean previous = b[0];
+            for(int j = 1; j < b.length; ++j) {
+                if (b[j] != previous) {
+                    final double cdf_n = rankN / (double) nn;
+                    final double cdf_m = rankM / (double) mm;
+                    final double curD = FastMath.abs(cdf_n - cdf_m);
+                    if (curD > d || (curD == d && !strict)) {
+                        tail++;
+                        break;
+                    }
+                }
+                previous = b[j];
+                if (b[j]) {
+                    rankN++;
+                } else {
+                    rankM++;
+                }
            }
-            MathArrays.shuffle(nPlusMSet, rng);
-            Arrays.sort(nPlusMSet, 0, n);
        }
        return (double) tail / iterations;
    }
-
-    /**
-     * Copies the first {@code n} elements of {@code nSetI} into {@code nSet} and its complement
-     * relative to {@code m + n} into {@code mSet}. For example, if {@code m = 3}, {@code n = 3} and
-     * {@code nSetI = [1,4,5,2,3,0]} then after this method returns, we will have
-     * {@code nSet = [1,4,5], mSet = [0,2,3]}.
-     * <p>
-     * <strong>Precondition:</strong> The first {@code n} elements of {@code nSetI} must be sorted
-     * in ascending order.
-     * </p>
-     *
-     * @param nSet array to fill with the first {@code n} elements of {@code nSetI}
-     * @param mSet array to fill with the {@code m} complementary elements of {@code nSet} relative
-     *        to {@code m + n}
-     * @param nSetI array whose first {@code n} elements specify the members of {@code nSet}
-     * @param n number of elements in the first output array
-     * @param m number of elements in the second output array
-     */
-    private void copyPartition(double[] nSet, double[] mSet, int[] nSetI, int n, int m) {
-        int j = 0;
-        int k = 0;
-        for (int i = 0; i < n + m; i++) {
-            if (j < n && nSetI[j] == i) {
-                nSet[j++] = i;
-            } else {
-                mSet[k++] = i;
-            }
-        }
-    }
 }
--- a/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java
+++ b/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java
@ -304,6 +304,37 @@ public class KolmogorovSmirnovTestTest {
        }
    }

+    @Test
+    public void testTwoSampleMonteCarloDifferentSampleSizes() {
+        final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest(new Well19937c(1000));
+        final int sampleSize1 = 14;
+        final int sampleSize2 = 7;
+        final double d = 0.3;
+        final boolean strict = false;
+        final double tol = 1e-2;
+        Assert.assertEquals(test.exactP(d, sampleSize1, sampleSize2, strict),
+                            test.monteCarloP(d, sampleSize1, sampleSize2, strict,
+                                             KolmogorovSmirnovTest.MONTE_CARLO_ITERATIONS),
+                            tol);
+    }
+
+    /**
+     * Performance test for monteCarlo method. Disabled by default.
+     */
+    // @Test
+    public void testTwoSampleMonteCarloPerformance() {
+        int numIterations = 100_000;
+        int N = (int)Math.sqrt(KolmogorovSmirnovTest.LARGE_SAMPLE_PRODUCT);
+        final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest(new Well19937c(1000));
+        for (int n = 2; n <= N; ++n) {
+            long startMillis = System.currentTimeMillis();
+            int m = KolmogorovSmirnovTest.LARGE_SAMPLE_PRODUCT/n;
+            Assert.assertEquals(0d, test.monteCarloP(Double.POSITIVE_INFINITY, n, m, true, numIterations), 0d);
+            long endMillis = System.currentTimeMillis();
+            System.out.println("n=" + n + ", m=" + m + ", time=" + (endMillis-startMillis)/1000d + "s");
+        }
+    }
+
    @Test
    public void testTwoSampleWithManyTies() {
        // MATH-1197