diff --git a/src/changes/changes.xml b/src/changes/changes.xml index 62f682eb6..ad6f88a54 100644 --- a/src/changes/changes.xml +++ b/src/changes/changes.xml @@ -54,9 +54,14 @@ If the output is not quite correct, check for invisible trailing spaces! + + Improved performance to calculate the two-sample Kolmogorov-Smirnov test + via monte carlo simulation ("KolmogorovSmirnovTets#monteCarloP(...)"). + - A "StackOverflowException" was thrown when passing Double.NaN or infinity to "Gamma#digamma(double)" - or "Gamma#trigamma(double)". Now the input value is propagated to the output if it is not a real number. + A "StackOverflowException" was thrown when passing Double.NaN or + infinity to "Gamma#digamma(double)" or "Gamma#trigamma(double)". + Now the input value is propagated to the output if it is not a real number. Improved performance of calculating the two-sample Kolmogorov-Smirnov diff --git a/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java b/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java index a664cda37..4e52cd672 100644 --- a/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java +++ b/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java @@ -951,51 +951,47 @@ public class KolmogorovSmirnovTest { * @return proportion of randomly generated m-n partitions of m + n that result in \(D_{n,m}\) * greater than (resp. greater than or equal to) {@code d} */ - public double monteCarloP(double d, int n, int m, boolean strict, int iterations) { - final int[] nPlusMSet = MathArrays.natural(m + n); - final double[] nSet = new double[n]; - final double[] mSet = new double[m]; + public double monteCarloP(final double d, final int n, final int m, final boolean strict, + final int iterations) { + + // ensure that nn is always the max of (n, m) to require fewer random numbers + final int nn = FastMath.max(n, m); + final int mm = FastMath.min(n, m); + final int sum = nn + mm; + int tail = 0; + final boolean b[] = new boolean[sum]; for (int i = 0; i < iterations; i++) { - copyPartition(nSet, mSet, nPlusMSet, n, m); - final double curD = kolmogorovSmirnovStatistic(nSet, mSet); - if (curD > d) { - tail++; - } else if (curD == d && !strict) { - tail++; + // Shuffle n true values and m false values using Fisher-Yates shuffle algorithm. + // By processing first the n true values followed by the m false values the + // shuffle algorithm can be simplified annd requires fewer random numbers. + Arrays.fill(b, true); + for (int k = nn; k < sum; k++) { + int r = rng.nextInt(k); + b[(b[r]) ? r : k] = false; + } + + int rankN = b[0] ? 1 : 0; + int rankM = b[0] ? 0 : 1; + boolean previous = b[0]; + for(int j = 1; j < b.length; ++j) { + if (b[j] != previous) { + final double cdf_n = rankN / (double) nn; + final double cdf_m = rankM / (double) mm; + final double curD = FastMath.abs(cdf_n - cdf_m); + if (curD > d || (curD == d && !strict)) { + tail++; + break; + } + } + previous = b[j]; + if (b[j]) { + rankN++; + } else { + rankM++; + } } - MathArrays.shuffle(nPlusMSet, rng); - Arrays.sort(nPlusMSet, 0, n); } return (double) tail / iterations; } - - /** - * Copies the first {@code n} elements of {@code nSetI} into {@code nSet} and its complement - * relative to {@code m + n} into {@code mSet}. For example, if {@code m = 3}, {@code n = 3} and - * {@code nSetI = [1,4,5,2,3,0]} then after this method returns, we will have - * {@code nSet = [1,4,5], mSet = [0,2,3]}. - *

- * Precondition: The first {@code n} elements of {@code nSetI} must be sorted - * in ascending order. - *

- * - * @param nSet array to fill with the first {@code n} elements of {@code nSetI} - * @param mSet array to fill with the {@code m} complementary elements of {@code nSet} relative - * to {@code m + n} - * @param nSetI array whose first {@code n} elements specify the members of {@code nSet} - * @param n number of elements in the first output array - * @param m number of elements in the second output array - */ - private void copyPartition(double[] nSet, double[] mSet, int[] nSetI, int n, int m) { - int j = 0; - int k = 0; - for (int i = 0; i < n + m; i++) { - if (j < n && nSetI[j] == i) { - nSet[j++] = i; - } else { - mSet[k++] = i; - } - } - } } diff --git a/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java b/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java index 3b1c10b5e..f18640b5d 100644 --- a/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java +++ b/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java @@ -304,6 +304,37 @@ public class KolmogorovSmirnovTestTest { } } + @Test + public void testTwoSampleMonteCarloDifferentSampleSizes() { + final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest(new Well19937c(1000)); + final int sampleSize1 = 14; + final int sampleSize2 = 7; + final double d = 0.3; + final boolean strict = false; + final double tol = 1e-2; + Assert.assertEquals(test.exactP(d, sampleSize1, sampleSize2, strict), + test.monteCarloP(d, sampleSize1, sampleSize2, strict, + KolmogorovSmirnovTest.MONTE_CARLO_ITERATIONS), + tol); + } + + /** + * Performance test for monteCarlo method. Disabled by default. + */ + // @Test + public void testTwoSampleMonteCarloPerformance() { + int numIterations = 100_000; + int N = (int)Math.sqrt(KolmogorovSmirnovTest.LARGE_SAMPLE_PRODUCT); + final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest(new Well19937c(1000)); + for (int n = 2; n <= N; ++n) { + long startMillis = System.currentTimeMillis(); + int m = KolmogorovSmirnovTest.LARGE_SAMPLE_PRODUCT/n; + Assert.assertEquals(0d, test.monteCarloP(Double.POSITIVE_INFINITY, n, m, true, numIterations), 0d); + long endMillis = System.currentTimeMillis(); + System.out.println("n=" + n + ", m=" + m + ", time=" + (endMillis-startMillis)/1000d + "s"); + } + } + @Test public void testTwoSampleWithManyTies() { // MATH-1197