[MATH-1242] Improve performance of KolmogorovSmirnov two-sample test via monte carlo simulation. Thanks to Otmar Ertl.

This commit is contained in:
Thomas Neidhart 2015-06-28 10:45:23 +02:00
parent 471e6b078a
commit 6d7ee38cee
3 changed files with 75 additions and 43 deletions

View File

@ -54,9 +54,14 @@ If the output is not quite correct, check for invisible trailing spaces!
</release>
<release version="4.0" date="XXXX-XX-XX" description="">
<action dev="tn" type="fix" issue="MATH-1242" due-to="Otmar Ertl"> <!-- backported to 3.6 -->
Improved performance to calculate the two-sample Kolmogorov-Smirnov test
via monte carlo simulation ("KolmogorovSmirnovTets#monteCarloP(...)").
</action>
<action dev="tn" type="fix" issue="MATH-1241" due-to="Aleksei Dievskii"> <!-- backported to 3.6 -->
A "StackOverflowException" was thrown when passing Double.NaN or infinity to "Gamma#digamma(double)"
or "Gamma#trigamma(double)". Now the input value is propagated to the output if it is not a real number.
A "StackOverflowException" was thrown when passing Double.NaN or
infinity to "Gamma#digamma(double)" or "Gamma#trigamma(double)".
Now the input value is propagated to the output if it is not a real number.
</action>
<action dev="tn" type="fix" issue="MATH-1232" due-to="Otmar Ertl"> <!-- backported to 3.6 -->
Improved performance of calculating the two-sample Kolmogorov-Smirnov

View File

@ -951,51 +951,47 @@ public class KolmogorovSmirnovTest {
* @return proportion of randomly generated m-n partitions of m + n that result in \(D_{n,m}\)
* greater than (resp. greater than or equal to) {@code d}
*/
public double monteCarloP(double d, int n, int m, boolean strict, int iterations) {
final int[] nPlusMSet = MathArrays.natural(m + n);
final double[] nSet = new double[n];
final double[] mSet = new double[m];
public double monteCarloP(final double d, final int n, final int m, final boolean strict,
final int iterations) {
// ensure that nn is always the max of (n, m) to require fewer random numbers
final int nn = FastMath.max(n, m);
final int mm = FastMath.min(n, m);
final int sum = nn + mm;
int tail = 0;
final boolean b[] = new boolean[sum];
for (int i = 0; i < iterations; i++) {
copyPartition(nSet, mSet, nPlusMSet, n, m);
final double curD = kolmogorovSmirnovStatistic(nSet, mSet);
if (curD > d) {
tail++;
} else if (curD == d && !strict) {
tail++;
// Shuffle n true values and m false values using Fisher-Yates shuffle algorithm.
// By processing first the n true values followed by the m false values the
// shuffle algorithm can be simplified annd requires fewer random numbers.
Arrays.fill(b, true);
for (int k = nn; k < sum; k++) {
int r = rng.nextInt(k);
b[(b[r]) ? r : k] = false;
}
int rankN = b[0] ? 1 : 0;
int rankM = b[0] ? 0 : 1;
boolean previous = b[0];
for(int j = 1; j < b.length; ++j) {
if (b[j] != previous) {
final double cdf_n = rankN / (double) nn;
final double cdf_m = rankM / (double) mm;
final double curD = FastMath.abs(cdf_n - cdf_m);
if (curD > d || (curD == d && !strict)) {
tail++;
break;
}
}
previous = b[j];
if (b[j]) {
rankN++;
} else {
rankM++;
}
}
MathArrays.shuffle(nPlusMSet, rng);
Arrays.sort(nPlusMSet, 0, n);
}
return (double) tail / iterations;
}
/**
* Copies the first {@code n} elements of {@code nSetI} into {@code nSet} and its complement
* relative to {@code m + n} into {@code mSet}. For example, if {@code m = 3}, {@code n = 3} and
* {@code nSetI = [1,4,5,2,3,0]} then after this method returns, we will have
* {@code nSet = [1,4,5], mSet = [0,2,3]}.
* <p>
* <strong>Precondition:</strong> The first {@code n} elements of {@code nSetI} must be sorted
* in ascending order.
* </p>
*
* @param nSet array to fill with the first {@code n} elements of {@code nSetI}
* @param mSet array to fill with the {@code m} complementary elements of {@code nSet} relative
* to {@code m + n}
* @param nSetI array whose first {@code n} elements specify the members of {@code nSet}
* @param n number of elements in the first output array
* @param m number of elements in the second output array
*/
private void copyPartition(double[] nSet, double[] mSet, int[] nSetI, int n, int m) {
int j = 0;
int k = 0;
for (int i = 0; i < n + m; i++) {
if (j < n && nSetI[j] == i) {
nSet[j++] = i;
} else {
mSet[k++] = i;
}
}
}
}

View File

@ -304,6 +304,37 @@ public class KolmogorovSmirnovTestTest {
}
}
@Test
public void testTwoSampleMonteCarloDifferentSampleSizes() {
final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest(new Well19937c(1000));
final int sampleSize1 = 14;
final int sampleSize2 = 7;
final double d = 0.3;
final boolean strict = false;
final double tol = 1e-2;
Assert.assertEquals(test.exactP(d, sampleSize1, sampleSize2, strict),
test.monteCarloP(d, sampleSize1, sampleSize2, strict,
KolmogorovSmirnovTest.MONTE_CARLO_ITERATIONS),
tol);
}
/**
* Performance test for monteCarlo method. Disabled by default.
*/
// @Test
public void testTwoSampleMonteCarloPerformance() {
int numIterations = 100_000;
int N = (int)Math.sqrt(KolmogorovSmirnovTest.LARGE_SAMPLE_PRODUCT);
final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest(new Well19937c(1000));
for (int n = 2; n <= N; ++n) {
long startMillis = System.currentTimeMillis();
int m = KolmogorovSmirnovTest.LARGE_SAMPLE_PRODUCT/n;
Assert.assertEquals(0d, test.monteCarloP(Double.POSITIVE_INFINITY, n, m, true, numIterations), 0d);
long endMillis = System.currentTimeMillis();
System.out.println("n=" + n + ", m=" + m + ", time=" + (endMillis-startMillis)/1000d + "s");
}
}
@Test
public void testTwoSampleWithManyTies() {
// MATH-1197