[MATH-1242] Improve performance of KolmogorovSmirnov two-sample test via monte carlo simulation. Thanks to Otmar Ertl.
This commit is contained in:
parent
471e6b078a
commit
6d7ee38cee
|
@ -54,9 +54,14 @@ If the output is not quite correct, check for invisible trailing spaces!
|
|||
</release>
|
||||
|
||||
<release version="4.0" date="XXXX-XX-XX" description="">
|
||||
<action dev="tn" type="fix" issue="MATH-1242" due-to="Otmar Ertl"> <!-- backported to 3.6 -->
|
||||
Improved performance to calculate the two-sample Kolmogorov-Smirnov test
|
||||
via monte carlo simulation ("KolmogorovSmirnovTets#monteCarloP(...)").
|
||||
</action>
|
||||
<action dev="tn" type="fix" issue="MATH-1241" due-to="Aleksei Dievskii"> <!-- backported to 3.6 -->
|
||||
A "StackOverflowException" was thrown when passing Double.NaN or infinity to "Gamma#digamma(double)"
|
||||
or "Gamma#trigamma(double)". Now the input value is propagated to the output if it is not a real number.
|
||||
A "StackOverflowException" was thrown when passing Double.NaN or
|
||||
infinity to "Gamma#digamma(double)" or "Gamma#trigamma(double)".
|
||||
Now the input value is propagated to the output if it is not a real number.
|
||||
</action>
|
||||
<action dev="tn" type="fix" issue="MATH-1232" due-to="Otmar Ertl"> <!-- backported to 3.6 -->
|
||||
Improved performance of calculating the two-sample Kolmogorov-Smirnov
|
||||
|
|
|
@ -951,51 +951,47 @@ public class KolmogorovSmirnovTest {
|
|||
* @return proportion of randomly generated m-n partitions of m + n that result in \(D_{n,m}\)
|
||||
* greater than (resp. greater than or equal to) {@code d}
|
||||
*/
|
||||
public double monteCarloP(double d, int n, int m, boolean strict, int iterations) {
|
||||
final int[] nPlusMSet = MathArrays.natural(m + n);
|
||||
final double[] nSet = new double[n];
|
||||
final double[] mSet = new double[m];
|
||||
public double monteCarloP(final double d, final int n, final int m, final boolean strict,
|
||||
final int iterations) {
|
||||
|
||||
// ensure that nn is always the max of (n, m) to require fewer random numbers
|
||||
final int nn = FastMath.max(n, m);
|
||||
final int mm = FastMath.min(n, m);
|
||||
final int sum = nn + mm;
|
||||
|
||||
int tail = 0;
|
||||
final boolean b[] = new boolean[sum];
|
||||
for (int i = 0; i < iterations; i++) {
|
||||
copyPartition(nSet, mSet, nPlusMSet, n, m);
|
||||
final double curD = kolmogorovSmirnovStatistic(nSet, mSet);
|
||||
if (curD > d) {
|
||||
tail++;
|
||||
} else if (curD == d && !strict) {
|
||||
tail++;
|
||||
// Shuffle n true values and m false values using Fisher-Yates shuffle algorithm.
|
||||
// By processing first the n true values followed by the m false values the
|
||||
// shuffle algorithm can be simplified annd requires fewer random numbers.
|
||||
Arrays.fill(b, true);
|
||||
for (int k = nn; k < sum; k++) {
|
||||
int r = rng.nextInt(k);
|
||||
b[(b[r]) ? r : k] = false;
|
||||
}
|
||||
|
||||
int rankN = b[0] ? 1 : 0;
|
||||
int rankM = b[0] ? 0 : 1;
|
||||
boolean previous = b[0];
|
||||
for(int j = 1; j < b.length; ++j) {
|
||||
if (b[j] != previous) {
|
||||
final double cdf_n = rankN / (double) nn;
|
||||
final double cdf_m = rankM / (double) mm;
|
||||
final double curD = FastMath.abs(cdf_n - cdf_m);
|
||||
if (curD > d || (curD == d && !strict)) {
|
||||
tail++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
previous = b[j];
|
||||
if (b[j]) {
|
||||
rankN++;
|
||||
} else {
|
||||
rankM++;
|
||||
}
|
||||
}
|
||||
MathArrays.shuffle(nPlusMSet, rng);
|
||||
Arrays.sort(nPlusMSet, 0, n);
|
||||
}
|
||||
return (double) tail / iterations;
|
||||
}
|
||||
|
||||
/**
|
||||
* Copies the first {@code n} elements of {@code nSetI} into {@code nSet} and its complement
|
||||
* relative to {@code m + n} into {@code mSet}. For example, if {@code m = 3}, {@code n = 3} and
|
||||
* {@code nSetI = [1,4,5,2,3,0]} then after this method returns, we will have
|
||||
* {@code nSet = [1,4,5], mSet = [0,2,3]}.
|
||||
* <p>
|
||||
* <strong>Precondition:</strong> The first {@code n} elements of {@code nSetI} must be sorted
|
||||
* in ascending order.
|
||||
* </p>
|
||||
*
|
||||
* @param nSet array to fill with the first {@code n} elements of {@code nSetI}
|
||||
* @param mSet array to fill with the {@code m} complementary elements of {@code nSet} relative
|
||||
* to {@code m + n}
|
||||
* @param nSetI array whose first {@code n} elements specify the members of {@code nSet}
|
||||
* @param n number of elements in the first output array
|
||||
* @param m number of elements in the second output array
|
||||
*/
|
||||
private void copyPartition(double[] nSet, double[] mSet, int[] nSetI, int n, int m) {
|
||||
int j = 0;
|
||||
int k = 0;
|
||||
for (int i = 0; i < n + m; i++) {
|
||||
if (j < n && nSetI[j] == i) {
|
||||
nSet[j++] = i;
|
||||
} else {
|
||||
mSet[k++] = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -304,6 +304,37 @@ public class KolmogorovSmirnovTestTest {
|
|||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testTwoSampleMonteCarloDifferentSampleSizes() {
|
||||
final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest(new Well19937c(1000));
|
||||
final int sampleSize1 = 14;
|
||||
final int sampleSize2 = 7;
|
||||
final double d = 0.3;
|
||||
final boolean strict = false;
|
||||
final double tol = 1e-2;
|
||||
Assert.assertEquals(test.exactP(d, sampleSize1, sampleSize2, strict),
|
||||
test.monteCarloP(d, sampleSize1, sampleSize2, strict,
|
||||
KolmogorovSmirnovTest.MONTE_CARLO_ITERATIONS),
|
||||
tol);
|
||||
}
|
||||
|
||||
/**
|
||||
* Performance test for monteCarlo method. Disabled by default.
|
||||
*/
|
||||
// @Test
|
||||
public void testTwoSampleMonteCarloPerformance() {
|
||||
int numIterations = 100_000;
|
||||
int N = (int)Math.sqrt(KolmogorovSmirnovTest.LARGE_SAMPLE_PRODUCT);
|
||||
final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest(new Well19937c(1000));
|
||||
for (int n = 2; n <= N; ++n) {
|
||||
long startMillis = System.currentTimeMillis();
|
||||
int m = KolmogorovSmirnovTest.LARGE_SAMPLE_PRODUCT/n;
|
||||
Assert.assertEquals(0d, test.monteCarloP(Double.POSITIVE_INFINITY, n, m, true, numIterations), 0d);
|
||||
long endMillis = System.currentTimeMillis();
|
||||
System.out.println("n=" + n + ", m=" + m + ", time=" + (endMillis-startMillis)/1000d + "s");
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testTwoSampleWithManyTies() {
|
||||
// MATH-1197
|
||||
|
|
Loading…
Reference in New Issue