Improved performance and accuracy of 2-sample KS tests. JIRA: MATH-1310.
This commit is contained in:
parent
8bcf7e23a6
commit
e38bbb9f41
|
@ -54,6 +54,9 @@ If the output is not quite correct, check for invisible trailing spaces!
|
|||
</release>
|
||||
|
||||
<release version="4.0" date="XXXX-XX-XX" description="">
|
||||
<action dev="psteitz" type="fix" issue="MATH-1310"> <!-- backported to 3.6 -->
|
||||
Improved performance and accuracy of 2-sample KolmogorovSmirnov tests.
|
||||
</action>
|
||||
<action dev="erans" type="fix" issue="MATH-1308">
|
||||
Removed obsolete class "AbstractRandomGenerator" (package "o.a.c.m.random").
|
||||
</action>
|
||||
|
|
|
@ -20,7 +20,6 @@ package org.apache.commons.math4.stat.inference;
|
|||
import java.math.BigDecimal;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
|
||||
import org.apache.commons.math4.distribution.EnumeratedRealDistribution;
|
||||
import org.apache.commons.math4.distribution.RealDistribution;
|
||||
|
@ -69,14 +68,9 @@ import org.apache.commons.math4.util.MathUtils;
|
|||
* default 2-sample test method, {@link #kolmogorovSmirnovTest(double[], double[])} works as
|
||||
* follows:
|
||||
* <ul>
|
||||
* <li>For very small samples (where the product of the sample sizes is less than
|
||||
* {@value #SMALL_SAMPLE_PRODUCT}), the exact distribution is used to compute the p-value for the
|
||||
* 2-sample test.</li>
|
||||
* <li>For mid-size samples (product of sample sizes greater than or equal to
|
||||
* {@value #SMALL_SAMPLE_PRODUCT} but less than {@value #LARGE_SAMPLE_PRODUCT}), Monte Carlo
|
||||
* simulation is used to compute the p-value. The simulation randomly generates partitions of \(m +
|
||||
* n\) into an \(m\)-set and an \(n\)-set and reports the proportion that give \(D\) values
|
||||
* exceeding the observed value.</li>
|
||||
* <li>For small samples (where the product of the sample sizes is less than
|
||||
* {@value #LARGE_SAMPLE_PRODUCT}), the method presented in [4] is used to compute the
|
||||
* exact p-value for the 2-sample test.</li>
|
||||
* <li>When the product of the sample sizes exceeds {@value #LARGE_SAMPLE_PRODUCT}, the asymptotic
|
||||
* distribution of \(D_{n,m}\) is used. See {@link #approximateP(double, int, int)} for details on
|
||||
* the approximation.</li>
|
||||
|
@ -97,8 +91,6 @@ import org.apache.commons.math4.util.MathUtils;
|
|||
* The methods used by the 2-sample default implementation are also exposed directly:
|
||||
* <ul>
|
||||
* <li>{@link #exactP(double, int, int, boolean)} computes exact 2-sample p-values</li>
|
||||
* <li>{@link #monteCarloP(double, int, int, boolean, int)} computes 2-sample p-values by Monte
|
||||
* Carlo simulation</li>
|
||||
* <li>{@link #approximateP(double, int, int)} uses the asymptotic distribution The {@code boolean}
|
||||
* arguments in the first two methods allow the probability used to estimate the p-value to be
|
||||
* expressed using strict or non-strict inequality. See
|
||||
|
@ -115,6 +107,8 @@ import org.apache.commons.math4.util.MathUtils;
|
|||
* <li>[3] Jasjeet S. Sekhon. 2011. <a href="http://www.jstatsoft.org/article/view/v042i07">
|
||||
* Multivariate and Propensity Score Matching Software with Automated Balance Optimization:
|
||||
* The Matching package for R</a> Journal of Statistical Software, 42(7): 1-52.</li>
|
||||
* <li>[4] Wilcox, Rand. 2012. Introduction to Robust Estimation and Hypothesis Testing,
|
||||
* Chapter 5, 3rd Ed. Academic Press.</li>
|
||||
* </ul>
|
||||
* <br/>
|
||||
* Note that [1] contains an error in computing h, refer to <a
|
||||
|
@ -136,16 +130,19 @@ public class KolmogorovSmirnovTest {
|
|||
/** Convergence criterion for the sums in #pelzGood(double, double, int)} */
|
||||
protected static final double PG_SUM_RELATIVE_ERROR = 1.0e-10;
|
||||
|
||||
/** When product of sample sizes is less than this value, 2-sample K-S test is exact */
|
||||
/** No longer used. */
|
||||
@Deprecated
|
||||
protected static final int SMALL_SAMPLE_PRODUCT = 200;
|
||||
|
||||
/**
|
||||
* When product of sample sizes exceeds this value, 2-sample K-S test uses asymptotic
|
||||
* distribution for strict inequality p-value.
|
||||
* distribution to compute the p-value.
|
||||
*/
|
||||
protected static final int LARGE_SAMPLE_PRODUCT = 10000;
|
||||
|
||||
/** Default number of iterations used by {@link #monteCarloP(double, int, int, boolean, int)} */
|
||||
/** Default number of iterations used by {@link #monteCarloP(double, int, int, boolean, int)}.
|
||||
* Deprecated as of version 3.6, as this method is no longer needed. */
|
||||
@Deprecated
|
||||
protected static final int MONTE_CARLO_ITERATIONS = 1000000;
|
||||
|
||||
/** Random data generator used by {@link #monteCarloP(double, int, int, boolean, int)} */
|
||||
|
@ -160,9 +157,12 @@ public class KolmogorovSmirnovTest {
|
|||
|
||||
/**
|
||||
* Construct a KolmogorovSmirnovTest with the provided random data generator.
|
||||
* The #monteCarloP(double, int, int, boolean, int) that uses the generator supplied to this
|
||||
* constructor is deprecated as of version 3.6.
|
||||
*
|
||||
* @param rng random data generator used by {@link #monteCarloP(double, int, int, boolean, int)}
|
||||
*/
|
||||
@Deprecated
|
||||
public KolmogorovSmirnovTest(RandomGenerator rng) {
|
||||
this.rng = rng;
|
||||
}
|
||||
|
@ -226,18 +226,9 @@ public class KolmogorovSmirnovTest {
|
|||
* {@code y.length} will strictly exceed (if {@code strict} is {@code true}) or be at least as
|
||||
* large as {@code strict = false}) as {@code kolmogorovSmirnovStatistic(x, y)}.
|
||||
* <ul>
|
||||
* <li>For very small samples (where the product of the sample sizes is less than
|
||||
* {@value #SMALL_SAMPLE_PRODUCT}), the exact distribution is used to compute the p-value. This
|
||||
* is accomplished by enumerating all partitions of the combined sample into two subsamples of
|
||||
* the respective sample sizes, computing \(D_{n,m}\) for each partition and returning the
|
||||
* proportion of partitions that give \(D\) values exceeding the observed value. In the very
|
||||
* small sample case, if there are ties in the data, the actual sample values (including ties)
|
||||
* are used in generating the partitions (which are basically multi-set partitions in this
|
||||
* case).</li>
|
||||
* <li>For mid-size samples (product of sample sizes greater than or equal to
|
||||
* {@value #SMALL_SAMPLE_PRODUCT} but less than {@value #LARGE_SAMPLE_PRODUCT}), Monte Carlo
|
||||
* simulation is used to compute the p-value. The simulation randomly generates partitions and
|
||||
* reports the proportion that give \(D\) values exceeding the observed value.</li>
|
||||
* <li>For small samples (where the product of the sample sizes is less than
|
||||
* {@value #LARGE_SAMPLE_PRODUCT}), the exact p-value is computed using the method presented
|
||||
* in [4], implemented in {@link #exactP(double, int, int, boolean)}. </li>
|
||||
* <li>When the product of the sample sizes exceeds {@value #LARGE_SAMPLE_PRODUCT}, the
|
||||
* asymptotic distribution of \(D_{n,m}\) is used. See {@link #approximateP(double, int, int)}
|
||||
* for details on the approximation.</li>
|
||||
|
@ -274,11 +265,8 @@ public class KolmogorovSmirnovTest {
|
|||
xa = x;
|
||||
ya = y;
|
||||
}
|
||||
if (lengthProduct < SMALL_SAMPLE_PRODUCT) {
|
||||
return integralExactP(integralKolmogorovSmirnovStatistic(xa, ya) + (strict?1l:0l), x.length, y.length);
|
||||
}
|
||||
if (lengthProduct < LARGE_SAMPLE_PRODUCT) {
|
||||
return integralMonteCarloP(integralKolmogorovSmirnovStatistic(xa, ya) + (strict?1l:0l), x.length, y.length, MONTE_CARLO_ITERATIONS);
|
||||
return exactP(kolmogorovSmirnovStatistic(xa, ya), x.length, y.length, strict);
|
||||
}
|
||||
return approximateP(kolmogorovSmirnovStatistic(x, y), x.length, y.length);
|
||||
}
|
||||
|
@ -1000,16 +988,8 @@ public class KolmogorovSmirnovTest {
|
|||
* d)\), where \(D_{n,m}\) is the 2-sample Kolmogorov-Smirnov statistic. See
|
||||
* {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\).
|
||||
* <p>
|
||||
* The returned probability is exact, obtained by enumerating all partitions of {@code m + n}
|
||||
* into {@code m} and {@code n} sets, computing \(D_{n,m}\) for each partition and counting the
|
||||
* number of partitions that yield \(D_{n,m}\) values exceeding (resp. greater than or equal to)
|
||||
* {@code d}.
|
||||
* </p>
|
||||
* <p>
|
||||
* <strong>USAGE NOTE</strong>: Since this method enumerates all combinations in \({m+n} \choose
|
||||
* {n}\), it is very slow if called for large {@code m, n}. For this reason,
|
||||
* {@link #kolmogorovSmirnovTest(double[], double[])} uses this only for {@code m * n < }
|
||||
* {@value #SMALL_SAMPLE_PRODUCT}.
|
||||
* The returned probability is exact, implemented by unwinding the recursive function
|
||||
* definitions presented in [4] (class javadoc).
|
||||
* </p>
|
||||
*
|
||||
* @param d D-statistic value
|
||||
|
@ -1020,50 +1000,10 @@ public class KolmogorovSmirnovTest {
|
|||
* greater than (resp. greater than or equal to) {@code d}
|
||||
*/
|
||||
public double exactP(double d, int n, int m, boolean strict) {
|
||||
return integralExactP(calculateIntegralD(d, n, m, strict), n, m);
|
||||
return 1 - n(m, n, m, n, calculateIntegralD(d, m, n, strict), strict) /
|
||||
CombinatoricsUtils.binomialCoefficientDouble(n + m, m);
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes \(P(D_{n,m} >= d/(n*m))\), where \(D_{n,m}\) is the
|
||||
* 2-sample Kolmogorov-Smirnov statistic.
|
||||
* <p>
|
||||
* Here d is the D-statistic represented as long value.
|
||||
* The real D-statistic is obtained by dividing d by n*m.
|
||||
* See also {@link #exactP(double, int, int, boolean)}.
|
||||
*
|
||||
* @param d integral D-statistic
|
||||
* @param n first sample size
|
||||
* @param m second sample size
|
||||
* @return probability that a randomly selected m-n partition of m + n generates \(D_{n,m}\)
|
||||
* greater than or equal to {@code d/(n*m)}
|
||||
*/
|
||||
private double integralExactP(long d, int n, int m) {
|
||||
Iterator<int[]> combinationsIterator = CombinatoricsUtils.combinationsIterator(n + m, n);
|
||||
long tail = 0;
|
||||
final double[] nSet = new double[n];
|
||||
final double[] mSet = new double[m];
|
||||
while (combinationsIterator.hasNext()) {
|
||||
// Generate an n-set
|
||||
final int[] nSetI = combinationsIterator.next();
|
||||
// Copy the n-set to nSet and its complement to mSet
|
||||
int j = 0;
|
||||
int k = 0;
|
||||
for (int i = 0; i < n + m; i++) {
|
||||
if (j < n && nSetI[j] == i) {
|
||||
nSet[j++] = i;
|
||||
} else {
|
||||
mSet[k++] = i;
|
||||
}
|
||||
}
|
||||
final long curD = integralKolmogorovSmirnovStatistic(nSet, mSet);
|
||||
if (curD >= d) {
|
||||
tail++;
|
||||
}
|
||||
}
|
||||
return (double) tail / (double) CombinatoricsUtils.binomialCoefficient(n + m, n);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Uses the Kolmogorov-Smirnov distribution to approximate \(P(D_{n,m} > d)\) where \(D_{n,m}\)
|
||||
* is the 2-sample Kolmogorov-Smirnov statistic. See
|
||||
|
@ -1270,4 +1210,61 @@ public class KolmogorovSmirnovTest {
|
|||
data[i] += dist.sample();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* The function C(i, j) defined in [4] (class javadoc), formula (5.5).
|
||||
* defined to return 1 if |i/n - j/m| <= c; 0 otherwise. Here c is scaled up
|
||||
* and recoded as a long to avoid rounding errors in comparison tests, so what
|
||||
* is actually tested is |im - jn| <= cmn.
|
||||
*
|
||||
* @param i first path parameter
|
||||
* @param j second path paramter
|
||||
* @param m first sample size
|
||||
* @param n second sample size
|
||||
* @param cmn integral D-statistic (see {@link #calculateIntegralD(double, int, int, boolean)})
|
||||
* @param strict whether or not the null hypothesis uses strict inequality
|
||||
* @return C(i,j) for given m, n, c
|
||||
*/
|
||||
private static int c(int i, int j, int m, int n, long cmn, boolean strict) {
|
||||
if (strict) {
|
||||
return FastMath.abs(i*(long)n - j*(long)m) <= cmn ? 1 : 0;
|
||||
}
|
||||
return FastMath.abs(i*(long)n - j*(long)m) < cmn ? 1 : 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* The function N(i, j) defined in [4] (class javadoc).
|
||||
* Returns the number of paths over the lattice {(i,j) : 0 <= i <= n, 0 <= j <= m}
|
||||
* from (0,0) to (i,j) satisfying C(h,k, m, n, c) = 1 for each (h,k) on the path.
|
||||
* The return value is integral, but subject to overflow, so it is maintained and
|
||||
* returned as a double.
|
||||
*
|
||||
* @param i first path parameter
|
||||
* @param j second path parameter
|
||||
* @param m first sample size
|
||||
* @param n second sample size
|
||||
* @param cnm integral D-statistic (see {@link #calculateIntegralD(double, int, int, boolean)})
|
||||
* @param strict whether or not the null hypothesis uses strict inequality
|
||||
* @return number or paths to (i, j) from (0,0) representing D-values as large as c for given m, n
|
||||
*/
|
||||
private static double n(int i, int j, int m, int n, long cnm, boolean strict) {
|
||||
/*
|
||||
* Unwind the recursive definition given in [4].
|
||||
* Compute n(1,1), n(1,2)...n(2,1), n(2,2)... up to n(i,j), one row at a time.
|
||||
* When n(i,*) are being computed, lag[] holds the values of n(i - 1, *).
|
||||
*/
|
||||
final double[] lag = new double[n];
|
||||
double last = 0;
|
||||
for (int k = 0; k < n; k++) {
|
||||
lag[k] = c(0, k + 1, m, n, cnm, strict);
|
||||
}
|
||||
for (int k = 1; k <= i; k++) {
|
||||
last = c(k, 0, m, n, cnm, strict);
|
||||
for (int l = 1; l <= j; l++) {
|
||||
lag[l - 1] = c(k, l, m, n, cnm, strict) * (last + lag[l - 1]);
|
||||
last = lag[l - 1];
|
||||
}
|
||||
}
|
||||
return last;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -156,6 +156,12 @@ uniform <- c(0.7930305, 0.6424382, 0.8747699, 0.7156518, 0.1845909, 0.2022326,
|
|||
|
||||
smallSample1 <- c(6, 7, 9, 13, 19, 21, 22, 23, 24)
|
||||
smallSample2 <- c(10, 11, 12, 16, 20, 27, 28, 32, 44, 54)
|
||||
smallSample3 <- c(6, 7, 9, 13, 19, 21, 22, 23, 24, 29, 30, 34, 36, 41, 45, 47, 51, 63, 33, 91)
|
||||
smallSample4 <- c(10, 11, 12, 16, 20, 27, 28, 32, 44, 54, 56, 57, 64, 69, 71, 80, 81, 88, 90)
|
||||
smallSample5 <- c(-10, -5, 17, 21, 22, 23, 24, 30, 44, 50, 56, 57, 59, 67, 73, 75, 77, 78, 79, 80, 81, 83, 84, 85, 88, 90,
|
||||
92, 93, 94, 95, 98, 100, 101, 103, 105, 110)
|
||||
smallSample6 <- c(-2, -1, 0, 10, 14, 15, 16, 20, 25, 26, 27, 31, 32, 33, 34, 45, 47, 48, 51, 52, 53, 54, 60, 61, 62, 63,
|
||||
74, 82, 106, 107, 109, 11, 112, 113, 114)
|
||||
bootSample1 <- c(0, 2, 4, 6, 8, 8, 10, 15, 22, 30, 33, 36, 38)
|
||||
bootSample2 <- c(9, 17, 20, 33, 40, 51, 60, 60, 72, 90, 101)
|
||||
roundingSample1 <- c(2,4,6,8,9,10,11,12,13)
|
||||
|
@ -185,7 +191,13 @@ verifyTwoSampleLargeSamples(gaussian, gaussian2, 0.0319983962391632, 0.202352941
|
|||
"Two sample N(0, 1) vs N(0, 1.6)")
|
||||
|
||||
verifyTwoSampleSmallSamplesExact(smallSample1, smallSample2, 0.105577085453247, .5, tol,
|
||||
"Two sample small samples exact")
|
||||
"Two sample small samples exact 1")
|
||||
|
||||
verifyTwoSampleSmallSamplesExact(smallSample3, smallSample4, 0.046298660942952, 0.426315789473684, tol,
|
||||
"Two sample small samples exact 2")
|
||||
|
||||
verifyTwoSampleSmallSamplesExact(smallSample5, smallSample6, 0.00300743602233366, 0.41031746031746, tol,
|
||||
"Two sample small samples exact 3")
|
||||
|
||||
verifyTwoSampleBootstrap(bootSample1, bootSample2, 0.0059, 1E-3, "Two sample bootstrap - isolated failures possible")
|
||||
verifyTwoSampleBootstrap(gaussian, gaussian2, 0.0237, 1E-2, "Two sample bootstrap - isolated failures possible")
|
||||
|
|
|
@ -31,7 +31,7 @@ tol <- 1E-9
|
|||
|
||||
# Function definitions
|
||||
|
||||
library(VGAM)
|
||||
require("VGAM")
|
||||
source("testFunctions") # utility test functions
|
||||
|
||||
# function to verify distribution computations
|
||||
|
|
|
@ -25,7 +25,6 @@ import org.apache.commons.math4.distribution.NormalDistribution;
|
|||
import org.apache.commons.math4.distribution.UniformRealDistribution;
|
||||
import org.apache.commons.math4.random.RandomGenerator;
|
||||
import org.apache.commons.math4.random.Well19937c;
|
||||
import org.apache.commons.math4.stat.inference.KolmogorovSmirnovTest;
|
||||
import org.apache.commons.math4.util.CombinatoricsUtils;
|
||||
import org.apache.commons.math4.util.FastMath;
|
||||
import org.apache.commons.math4.util.MathArrays;
|
||||
|
@ -181,12 +180,48 @@ public class KolmogorovSmirnovTestTest {
|
|||
final double[] smallSample2 = {
|
||||
10, 11, 12, 16, 20, 27, 28, 32, 44, 54
|
||||
};
|
||||
// Reference values from R, version 2.15.3 - R uses non-strict inequality in null hypothesis
|
||||
// Reference values from R, version 3.2.0 - R uses non-strict inequality in null hypothesis
|
||||
Assert
|
||||
.assertEquals(0.105577085453247, test.kolmogorovSmirnovTest(smallSample1, smallSample2, false), TOLERANCE);
|
||||
Assert.assertEquals(0.5, test.kolmogorovSmirnovStatistic(smallSample1, smallSample2), TOLERANCE);
|
||||
}
|
||||
|
||||
/** Small samples - exact p-value, checked against R */
|
||||
@Test
|
||||
public void testTwoSampleSmallSampleExact2() {
|
||||
final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest();
|
||||
final double[] smallSample1 = {
|
||||
6, 7, 9, 13, 19, 21, 22, 23, 24, 29, 30, 34, 36, 41, 45, 47, 51, 63, 33, 91
|
||||
};
|
||||
final double[] smallSample2 = {
|
||||
10, 11, 12, 16, 20, 27, 28, 32, 44, 54, 56, 57, 64, 69, 71, 80, 81, 88, 90
|
||||
};
|
||||
// Reference values from R, version 3.2.0 - R uses non-strict inequality in null hypothesis
|
||||
Assert
|
||||
.assertEquals(0.0462986609, test.kolmogorovSmirnovTest(smallSample1, smallSample2, false), TOLERANCE);
|
||||
Assert.assertEquals(0.4263157895, test.kolmogorovSmirnovStatistic(smallSample1, smallSample2), TOLERANCE);
|
||||
}
|
||||
|
||||
/** Small samples - exact p-value, checked against R */
|
||||
@Test
|
||||
public void testTwoSampleSmallSampleExact3() {
|
||||
final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest();
|
||||
final double[] smallSample1 = {
|
||||
-10, -5, 17, 21, 22, 23, 24, 30, 44, 50, 56, 57, 59, 67, 73, 75, 77, 78, 79, 80, 81, 83, 84, 85, 88, 90,
|
||||
92, 93, 94, 95, 98, 100, 101, 103, 105, 110
|
||||
};
|
||||
final double[] smallSample2 = {
|
||||
-2, -1, 0, 10, 14, 15, 16, 20, 25, 26, 27, 31, 32, 33, 34, 45, 47, 48, 51, 52, 53, 54, 60, 61, 62, 63,
|
||||
74, 82, 106, 107, 109, 11, 112, 113, 114
|
||||
};
|
||||
// Reference values from R, version 3.2.0 - R uses non-strict inequality in null hypothesis
|
||||
Assert
|
||||
.assertEquals(0.00300743602, test.kolmogorovSmirnovTest(smallSample1, smallSample2, false), TOLERANCE);
|
||||
Assert.assertEquals(0.4103174603, test.kolmogorovSmirnovStatistic(smallSample1, smallSample2), TOLERANCE);
|
||||
Assert
|
||||
.assertEquals(0.00300743602, test.kolmogorovSmirnovTest(smallSample2, smallSample1, false), TOLERANCE);
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks exact p-value computations using critical values from Table 9 in V.K Rohatgi, An
|
||||
* Introduction to Probability and Mathematical Statistics, Wiley, 1976, ISBN 0-471-73135-8.
|
||||
|
@ -279,10 +314,9 @@ public class KolmogorovSmirnovTestTest {
|
|||
}
|
||||
|
||||
/**
|
||||
* Verifies that Monte Carlo simulation gives results close to exact p values. This test is a
|
||||
* little long-running (more than two minutes on a fast machine), so is disabled by default.
|
||||
* Verifies that Monte Carlo simulation gives results close to exact p values.
|
||||
*/
|
||||
// @Test
|
||||
@Test
|
||||
public void testTwoSampleMonteCarlo() {
|
||||
final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest(new Well19937c(1000));
|
||||
final int sampleSize = 14;
|
||||
|
|
Loading…
Reference in New Issue