Fixed error in computing discrete distribution of D statistics for small-sample

2-sample Kolmogorov-Smirnov tests. The error caused incorrect p-values to be returned
by the exactP and monteCarloP methods (used by default for small and mid-size samples).

JIRA: MATH-1245
This commit is contained in:
Phil Steitz 2015-07-10 11:55:58 -07:00
parent 0f6812858a
commit 32d33210a9
3 changed files with 63 additions and 4 deletions

View File

@ -54,6 +54,11 @@ If the output is not quite correct, check for invisible trailing spaces!
</release> </release>
<release version="4.0" date="XXXX-XX-XX" description=""> <release version="4.0" date="XXXX-XX-XX" description="">
<action dev="psteitz" type="fix" issue="MATH-1245">
Fixed error in computing discrete distribution of D statistics for small-sample
2-sample Kolmogorov-Smirnov tests. The error caused incorrect p-values to be returned
by the exactP and monteCarloP methods (used by default for small and mid-size samples).
</action>
<action dev="erans" type="update" issue="MATH-1243"> <action dev="erans" type="update" issue="MATH-1243">
Refactored implementation of the "microsphere projection" Refactored implementation of the "microsphere projection"
interpolation algorithm. interpolation algorithm.

View File

@ -21,6 +21,7 @@ import java.math.BigDecimal;
import java.util.Arrays; import java.util.Arrays;
import java.util.Iterator; import java.util.Iterator;
import org.apache.commons.math4.util.Precision;
import org.apache.commons.math4.distribution.RealDistribution; import org.apache.commons.math4.distribution.RealDistribution;
import org.apache.commons.math4.exception.InsufficientDataException; import org.apache.commons.math4.exception.InsufficientDataException;
import org.apache.commons.math4.exception.MathArithmeticException; import org.apache.commons.math4.exception.MathArithmeticException;
@ -884,6 +885,7 @@ public class KolmogorovSmirnovTest {
long tail = 0; long tail = 0;
final double[] nSet = new double[n]; final double[] nSet = new double[n];
final double[] mSet = new double[m]; final double[] mSet = new double[m];
final double tol = 1e-12; // d-values within tol of one another are considered equal
while (combinationsIterator.hasNext()) { while (combinationsIterator.hasNext()) {
// Generate an n-set // Generate an n-set
final int[] nSetI = combinationsIterator.next(); final int[] nSetI = combinationsIterator.next();
@ -898,9 +900,8 @@ public class KolmogorovSmirnovTest {
} }
} }
final double curD = kolmogorovSmirnovStatistic(nSet, mSet); final double curD = kolmogorovSmirnovStatistic(nSet, mSet);
if (curD > d) { final int order = Precision.compareTo(curD, d, tol);
tail++; if (order > 0 || (order == 0 && !strict)) {
} else if (curD == d && !strict) {
tail++; tail++;
} }
} }
@ -958,6 +959,7 @@ public class KolmogorovSmirnovTest {
final int nn = FastMath.max(n, m); final int nn = FastMath.max(n, m);
final int mm = FastMath.min(n, m); final int mm = FastMath.min(n, m);
final int sum = nn + mm; final int sum = nn + mm;
final double tol = 1e-12; // d-values within tol of one another are considered equal
int tail = 0; int tail = 0;
final boolean b[] = new boolean[sum]; final boolean b[] = new boolean[sum];
@ -979,7 +981,8 @@ public class KolmogorovSmirnovTest {
final double cdf_n = rankN / (double) nn; final double cdf_n = rankN / (double) nn;
final double cdf_m = rankM / (double) mm; final double cdf_m = rankM / (double) mm;
final double curD = FastMath.abs(cdf_n - cdf_m); final double curD = FastMath.abs(cdf_n - cdf_m);
if (curD > d || (curD == d && !strict)) { final int order = Precision.compareTo(curD, d, tol);
if (order > 0 || (order == 0 && !strict)) {
tail++; tail++;
break; break;
} }

View File

@ -427,6 +427,57 @@ public class KolmogorovSmirnovTestTest {
Assert.assertEquals(1.0, test.approximateP(0, values.length, values.length), 0.); Assert.assertEquals(1.0, test.approximateP(0, values.length, values.length), 0.);
} }
} }
/**
 * Regression test for JIRA issue MATH-1245.
 *
 * Checks the two-sample p-value returned by {@code kolmogorovSmirnovTest(x, y, false)}
 * for three small sample pairs, so that D-values which are mathematically equal are
 * not treated as distinct because of floating-point rounding.
 * Reference values are from R 3.2.0.
 */
@Test
public void testDRounding() {
    // Row i of each table below forms one test case.
    final double[][] firstSamples = {
        {0, 2, 3, 4, 5, 6, 7, 8, 9, 12},
        {2, 4, 6, 8, 9, 10, 11, 12, 13},
        {4, 6, 7, 8, 9, 10, 11}
    };
    final double[][] secondSamples = {
        {1, 10, 11, 13, 14, 15, 16, 17, 18},
        {0, 1, 3, 5, 7},
        {0, 1, 2, 3, 5}
    };
    final double[] expectedP = {
        0.0027495724090154106,
        0.085914085914085896,
        0.015151515151515027
    };
    final double delta = 1e-12;
    final KolmogorovSmirnovTest ksTest = new KolmogorovSmirnovTest();
    for (int i = 0; i < expectedP.length; i++) {
        final double actualP = ksTest.kolmogorovSmirnovTest(firstSamples[i], secondSamples[i], false);
        Assert.assertEquals(expectedP[i], actualP, delta);
    }
}
/**
 * JIRA: MATH-1245
 *
 * Monte Carlo counterpart of the D-rounding check: for three small sample pairs,
 * computes the D statistic with {@code kolmogorovSmirnovStatistic} and verifies that
 * {@code monteCarloP} returns a p-value close to the reference value, so that D-values
 * which are mathematically equal are not viewed as distinct when the p-value is
 * estimated. Reference values are from R 3.2.0.
 */
@Test
public void testDRoundingMonteCarlo() {
// Simulated p-values are only compared to two decimal places.
// NOTE(review): this tolerance is larger than the first reference value (~0.00275),
// so the first assertion only bounds the estimate loosely - confirm this is intended.
final double tol = 1e-2;
final int iterations = 1000000;
// The generator is built from a fixed constant (presumably the seed, making the run
// repeatable - verify against Well19937c). All three monteCarloP calls below share
// this instance, so their order matters for the values produced.
final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest(new Well19937c(1000));
// Case 1: sample sizes 10 and 9.
final double[] x = {0, 2, 3, 4, 5, 6, 7, 8, 9, 12};
final double[] y = {1, 10, 11, 13, 14, 15, 16, 17, 18};
double d = test.kolmogorovSmirnovStatistic(x, y);
Assert.assertEquals(0.0027495724090154106, test.monteCarloP(d, x.length, y.length, false, iterations), tol);
// Case 2: sample sizes 9 and 5.
final double[] x1 = {2, 4, 6, 8, 9, 10, 11, 12, 13};
final double[] y1 = {0, 1, 3, 5, 7};
d = test.kolmogorovSmirnovStatistic(x1, y1);
Assert.assertEquals(0.085914085914085896, test.monteCarloP(d, x1.length, y1.length, false, iterations), tol);
// Case 3: sample sizes 7 and 5.
final double[] x2 = {4, 6, 7, 8, 9, 10, 11};
final double[] y2 = {0, 1, 2, 3, 5};
d = test.kolmogorovSmirnovStatistic(x2, y2);
Assert.assertEquals(0.015151515151515027, test.monteCarloP(d, x2.length, y2.length, false, iterations), tol);
}
/** /**
* Verifies the inequality exactP(criticalValue, n, m, true) < alpha < exactP(criticalValue, n, * Verifies the inequality exactP(criticalValue, n, m, true) < alpha < exactP(criticalValue, n,