From 32d33210a92b1197a6c5a07f19aa25426af72723 Mon Sep 17 00:00:00 2001 From: Phil Steitz Date: Fri, 10 Jul 2015 11:55:58 -0700 Subject: [PATCH] Fixed error in computing discrete distribution of D statistics for small-sample 2-sample Kolmogorov-Smirnov tests. Error was causing incorrect p-values to be returned by exactP and monteCarloP methods (used by default for small, mid-size samples). JIRA: MATH-1245 --- src/changes/changes.xml | 5 ++ .../stat/inference/KolmogorovSmirnovTest.java | 11 ++-- .../inference/KolmogorovSmirnovTestTest.java | 51 +++++++++++++++++++ 3 files changed, 63 insertions(+), 4 deletions(-) diff --git a/src/changes/changes.xml b/src/changes/changes.xml index 9e983cf5b..b6baf47b7 100644 --- a/src/changes/changes.xml +++ b/src/changes/changes.xml @@ -54,6 +54,11 @@ If the output is not quite correct, check for invisible trailing spaces! + + Fixed error in computing discrete distribution of D statistics for small-sample + 2-sample Kolmogorov-Smirnov tests. Error was causing incorrect p-values to be returned + by exactP and monteCarloP methods (used by default for small, mid-size samples). + Refactored implementation of the "miscrosphere projection" interpolation algorithm. 
diff --git a/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java b/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java index f2ccb5cc9..c0e7e512c 100644 --- a/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java +++ b/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java @@ -21,6 +21,7 @@ import java.math.BigDecimal; import java.util.Arrays; import java.util.Iterator; +import org.apache.commons.math4.util.Precision; import org.apache.commons.math4.distribution.RealDistribution; import org.apache.commons.math4.exception.InsufficientDataException; import org.apache.commons.math4.exception.MathArithmeticException; @@ -884,6 +885,7 @@ public class KolmogorovSmirnovTest { long tail = 0; final double[] nSet = new double[n]; final double[] mSet = new double[m]; + final double tol = 1e-12; // d-values within tol of one another are considered equal while (combinationsIterator.hasNext()) { // Generate an n-set final int[] nSetI = combinationsIterator.next(); @@ -898,9 +900,8 @@ public class KolmogorovSmirnovTest { } } final double curD = kolmogorovSmirnovStatistic(nSet, mSet); - if (curD > d) { - tail++; - } else if (curD == d && !strict) { + final int order = Precision.compareTo(curD, d, tol); + if (order > 0 || (order == 0 && !strict)) { tail++; } } @@ -958,6 +959,7 @@ public class KolmogorovSmirnovTest { final int nn = FastMath.max(n, m); final int mm = FastMath.min(n, m); final int sum = nn + mm; + final double tol = 1e-12; // d-values within tol of one another are considered equal int tail = 0; final boolean b[] = new boolean[sum]; @@ -979,7 +981,8 @@ public class KolmogorovSmirnovTest { final double cdf_n = rankN / (double) nn; final double cdf_m = rankM / (double) mm; final double curD = FastMath.abs(cdf_n - cdf_m); - if (curD > d || (curD == d && !strict)) { + final int order = Precision.compareTo(curD, d, tol); + if (order > 0 || (order == 0 && !strict)) 
{ tail++; break; } diff --git a/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java b/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java index e369c2a84..3735721f6 100644 --- a/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java +++ b/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java @@ -427,6 +427,57 @@ public class KolmogorovSmirnovTestTest { Assert.assertEquals(1.0, test.approximateP(0, values.length, values.length), 0.); } } + + /** + * JIRA: MATH-1245 + * + * Verify that D-values are not viewed as distinct when they are mathematically equal + * when computing p-values for small sample tests. Reference values are from R 3.2.0. + */ + @Test + public void testDRounding() { + final double tol = 1e-12; + final double[] x = {0, 2, 3, 4, 5, 6, 7, 8, 9, 12}; + final double[] y = {1, 10, 11, 13, 14, 15, 16, 17, 18}; + final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest(); + Assert.assertEquals(0.0027495724090154106, test.kolmogorovSmirnovTest(x, y,false), tol); + + final double[] x1 = {2, 4, 6, 8, 9, 10, 11, 12, 13}; + final double[] y1 = {0, 1, 3, 5, 7}; + Assert.assertEquals(0.085914085914085896, test.kolmogorovSmirnovTest(x1, y1, false), tol); + + final double[] x2 = {4, 6, 7, 8, 9, 10, 11}; + final double[] y2 = {0, 1, 2, 3, 5}; + Assert.assertEquals(0.015151515151515027, test.kolmogorovSmirnovTest(x2, y2, false), tol); + } + + /** + * JIRA: MATH-1245 + * + * Verify that D-values are not viewed as distinct when they are mathematically equal + * when estimating p-values with monteCarloP for small sample tests. Reference values are from R 3.2.0. 
+ */ + @Test + public void testDRoundingMonteCarlo() { + final double tol = 1e-2; + final int iterations = 1000000; + final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest(new Well19937c(1000)); + + final double[] x = {0, 2, 3, 4, 5, 6, 7, 8, 9, 12}; + final double[] y = {1, 10, 11, 13, 14, 15, 16, 17, 18}; + double d = test.kolmogorovSmirnovStatistic(x, y); + Assert.assertEquals(0.0027495724090154106, test.monteCarloP(d, x.length, y.length, false, iterations), tol); + + final double[] x1 = {2, 4, 6, 8, 9, 10, 11, 12, 13}; + final double[] y1 = {0, 1, 3, 5, 7}; + d = test.kolmogorovSmirnovStatistic(x1, y1); + Assert.assertEquals(0.085914085914085896, test.monteCarloP(d, x1.length, y1.length, false, iterations), tol); + + final double[] x2 = {4, 6, 7, 8, 9, 10, 11}; + final double[] y2 = {0, 1, 2, 3, 5}; + d = test.kolmogorovSmirnovStatistic(x2, y2); + Assert.assertEquals(0.015151515151515027, test.monteCarloP(d, x2.length, y2.length, false, iterations), tol); + } /** * Verifies the inequality exactP(criticalValue, n, m, true) < alpha < exactP(criticalValue, n,