[MATH-1197] 2-sample KS statistic was wrong in case of ties.

This commit is contained in:
Thomas Neidhart 2015-04-26 21:10:01 +02:00
parent 9a0d061981
commit 870e1d3d98
3 changed files with 116 additions and 18 deletions

View File

@ -50,7 +50,11 @@ If the output is not quite correct, check for invisible trailing spaces!
<title>Commons Math Release Notes</title> <title>Commons Math Release Notes</title>
</properties> </properties>
<body> <body>
<release version="TBD" date="TBD" description="TBD"> <release version="3.6" date="XXXX-XX-XX" description="">
<action dev="tn" type="fix" issue="MATH-1197">
Computation of 2-sample Kolmogorov-Smirnov statistic in case of ties
was not correct.
</action>
</release> </release>
<release version="3.5" date="2015-04-17" description=" <release version="3.5" date="2015-04-17" description="
This is a minor release: It combines bug fixes and new features. This is a minor release: It combines bug fixes and new features.

View File

@ -298,9 +298,13 @@ public class KolmogorovSmirnovTest {
double supD = 0d; double supD = 0d;
// First walk x points // First walk x points
for (int i = 0; i < n; i++) { for (int i = 0; i < n; i++) {
final double cdf_x = (i + 1d) / n; final double x_i = sx[i];
final int yIndex = Arrays.binarySearch(sy, sx[i]); // ties can be safely ignored
final double cdf_y = yIndex >= 0 ? (yIndex + 1d) / m : (-yIndex - 1d) / m; if (i > 0 && x_i == sx[i-1]) {
continue;
}
final double cdf_x = edf(x_i, sx);
final double cdf_y = edf(x_i, sy);
final double curD = FastMath.abs(cdf_x - cdf_y); final double curD = FastMath.abs(cdf_x - cdf_y);
if (curD > supD) { if (curD > supD) {
supD = curD; supD = curD;
@ -308,9 +312,13 @@ public class KolmogorovSmirnovTest {
} }
// Now look at y // Now look at y
for (int i = 0; i < m; i++) { for (int i = 0; i < m; i++) {
final double cdf_y = (i + 1d) / m; final double y_i = sy[i];
final int xIndex = Arrays.binarySearch(sx, sy[i]); // ties can be safely ignored
final double cdf_x = xIndex >= 0 ? (xIndex + 1d) / n : (-xIndex - 1d) / n; if (i > 0 && y_i == sy[i-1]) {
continue;
}
final double cdf_x = edf(y_i, sx);
final double cdf_y = edf(y_i, sy);
final double curD = FastMath.abs(cdf_x - cdf_y); final double curD = FastMath.abs(cdf_x - cdf_y);
if (curD > supD) { if (curD > supD) {
supD = curD; supD = curD;
@ -319,6 +327,24 @@ public class KolmogorovSmirnovTest {
return supD; return supD;
} }
/**
 * Computes the empirical distribution function of {@code samples} at {@code x},
 * i.e. the fraction of observations that are less than or equal to {@code x}.
 * <p>
 * The observations are located with {@link Arrays#binarySearch(double[], double)},
 * which requires {@code samples} to be sorted in ascending order.
 * <p>
 * Values that compare equal to {@code x} with {@code ==} are always counted, even
 * when the binary search reports "not found". This matters for signed zeros: the
 * search orders {@code -0.0} strictly before {@code 0.0}, although both are
 * numerically equal.
 *
 * @param x the given x
 * @param samples the observations, sorted in ascending order
 * @return the empirical distribution function \(F_n(x)\); {@code NaN} if
 * {@code samples} is empty
 */
private double edf(final double x, final double[] samples) {
    final int n = samples.length;
    final int index = Arrays.binarySearch(samples, x);
    // On a hit, everything up to and including the hit is <= x; on a miss the
    // insertion point (-index - 1) is the number of elements ordered before x.
    int count = index >= 0 ? index + 1 : -index - 1;
    // The search may land anywhere inside a run of ties, and on a miss it may stop
    // just before a zero of the opposite sign: absorb every following equal value.
    while (count < n && samples[count] == x) {
        ++count;
    }
    return (double) count / n;
}
/** /**
* Computes the <i>p-value</i>, or <i>observed significance level</i>, of a one-sample <a * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a one-sample <a
* href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a> * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
@ -429,7 +455,7 @@ public class KolmogorovSmirnovTest {
return 1; return 1;
} }
if (exact) { if (exact) {
return exactK(d,n); return exactK(d, n);
} }
if (n <= 140) { if (n <= 140) {
return roundedK(d, n); return roundedK(d, n);
@ -834,8 +860,13 @@ public class KolmogorovSmirnovTest {
* @throws TooManyIterationsException if the series does not converge * @throws TooManyIterationsException if the series does not converge
*/ */
public double ksSum(double t, double tolerance, int maxIterations) { public double ksSum(double t, double tolerance, int maxIterations) {
if (t == 0.0) {
return 1.0;
}
// TODO: for small t (say less than 1), the alternative expansion in part 3 of [1] // TODO: for small t (say less than 1), the alternative expansion in part 3 of [1]
// from class javadoc should be used. // from class javadoc should be used.
final double x = -2 * t * t; final double x = -2 * t * t;
int sign = -1; int sign = -1;
long i = 1; long i = 1;

View File

@ -302,6 +302,69 @@ public class KolmogorovSmirnovTestTest {
} }
} }
/**
 * Regression test for MATH-1197: checks the two-sample statistic and the
 * corresponding p-value on samples that contain many repeated values, both
 * within each sample and shared between the two samples.
 */
@Test
public void testTwoSampleWithManyTies() {
    // Reference values the implementation is expected to reproduce.
    final double expectedStatistic = 0.0640394088;
    final double expectedPValue = 0.9792777290;
    final double tolerance = 1e-6;

    final double[] firstSample = new double[] {
        0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
        0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
        0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
        0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
        0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
        0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
        0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
        0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
        0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
        0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
        0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
        0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
        0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
        0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
        0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
        0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
        0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
        0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
        0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
        0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
        0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
        0.000000, 2.202653, 2.202653, 2.202653, 2.202653, 2.202653,
        2.202653, 2.202653, 2.202653, 2.202653, 2.202653, 2.202653,
        2.202653, 2.202653, 2.202653, 2.202653, 2.202653, 2.202653,
        2.202653, 2.202653, 2.202653, 2.202653, 2.202653, 2.202653,
        2.202653, 2.202653, 2.202653, 2.202653, 2.202653, 2.202653,
        2.202653, 2.202653, 2.202653, 2.202653, 2.202653, 2.202653,
        3.181199, 3.181199, 3.181199, 3.181199, 3.181199, 3.181199,
        3.723539, 3.723539, 3.723539, 3.723539, 4.383482, 4.383482,
        4.383482, 4.383482, 5.320671, 5.320671, 5.320671, 5.717284,
        6.964001, 7.352165, 8.710510, 8.710510, 8.710510, 8.710510,
        8.710510, 8.710510, 9.539004, 9.539004, 10.720619, 17.726077,
        17.726077, 17.726077, 17.726077, 22.053875, 23.799144, 27.355308,
        30.584960, 30.584960, 30.584960, 30.584960, 30.751808
    };

    final double[] secondSample = new double[] {
        0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
        0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
        0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
        0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
        0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
        0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
        0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
        0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
        0.000000, 0.000000, 0.000000, 2.202653, 2.202653, 2.202653,
        2.202653, 2.202653, 2.202653, 2.202653, 2.202653, 3.061758,
        3.723539, 5.628420, 5.628420, 5.628420, 5.628420, 5.628420,
        6.916982, 6.916982, 6.916982, 10.178538, 10.178538, 10.178538,
        10.178538, 10.178538
    };

    final KolmogorovSmirnovTest ksTest = new KolmogorovSmirnovTest();

    // Check the statistic first so that a wrong D is reported before the p-value.
    final double statistic = ksTest.kolmogorovSmirnovStatistic(firstSample, secondSample);
    Assert.assertEquals(expectedStatistic, statistic, tolerance);

    final double pValue = ksTest.kolmogorovSmirnovTest(firstSample, secondSample);
    Assert.assertEquals(expectedPValue, pValue, tolerance);
}
/** /**
* Verifies the inequality exactP(criticalValue, n, m, true) < alpha < exactP(criticalValue, n, * Verifies the inequality exactP(criticalValue, n, m, true) < alpha < exactP(criticalValue, n,
* m, false). * m, false).