[MATH-1232] Improve performance of calculating the two-sample Kolmogorov-Smirnov test statistic. Thanks to Otmar Ertl.

This commit is contained in:
Thomas Neidhart 2015-06-21 19:39:23 +02:00
parent 75c2b24c68
commit 276e22858c
3 changed files with 29 additions and 41 deletions

View File

@ -54,6 +54,10 @@ If the output is not quite correct, check for invisible trailing spaces!
</release>
<release version="4.0" date="XXXX-XX-XX" description="">
<action dev="tn" type="fix" issue="MATH-1232" due-to="Otmar Ertl"> <!-- backported to 3.6 -->
Improved performance of calculating the two-sample Kolmogorov-Smirnov
test statistic.
</action>
<action dev="erans" type="fix" issue="MATH-1231">
Lifted unnecessary restriction on constructor's argument of
"MicrosphereInterpolator" (package "o.a.c.m.analysis.interpolation").

View File

@ -294,57 +294,29 @@ public class KolmogorovSmirnovTest {
final int n = sx.length;
final int m = sy.length;
int rankX = 0;
int rankY = 0;
// Find the max difference between cdf_x and cdf_y
double supD = 0d;
// First walk x points
for (int i = 0; i < n; i++) {
final double x_i = sx[i];
// ties can be safely ignored
if (i > 0 && x_i == sx[i-1]) {
continue;
do {
double z = Double.compare(sx[rankX], sy[rankY]) <= 0 ? sx[rankX] : sy[rankY];
while(rankX < n && Double.compare(sx[rankX], z) == 0) {
rankX += 1;
}
final double cdf_x = edf(x_i, sx);
final double cdf_y = edf(x_i, sy);
while(rankY < m && Double.compare(sy[rankY], z) == 0) {
rankY += 1;
}
final double cdf_x = rankX / (double) n;
final double cdf_y = rankY / (double) m;
final double curD = FastMath.abs(cdf_x - cdf_y);
if (curD > supD) {
supD = curD;
}
}
// Now look at y
for (int i = 0; i < m; i++) {
final double y_i = sy[i];
// ties can be safely ignored
if (i > 0 && y_i == sy[i-1]) {
continue;
}
final double cdf_x = edf(y_i, sx);
final double cdf_y = edf(y_i, sy);
final double curD = FastMath.abs(cdf_x - cdf_y);
if (curD > supD) {
supD = curD;
}
}
} while(rankX < n && rankY < m);
return supD;
}
/**
* Computes the empirical distribution function of {@code samples} at {@code x},
* i.e. the fraction of observations that are less than or equal to {@code x}.
*
* @param x the point at which the function is evaluated
* @param samples the observations; they are looked up with
* {@link Arrays#binarySearch(double[], double)}, so they are presumably sorted in
* ascending order (NOTE(review): confirm against the caller)
* @return the empirical distribution function \(F_n(x)\)
*/
private double edf(final double x, final double[] samples) {
final int n = samples.length;
// Non-negative when x is present; otherwise -(insertion point) - 1.
int index = Arrays.binarySearch(samples, x);
if (index >= 0) {
// The search may land on any one of several equal values; advance to the
// last of them so that every observation equal to x is counted.
while(index < (n - 1) && samples[index+1] == x) {
++index;
}
}
// Found: (index + 1) observations are <= x.
// Not found: (-index - 1) is the insertion point, i.e. the number of observations < x.
return index >= 0 ? (index + 1d) / n : (-index - 1d) / n;
}
/**
* Computes the <i>p-value</i>, or <i>observed significance level</i>, of a one-sample <a
* href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>

View File

@ -17,6 +17,8 @@
package org.apache.commons.math4.stat.inference;
import java.util.Arrays;
import org.apache.commons.math4.distribution.NormalDistribution;
import org.apache.commons.math4.distribution.UniformRealDistribution;
import org.apache.commons.math4.random.Well19937c;
@ -365,6 +367,16 @@ public class KolmogorovSmirnovTestTest {
}
/**
* Checks that the two-sample statistic is exactly zero (tolerance 0) when both
* arguments are the same array whose entries are all equal, for array lengths
* 2 through 29.
*/
@Test
public void testTwoSamplesAllEqual() {
final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest();
for (int i = 2; i < 30; ++i) {
double[] values = new double[i];
// Every entry gets the same value, so the sample consists only of ties.
Arrays.fill(values, i);
Assert.assertEquals(0., test.kolmogorovSmirnovStatistic(values, values), 0.);
}
}
/**
* Verifies the inequality exactP(criticalValue, n, m, true) < alpha < exactP(criticalValue, n,
* m, false).