[MATH-1236] Improve performance of calculating the two-sample Kolmogorov-Smirnov test statistic. Thanks to Otmar Ertl.
This commit is contained in:
parent
75c2b24c68
commit
276e22858c
|
@ -54,6 +54,10 @@ If the output is not quite correct, check for invisible trailing spaces!
|
|||
</release>
|
||||
|
||||
<release version="4.0" date="XXXX-XX-XX" description="">
|
||||
<action dev="tn" type="fix" issue="MATH-1232" due-to="Otmar Ertl"> <!-- backported to 3.6 -->
|
||||
Improved performance of calculating the two-sample Kolmogorov-Smirnov
|
||||
test statistic.
|
||||
</action>
|
||||
<action dev="erans" type="fix" issue="MATH-1231">
|
||||
Lifted unnecessary restriction on constructor's argument of
|
||||
"MicrosphereInterpolator" (package "o.a.c.m.analysis.interpolation").
|
||||
|
|
|
@ -294,57 +294,29 @@ public class KolmogorovSmirnovTest {
|
|||
final int n = sx.length;
|
||||
final int m = sy.length;
|
||||
|
||||
int rankX = 0;
|
||||
int rankY = 0;
|
||||
|
||||
// Find the max difference between cdf_x and cdf_y
|
||||
double supD = 0d;
|
||||
// First walk x points
|
||||
for (int i = 0; i < n; i++) {
|
||||
final double x_i = sx[i];
|
||||
// ties can be safely ignored
|
||||
if (i > 0 && x_i == sx[i-1]) {
|
||||
continue;
|
||||
do {
|
||||
double z = Double.compare(sx[rankX], sy[rankY]) <= 0 ? sx[rankX] : sy[rankY];
|
||||
while(rankX < n && Double.compare(sx[rankX], z) == 0) {
|
||||
rankX += 1;
|
||||
}
|
||||
final double cdf_x = edf(x_i, sx);
|
||||
final double cdf_y = edf(x_i, sy);
|
||||
while(rankY < m && Double.compare(sy[rankY], z) == 0) {
|
||||
rankY += 1;
|
||||
}
|
||||
final double cdf_x = rankX / (double) n;
|
||||
final double cdf_y = rankY / (double) m;
|
||||
final double curD = FastMath.abs(cdf_x - cdf_y);
|
||||
if (curD > supD) {
|
||||
supD = curD;
|
||||
}
|
||||
}
|
||||
// Now look at y
|
||||
for (int i = 0; i < m; i++) {
|
||||
final double y_i = sy[i];
|
||||
// ties can be safely ignored
|
||||
if (i > 0 && y_i == sy[i-1]) {
|
||||
continue;
|
||||
}
|
||||
final double cdf_x = edf(y_i, sx);
|
||||
final double cdf_y = edf(y_i, sy);
|
||||
final double curD = FastMath.abs(cdf_x - cdf_y);
|
||||
if (curD > supD) {
|
||||
supD = curD;
|
||||
}
|
||||
}
|
||||
} while(rankX < n && rankY < m);
|
||||
return supD;
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes the empirical distribution function.
|
||||
*
|
||||
* @param x the given x
|
||||
* @param samples the observations
|
||||
* @return the empirical distribution function \(F_n(x)\)
|
||||
*/
|
||||
private double edf(final double x, final double[] samples) {
|
||||
final int n = samples.length;
|
||||
int index = Arrays.binarySearch(samples, x);
|
||||
if (index >= 0) {
|
||||
while(index < (n - 1) && samples[index+1] == x) {
|
||||
++index;
|
||||
}
|
||||
}
|
||||
return index >= 0 ? (index + 1d) / n : (-index - 1d) / n;
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes the <i>p-value</i>, or <i>observed significance level</i>, of a one-sample <a
|
||||
* href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
|
||||
|
|
|
@ -17,6 +17,8 @@
|
|||
|
||||
package org.apache.commons.math4.stat.inference;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.commons.math4.distribution.NormalDistribution;
|
||||
import org.apache.commons.math4.distribution.UniformRealDistribution;
|
||||
import org.apache.commons.math4.random.Well19937c;
|
||||
|
@ -365,6 +367,16 @@ public class KolmogorovSmirnovTestTest {
|
|||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testTwoSamplesAllEqual() {
|
||||
final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest();
|
||||
for (int i = 2; i < 30; ++i) {
|
||||
double[] values = new double[i];
|
||||
Arrays.fill(values, i);
|
||||
Assert.assertEquals(0., test.kolmogorovSmirnovStatistic(values, values), 0.);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies the inequality exactP(criticalValue, n, m, true) < alpha < exactP(criticalValue, n,
|
||||
* m, false).
|
||||
|
|
Loading…
Reference in New Issue