From 276e22858cdd56f5725e0c2f7a3522df99967973 Mon Sep 17 00:00:00 2001 From: Thomas Neidhart Date: Sun, 21 Jun 2015 19:39:23 +0200 Subject: [PATCH] [MATH-1236] Improve performance of calculating the two-sample Kolmogorov-Smirnov test statistic. Thanks to Otmar Ertl. --- src/changes/changes.xml | 4 ++ .../stat/inference/KolmogorovSmirnovTest.java | 54 +++++-------------- .../inference/KolmogorovSmirnovTestTest.java | 12 +++++ 3 files changed, 29 insertions(+), 41 deletions(-) diff --git a/src/changes/changes.xml b/src/changes/changes.xml index 13193e5d1..cb2535171 100644 --- a/src/changes/changes.xml +++ b/src/changes/changes.xml @@ -54,6 +54,10 @@ If the output is not quite correct, check for invisible trailing spaces! + + Improved performance of calculating the two-sample Kolmogorov-Smirnov + test statistic. + Lifted unnecessary restriction on constructor's argument of "MicrosphereInterpolator" (package "o.a.c.m.analysis.interpolation"). diff --git a/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java b/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java index 6144ff738..a664cda37 100644 --- a/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java +++ b/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java @@ -294,57 +294,29 @@ public class KolmogorovSmirnovTest { final int n = sx.length; final int m = sy.length; + int rankX = 0; + int rankY = 0; + // Find the max difference between cdf_x and cdf_y double supD = 0d; - // First walk x points - for (int i = 0; i < n; i++) { - final double x_i = sx[i]; - // ties can be safely ignored - if (i > 0 && x_i == sx[i-1]) { - continue; + do { + double z = Double.compare(sx[rankX], sy[rankY]) <= 0 ? sx[rankX] : sy[rankY]; + while(rankX < n && Double.compare(sx[rankX], z) == 0) { + rankX += 1; } - final double cdf_x = edf(x_i, sx); - final double cdf_y = edf(x_i, sy); + while(rankY < m && Double.compare(sy[rankY], z) == 0) { + rankY += 1; + } + final double cdf_x = rankX / (double) n; + final double cdf_y = rankY / (double) m; final double curD = FastMath.abs(cdf_x - cdf_y); if (curD > supD) { supD = curD; } - } - // Now look at y - for (int i = 0; i < m; i++) { - final double y_i = sy[i]; - // ties can be safely ignored - if (i > 0 && y_i == sy[i-1]) { - continue; - } - final double cdf_x = edf(y_i, sx); - final double cdf_y = edf(y_i, sy); - final double curD = FastMath.abs(cdf_x - cdf_y); - if (curD > supD) { - supD = curD; - } - } + } while(rankX < n && rankY < m); return supD; } - /** - * Computes the empirical distribution function. - * - * @param x the given x - * @param samples the observations - * @return the empirical distribution function \(F_n(x)\) - */ - private double edf(final double x, final double[] samples) { - final int n = samples.length; - int index = Arrays.binarySearch(samples, x); - if (index >= 0) { - while(index < (n - 1) && samples[index+1] == x) { - ++index; - } - } - return index >= 0 ? (index + 1d) / n : (-index - 1d) / n; - } - /** * Computes the p-value, or observed significance level, of a one-sample Kolmogorov-Smirnov test diff --git a/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java b/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java index 0a731392d..3b1c10b5e 100644 --- a/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java +++ b/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java @@ -17,6 +17,8 @@ package org.apache.commons.math4.stat.inference; +import java.util.Arrays; + import org.apache.commons.math4.distribution.NormalDistribution; import org.apache.commons.math4.distribution.UniformRealDistribution; import org.apache.commons.math4.random.Well19937c; @@ -365,6 +367,16 @@ public class KolmogorovSmirnovTestTest { } + @Test + public void testTwoSamplesAllEqual() { + final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest(); + for (int i = 2; i < 30; ++i) { + double[] values = new double[i]; + Arrays.fill(values, i); + Assert.assertEquals(0., test.kolmogorovSmirnovStatistic(values, values), 0.); + } + } + /** * Verifies the inequality exactP(criticalValue, n, m, true) < alpha < exactP(criticalValue, n, * m, false).