From b0b23c179ac55334f760aa29fce262c73a909268 Mon Sep 17 00:00:00 2001
From: Gilles
Date: Tue, 7 Mar 2017 13:06:29 +0100
Subject: [PATCH] MATH-1405
As per the discussion on the JIRA page:
* Modified method "hasTies" in order to use a single way for detecting ties.
* Modified method "jitter": Add a random value that will modify the least
significant bits of the number to be jittered.
* Simplified method "fixTies".
* When NaN values are present in the data, an exception is thrown.
Formatting of Javadoc.
Added unit tests.
---
.../stat/inference/KolmogorovSmirnovTest.java | 138 ++++++++----------
.../inference/KolmogorovSmirnovTestTest.java | 90 ++++++++++--
2 files changed, 139 insertions(+), 89 deletions(-)
diff --git a/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java b/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java
index 09999ea1e..368bc238e 100644
--- a/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java
+++ b/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java
@@ -31,6 +31,7 @@ import org.apache.commons.math4.exception.NullArgumentException;
import org.apache.commons.math4.exception.NumberIsTooLargeException;
import org.apache.commons.math4.exception.OutOfRangeException;
import org.apache.commons.math4.exception.TooManyIterationsException;
+import org.apache.commons.math4.exception.NotANumberException;
import org.apache.commons.math4.exception.util.LocalizedFormats;
import org.apache.commons.math4.fraction.BigFraction;
import org.apache.commons.math4.fraction.BigFractionField;
@@ -243,15 +244,16 @@ public class KolmogorovSmirnovTest {
* If ties are known to be present in the data, {@link #bootstrap(double[], double[], int, boolean)}
* may be used as an alternative method for estimating the p-value.
*
- * @param x first sample dataset
- * @param y second sample dataset
- * @param strict whether or not the probability to compute is expressed as a strict inequality
- * (ignored for large samples)
- * @return p-value associated with the null hypothesis that {@code x} and {@code y} represent
- * samples from the same distribution
- * @throws InsufficientDataException if either {@code x} or {@code y} does not have length at
- * least 2
- * @throws NullArgumentException if either {@code x} or {@code y} is null
+ * @param x first sample dataset.
+ * @param y second sample dataset.
+ * @param strict whether or not the probability to compute is expressed as
+ * a strict inequality (ignored for large samples).
+ * @return p-value associated with the null hypothesis that {@code x} and
+ * {@code y} represent samples from the same distribution.
+ * @throws InsufficientDataException if either {@code x} or {@code y} does
+ * not have length at least 2.
+ * @throws NullArgumentException if either {@code x} or {@code y} is null.
+ * @throws NotANumberException if the input arrays contain NaN values.
* @see #bootstrap(double[], double[], int, boolean)
*/
public double kolmogorovSmirnovTest(double[] x, double[] y, boolean strict) {
@@ -1121,80 +1123,61 @@ public class KolmogorovSmirnovTest {
}
/**
- * If there are no ties in the combined dataset formed from x and y, this
- * method is a no-op. If there are ties, a uniform random deviate in
- * (-minDelta / 2, minDelta / 2) - {0} is added to each value in x and y, where
- * minDelta is the minimum difference between unequal values in the combined
- * sample. A fixed seed is used to generate the jitter, so repeated activations
- * with the same input arrays result in the same values.
+ * If there are no ties in the combined dataset formed from x and y,
+ * this method is a no-op.
+ * If there are ties, a uniform random deviate in
+ * is added to each value in x and y, and this method overwrites the
+ * data in x and y with the jittered values.
*
- * NOTE: if there are ties in the data, this method overwrites the data in
- * x and y with the jittered values.
- *
- * @param x first sample
- * @param y second sample
+ * @param x First sample.
+ * @param y Second sample.
+ * @throw NotANumberException if any of the input arrays contain
+ * a NaN value.
*/
private static void fixTies(double[] x, double[] y) {
- final double[] values = MathArrays.unique(MathArrays.concatenate(x,y));
- if (values.length == x.length + y.length) {
- return; // There are no ties
- }
+ if (hasTies(x, y)) {
+ // Add jitter using a fixed seed (so same arguments always give same results),
+ // low-initialization-overhead generator.
+ final UniformRandomProvider rng = RandomSource.create(RandomSource.TWO_CMRES, 7654321);
- // Find the smallest difference between values, or 1 if all values are the same
- double minDelta = 1;
- double prev = values[0];
- double delta = 1;
- for (int i = 1; i < values.length; i++) {
- delta = prev - values[i];
- if (delta < minDelta) {
- minDelta = delta;
- }
- prev = values[i];
- }
- minDelta /= 2;
-
- // Add jitter using a fixed seed (so same arguments always give same results),
- // low-initialization-overhead generator.
- final UniformRandomProvider rng = RandomSource.create(RandomSource.TWO_CMRES, 654321);
-
- // It is theoretically possible that jitter does not break ties, so repeat
- // until all ties are gone. Bound the loop and throw MIE if bound is exceeded.
- int ct = 0;
- boolean ties = true;
- do {
- jitter(x, rng, minDelta);
- jitter(y, rng, minDelta);
- ties = hasTies(x, y);
- ct++;
- // If jittering hasn't resolved ties, "minDelta" may be too small.
- minDelta *= 2;
- } while (ties && ct < 1000);
- if (ties) {
- throw new MathInternalError(); // Should never happen.
- }
+ // It is theoretically possible that jitter does not break ties, so repeat
+ // until all ties are gone. Bound the loop and throw MIE if bound is exceeded.
+ int ct = 0;
+ boolean ties = true;
+ do {
+ jitter(x, rng, 10);
+ jitter(y, rng, 10);
+ ties = hasTies(x, y);
+ ++ct;
+ } while (ties && ct < 10);
+ if (ties) {
+ throw new MathInternalError(); // Should never happen.
+ }
+ }
}
/**
- * Returns true iff there are ties in the combined sample
- * formed from x and y.
+ * Returns true iff there are ties in the combined sample formed from
+ * x and y.
*
- * @param x first sample
- * @param y second sample
- * @return true if x and y together contain ties
+ * @param x First sample.
+ * @param y Second sample.
+ * @return true if x and y together contain ties.
+ * @throw NotANumberException if any of the input arrays contain
+ * a NaN value.
*/
private static boolean hasTies(double[] x, double[] y) {
- final HashSet values = new HashSet<>();
- for (int i = 0; i < x.length; i++) {
- if (!values.add(x[i])) {
- return true;
- }
- }
- for (int i = 0; i < y.length; i++) {
- if (!values.add(y[i])) {
- return true;
- }
- }
- return false;
+ final double[] values = MathArrays.unique(MathArrays.concatenate(x, y));
+
+ // "unique" moves NaN to the head of the output array.
+ if (Double.isNaN(values[0])) {
+ throw new NotANumberException();
+ }
+ if (values.length == x.length + y.length) {
+ return false; // There are no ties.
+ }
+
+ return true;
}
/**
@@ -1207,10 +1190,13 @@ public class KolmogorovSmirnovTest {
* @param sampler probability distribution to sample for jitter values
* @throws NullPointerException if either of the parameters is null
*/
- private static void jitter(final double[] data, final UniformRandomProvider rng, final double delta) {
+ private static void jitter(double[] data,
+ UniformRandomProvider rng,
+ int ulp) {
+ final int range = ulp * 2;
for (int i = 0; i < data.length; i++) {
- final double d = delta * (2 * rng.nextDouble() - 1);
- data[i] += d;
+ final int rand = rng.nextInt(range) - ulp;
+ data[i] += rand * Math.ulp(data[i]);
}
}
diff --git a/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java b/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java
index c3aa553d0..4ad08721f 100644
--- a/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java
+++ b/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java
@@ -28,6 +28,7 @@ import org.apache.commons.rng.UniformRandomProvider;
import org.apache.commons.math4.util.CombinatoricsUtils;
import org.apache.commons.math4.util.FastMath;
import org.apache.commons.math4.util.MathArrays;
+import org.apache.commons.math4.exception.NotANumberException;
import org.junit.Assert;
import org.junit.Test;
@@ -440,29 +441,92 @@ public class KolmogorovSmirnovTestTest {
@Test
public void testTwoSampleWithManyTiesAndVerySmallDelta() {
// Cf. MATH-1405
+
final double[] x = {
- 0.000000, 0.000000, 1.000000,
- 1.000000, 1.500000, 1.600000,
- 1.700000, 1.800000, 1.900000, 2.000000, 2.000000000000001
+ 0.0, 0.0,
+ 1.0, 1.0,
+ 1.5,
+ 1.6,
+ 1.7,
+ 1.8,
+ 1.9,
+ 2.0,
+ 2.000000000000001
};
final double[] y = {
- 0.000000, 0.000000, 10.000000,
- 10.000000, 11.000000, 11.000000,
- 11.000000, 15.000000, 16.000000,
- 17.000000, 18.000000, 19.000000, 20.000000, 20.000000000000001
+ 0.0, 0.0,
+ 10.0, 10.0,
+ 11.0, 11.0, 11.0,
+ 15.0,
+ 16.0,
+ 17.0,
+ 18.0,
+ 19.0,
+ 20.0,
+ 20.000000000000001
};
- // These values result in an initial calculated minDelta of 4.440892098500626E-16,
- // which is too small to jitter the existing values to new ones bc of floating-point
- // precision.
- // MATH-1405 adds functionality to iteratively increase minDelta until a noticeable
- // jitter occurs.
-
final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest();
Assert.assertEquals(1.12173015e-5, test.kolmogorovSmirnovTest(x, y), 1e-6);
}
+ @Test
+ public void testTwoSampleWithManyTiesAndExtremeValues() {
+ // Cf. MATH-1405
+
+ final double[] largeX = {
+ Double.MAX_VALUE, Double.MAX_VALUE,
+ 1e40, 1e40,
+ 2e40, 2e40,
+ 1e30,
+ 2e30,
+ 3e30,
+ 4e30,
+ 5e10,
+ 6e10,
+ 7e10,
+ 8e10
+ };
+
+ final double[] smallY = {
+ Double.MIN_VALUE,
+ 2 * Double.MIN_VALUE,
+ 1e-40, 1e-40,
+ 2e-40, 2e-40,
+ 1e-30,
+ 2e-30,
+ 3e-30,
+ 4e-30,
+ 5e-10,
+ 6e-10,
+ 7e-10,
+ 8e-10
+ };
+
+ final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest();
+ Assert.assertEquals(0, test.kolmogorovSmirnovTest(largeX, smallY), 1e-10);
+ }
+
+ @Test(expected=NotANumberException.class)
+ public void testTwoSampleWithTiesAndNaN1() {
+ // Cf. MATH-1405
+
+ final double[] x = { 1, Double.NaN, 3, 4 };
+ final double[] y = { 1, 2, 3, 4 };
+ new KolmogorovSmirnovTest().kolmogorovSmirnovTest(x, y);
+ }
+
+ @Test(expected=NotANumberException.class)
+ public void testTwoSampleWithTiesAndNaN2() {
+ // Cf. MATH-1405
+
+ final double[] x = { 1, 2, 3, 4 };
+ final double[] y = { 1, 2, Double.NaN, 4 };
+
+ new KolmogorovSmirnovTest().kolmogorovSmirnovTest(x, y);
+ }
+
@Test
public void testTwoSamplesAllEqual() {
int iterations = 10_000;