From f7ab3a70ec426669398b4f16d0f2dd5458d87a2e Mon Sep 17 00:00:00 2001 From: Phil Steitz Date: Thu, 26 Nov 2015 14:43:20 -0700 Subject: [PATCH] Modified KolmogororSmirnovTest 2-sample test to use random jitter to break ties in input data. JIRA: MATH-1246. --- src/changes/changes.xml | 4 +- .../math4/random/JDKRandomGenerator.java | 17 ++++ .../stat/inference/KolmogorovSmirnovTest.java | 90 +++++++++++++++++- .../apache/commons/math4/util/MathArrays.java | 58 +++++++++++ .../inference/KolmogorovSmirnovTestTest.java | 79 +++++++++++++++ .../commons/math4/util/MathArraysTest.java | 95 ++++++++++++++++--- 6 files changed, 323 insertions(+), 20 deletions(-) diff --git a/src/changes/changes.xml b/src/changes/changes.xml index fc75ddcd9..c5561617e 100644 --- a/src/changes/changes.xml +++ b/src/changes/changes.xml @@ -60,7 +60,9 @@ If the output is not quite correct, check for invisible trailing spaces! synchronization is now performed on the coefficient list instead of the class. - Added bootstrap method to 2-sample KolmogorovSmirnovTest. + Modified 2-sample KolmogorovSmirnovTest to handle ties in sample data. By default, + ties are broken by adding random jitter to input data. Also added bootstrap method + analogous to ks.boot in R Matching package. Added constructors taking sample data as arguments to enumerated real and integer distributions. diff --git a/src/main/java/org/apache/commons/math4/random/JDKRandomGenerator.java b/src/main/java/org/apache/commons/math4/random/JDKRandomGenerator.java index 29bc16678..279e073f4 100644 --- a/src/main/java/org/apache/commons/math4/random/JDKRandomGenerator.java +++ b/src/main/java/org/apache/commons/math4/random/JDKRandomGenerator.java @@ -29,6 +29,23 @@ public class JDKRandomGenerator extends Random implements RandomGenerator { /** Serializable version identifier. */ private static final long serialVersionUID = -7745277476784028798L; + /** + * Create a new JDKRandomGenerator with a default seed. + */ + public JDKRandomGenerator() { + super(); + } + + /** + * Create a new JDKRandomGenerator with the given seed. + * + * @param seed initial seed + * @since 3.6 + */ + public JDKRandomGenerator(int seed) { + setSeed(seed); + } + /** {@inheritDoc} */ @Override public void setSeed(int seed) { diff --git a/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java b/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java index 38ff18566..9569caefb 100644 --- a/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java +++ b/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java @@ -24,8 +24,10 @@ import java.util.Iterator; import org.apache.commons.math4.distribution.EnumeratedRealDistribution; import org.apache.commons.math4.distribution.RealDistribution; +import org.apache.commons.math4.distribution.UniformRealDistribution; import org.apache.commons.math4.exception.InsufficientDataException; import org.apache.commons.math4.exception.MathArithmeticException; +import org.apache.commons.math4.exception.MathInternalError; import org.apache.commons.math4.exception.NullArgumentException; import org.apache.commons.math4.exception.NumberIsTooLargeException; import org.apache.commons.math4.exception.OutOfRangeException; @@ -38,6 +40,7 @@ import org.apache.commons.math4.linear.Array2DRowFieldMatrix; import org.apache.commons.math4.linear.FieldMatrix; import org.apache.commons.math4.linear.MatrixUtils; import org.apache.commons.math4.linear.RealMatrix; +import org.apache.commons.math4.random.JDKRandomGenerator; import org.apache.commons.math4.random.RandomGenerator; import org.apache.commons.math4.random.Well19937c; import org.apache.commons.math4.util.CombinatoricsUtils; @@ -244,11 +247,21 @@ public class KolmogorovSmirnovTest { */ public double kolmogorovSmirnovTest(double[] x, double[] y, boolean strict) { final long lengthProduct = (long) x.length * y.length; + double[] xa = null; + double[] ya = null; + if (lengthProduct < LARGE_SAMPLE_PRODUCT && hasTies(x,y)) { + xa = MathArrays.copyOf(x); + ya = MathArrays.copyOf(y); + fixTies(xa, ya); + } else { + xa = x; + ya = y; + } if (lengthProduct < SMALL_SAMPLE_PRODUCT) { - return integralExactP(integralKolmogorovSmirnovStatistic(x, y) + (strict?1l:0l), x.length, y.length); + return integralExactP(integralKolmogorovSmirnovStatistic(xa, ya) + (strict?1l:0l), x.length, y.length); } if (lengthProduct < LARGE_SAMPLE_PRODUCT) { - return integralMonteCarloP(integralKolmogorovSmirnovStatistic(x, y) + (strict?1l:0l), x.length, y.length, MONTE_CARLO_ITERATIONS); + return integralMonteCarloP(integralKolmogorovSmirnovStatistic(xa, ya) + (strict?1l:0l), x.length, y.length, MONTE_CARLO_ITERATIONS); } return approximateP(kolmogorovSmirnovStatistic(x, y), x.length, y.length); } @@ -1149,6 +1162,59 @@ public class KolmogorovSmirnovTest { return (double) tail / iterations; } + /** + * If there are no ties in the combined dataset formed from x and y, this + * method is a no-op. If there are ties, a uniform random deviate in + * (-minDelta / 2, minDelta / 2) - {0} is added to each value in x and y, where + * minDelta is the minimum difference between unequal values in the combined + * sample. A fixed seed is used to generate the jitter, so repeated activations + * with the same input arrays result in the same values. + * + * NOTE: if there are ties in the data, this method overwrites the data in + * x and y with the jittered values. + * + * @param x first sample + * @param y second sample + */ + private static void fixTies(double[] x, double[] y) { + final double[] values = MathArrays.unique(MathArrays.concatenate(x,y)); + if (values.length == x.length + y.length) { + return; // There are no ties + } + + // Find the smallest difference between values, or 1 if all values are the same + double minDelta = 1; + double prev = values[0]; + double delta = 1; + for (int i = 1; i < values.length; i++) { + delta = prev - values[i]; + if (delta < minDelta) { + minDelta = delta; + } + prev = values[i]; + } + minDelta /= 2; + + // Add jitter using a fixed seed (so same arguments always give same results), + // low-initialization-overhead generator + final RealDistribution dist = + new UniformRealDistribution(new JDKRandomGenerator(100), -minDelta, minDelta); + + // It is theoretically possible that jitter does not break ties, so repeat + // until all ties are gone. Bound the loop and throw MIE if bound is exceeded. + int ct = 0; + boolean ties = true; + do { + jitter(x, dist); + jitter(y, dist); + ties = hasTies(x, y); + ct++; + } while (ties && ct < 1000); + if (ties) { + throw new MathInternalError(); // Should never happen + } + } + /** * Returns true iff there are ties in the combined sample * formed from x and y. @@ -1157,8 +1223,8 @@ public class KolmogorovSmirnovTest { * @param y second sample * @return true if x and y together contain ties */ - private boolean hasTies(double[] x, double[] y) { - HashSet values = new HashSet(); + private static boolean hasTies(double[] x, double[] y) { + final HashSet values = new HashSet(); for (int i = 0; i < x.length; i++) { if (!values.add(x[i])) { return true; @@ -1171,4 +1237,20 @@ public class KolmogorovSmirnovTest { } return false; } + + /** + * Adds random jitter to {@code data} using deviates sampled from {@code dist}. + *

+ * Note that jitter is applied in-place - i.e., the array + * values are overwritten with the result of applying jitter.

+ * + * @param data input/output data array - entries overwritten by the method + * @param dist probability distribution to sample for jitter values + * @throws NullPointerException if either of the parameters is null + */ + private static void jitter(double[] data, RealDistribution dist) { + for (int i = 0; i < data.length; i++) { + data[i] = data[i] + dist.sample(); + } + } } diff --git a/src/main/java/org/apache/commons/math4/util/MathArrays.java b/src/main/java/org/apache/commons/math4/util/MathArrays.java index 71c6d3125..c50f44c25 100644 --- a/src/main/java/org/apache/commons/math4/util/MathArrays.java +++ b/src/main/java/org/apache/commons/math4/util/MathArrays.java @@ -22,7 +22,9 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Comparator; +import java.util.Iterator; import java.util.List; +import java.util.TreeSet; import org.apache.commons.math4.Field; import org.apache.commons.math4.distribution.UniformIntegerDistribution; @@ -1876,4 +1878,60 @@ public class MathArrays { return verifyValues(values, begin, length, allowEmpty); } + + /** + * Concatenates a sequence of arrays. The return array consists of the + * entries of the input arrays concatenated in the order they appear in + * the argument list. Null arrays cause NullPointerExceptions; zero + * length arrays are allowed (contributing nothing to the output array). + * + * @param x list of double[] arrays to concatenate + * @return a new array consisting of the entries of the argument arrays + * @throws NullPointerException if any of the arrays are null + * @since 3.6 + */ + public static double[] concatenate(double[] ...x) { + int combinedLength = 0; + for (double[] a : x) { + combinedLength += a.length; + } + int offset = 0; + int curLength = 0; + final double[] combined = new double[combinedLength]; + for (int i = 0; i < x.length; i++) { + curLength = x[i].length; + System.arraycopy(x[i], 0, combined, offset, curLength); + offset += curLength; + } + return combined; + } + + /** + * Returns an array consisting of the unique values in {@code data}. + * The return array is sorted in descending order. Empty arrays + * are allowed, but null arrays result in NullPointerException. + * Infinities are allowed. NaN values are allowed with maximum + * sort order - i.e., if there are NaN values in {@code data}, + * {@code Double.NaN} will be the first element of the output array, + * even if the array also contains {@code Double.POSITIVE_INFINITY}. + * + * @param data array to scan + * @return descending list of values included in the input array + * @throws NullPointerException if data is null + * @since 3.6 + */ + public static double[] unique(double[] data) { + TreeSet values = new TreeSet(); + for (int i = 0; i < data.length; i++) { + values.add(data[i]); + } + final int count = values.size(); + final double[] out = new double[count]; + Iterator iterator = values.descendingIterator(); + int i = 0; + while (iterator.hasNext()) { + out[i++] = iterator.next(); + } + return out; + } } diff --git a/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java b/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java index bd1b1cb8a..70b861353 100644 --- a/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java +++ b/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java @@ -17,6 +17,7 @@ package org.apache.commons.math4.stat.inference; +import java.lang.reflect.Method; import java.util.Arrays; import org.apache.commons.math4.TestUtils; @@ -27,6 +28,7 @@ import org.apache.commons.math4.random.Well19937c; import org.apache.commons.math4.stat.inference.KolmogorovSmirnovTest; import org.apache.commons.math4.util.CombinatoricsUtils; import org.apache.commons.math4.util.FastMath; +import org.apache.commons.math4.util.MathArrays; import org.junit.Assert; import org.junit.Test; @@ -567,6 +569,63 @@ public class KolmogorovSmirnovTestTest { Assert.assertEquals(0.06303, test.bootstrap(x, y, 10000, false), 1E-2); } + @Test + public void testFixTiesNoOp() throws Exception { + final double[] x = {0, 1, 2, 3, 4}; + final double[] y = {5, 6, 7, 8}; + final double[] origX = MathArrays.copyOf(x); + final double[] origY = MathArrays.copyOf(y); + fixTies(x,y); + Assert.assertArrayEquals(origX, x, 0); + Assert.assertArrayEquals(origY, y, 0); + } + + /** + * Verify that fixTies is deterministic, i.e, + * x = x', y = y' => fixTies(x,y) = fixTies(x', y') + */ + @Test + public void testFixTiesConsistency() throws Exception { + final double[] x = {0, 1, 2, 3, 4, 2}; + final double[] y = {5, 6, 7, 8, 1, 2}; + final double[] xP = MathArrays.copyOf(x); + final double[] yP = MathArrays.copyOf(y); + checkFixTies(x, y); + final double[] fixedX = MathArrays.copyOf(x); + final double[] fixedY = MathArrays.copyOf(y); + checkFixTies(xP, yP); + Assert.assertArrayEquals(fixedX, xP, 0); + Assert.assertArrayEquals(fixedY, yP, 0); + } + + @Test + public void testFixTies() throws Exception { + checkFixTies(new double[] {0, 1, 1, 4, 0}, new double[] {0, 5, 0.5, 0.55, 7}); + checkFixTies(new double[] {1, 1, 1, 1, 1}, new double[] {1, 1}); + checkFixTies(new double[] {1, 2, 3}, new double[] {1}); + checkFixTies(new double[] {1, 1, 0, 1, 0}, new double[] {}); + } + + /** + * Checks that fixTies eliminates ties in the data but does not otherwise + * perturb the ordering. + */ + private void checkFixTies(double[] x, double[] y) throws Exception { + final double[] origCombined = MathArrays.concatenate(x, y); + fixTies(x, y); + Assert.assertFalse(hasTies(x, y)); + final double[] combined = MathArrays.concatenate(x, y); + for (int i = 0; i < combined.length; i++) { + for (int j = 0; j < i; j++) { + Assert.assertTrue(combined[i] != combined[j]); + if (combined[i] < combined[j]) + Assert.assertTrue(origCombined[i] < origCombined[j] + || origCombined[i] == origCombined[j]); + } + + } + } + /** * Verifies the inequality exactP(criticalValue, n, m, true) < alpha < exactP(criticalValue, n, * m, false). @@ -601,4 +660,24 @@ public class KolmogorovSmirnovTestTest { Assert.assertEquals(alpha, test.approximateP(criticalValue, n, m), epsilon); } + /** + * Reflection hack to expose private fixTies method for testing. + */ + private static void fixTies(double[] x, double[] y) throws Exception { + Method method = KolmogorovSmirnovTest.class.getDeclaredMethod("fixTies", + double[].class, double[].class); + method.setAccessible(true); + method.invoke(KolmogorovSmirnovTest.class, x, y); + } + + /** + * Reflection hack to expose private hasTies method. + */ + private static boolean hasTies(double[] x, double[] y) throws Exception { + Method method = KolmogorovSmirnovTest.class.getDeclaredMethod("hasTies", + double[].class, double[].class); + method.setAccessible(true); + return (boolean) method.invoke(KolmogorovSmirnovTest.class, x, y); + } + } diff --git a/src/test/java/org/apache/commons/math4/util/MathArraysTest.java b/src/test/java/org/apache/commons/math4/util/MathArraysTest.java index 0a7c30fe4..a6cc7649e 100644 --- a/src/test/java/org/apache/commons/math4/util/MathArraysTest.java +++ b/src/test/java/org/apache/commons/math4/util/MathArraysTest.java @@ -37,7 +37,7 @@ import org.junit.Test; * */ public class MathArraysTest { - + private double[] testArray = {0, 1, 2, 3, 4, 5}; private double[] testWeightsArray = {0.3, 0.2, 1.3, 1.1, 1.0, 1.8}; private double[] testNegativeWeightsArray = {-0.3, 0.2, -1.3, 1.1, 1.0, 1.8}; @@ -49,7 +49,7 @@ public class MathArraysTest { final double[] test = new double[] { -2.5, -1, 0, 1, 2.5 }; final double[] correctTest = MathArrays.copyOf(test); final double[] correctScaled = new double[]{5.25, 2.1, 0, -2.1, -5.25}; - + final double[] scaled = MathArrays.scale(-2.1, test); // Make sure test has not changed @@ -62,7 +62,7 @@ public class MathArraysTest { Assert.assertEquals(correctScaled[i], scaled[i], 0); } } - + @Test public void testScaleInPlace() { final double[] test = new double[] { -2.5, -1, 0, 1, 2.5 }; @@ -74,7 +74,7 @@ public class MathArraysTest { Assert.assertEquals(correctScaled[i], test[i], 0); } } - + @Test(expected=DimensionMismatchException.class) public void testEbeAddPrecondition() { MathArrays.ebeAdd(new double[3], new double[4]); @@ -353,7 +353,7 @@ public class MathArraysTest { new Double(-27.5) }, MathArrays.OrderDirection.DECREASING, false)); } - + @Test public void testCheckRectangular() { final long[][] rect = new long[][] {{0, 1}, {2, 3}}; @@ -373,9 +373,9 @@ public class MathArraysTest { Assert.fail("Expecting NullArgumentException"); } catch (NullArgumentException ex) { // Expected - } + } } - + @Test public void testCheckPositive() { final double[] positive = new double[] {1, 2, 3}; @@ -397,7 +397,7 @@ public class MathArraysTest { // Expected } } - + @Test public void testCheckNonNegative() { final long[] nonNegative = new long[] {0, 1}; @@ -419,7 +419,7 @@ public class MathArraysTest { // Expected } } - + @Test public void testCheckNonNegative2D() { final long[][] nonNegative = new long[][] {{0, 1}, {1, 0}}; @@ -554,7 +554,7 @@ public class MathArraysTest { Assert.assertEquals(25, x2[0], FastMath.ulp(1d)); Assert.assertEquals(125, x3[0], FastMath.ulp(1d)); } - + @Test /** Example in javadoc */ public void testSortInPlaceExample() { @@ -569,7 +569,7 @@ public class MathArraysTest { Assert.assertTrue(Arrays.equals(sy, y)); Assert.assertTrue(Arrays.equals(sz, z)); } - + @Test public void testSortInPlaceFailures() { final double[] nullArray = null; @@ -1047,7 +1047,7 @@ public class MathArraysTest { Assert.fail("expecting MathIllegalArgumentException"); } catch (MathIllegalArgumentException ex) {} } - + @Test public void testConvolve() { /* Test Case (obtained via SciPy) @@ -1066,10 +1066,10 @@ public class MathArraysTest { double[] x2 = { 1, 2, 3 }; double[] h2 = { 0, 1, 0.5 }; double[] y2 = { 0, 1, 2.5, 4, 1.5 }; - + yActual = MathArrays.convolve(x2, h2); Assert.assertArrayEquals(y2, yActual, tolerance); - + try { MathArrays.convolve(new double[]{1, 2}, null); Assert.fail("an exception should have been thrown"); @@ -1187,7 +1187,7 @@ public class MathArraysTest { final int[] seq = MathArrays.sequence(0, 12345, 6789); Assert.assertEquals(0, seq.length); } - + @Test public void testVerifyValuesPositive() { for (int j = 0; j < 6; j++) { @@ -1252,4 +1252,69 @@ public class MathArraysTest { // expected } } + + @Test + public void testConcatenate() { + final double[] u = new double[] {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + final double[] x = new double[] {0, 1, 2}; + final double[] y = new double[] {3, 4, 5, 6, 7, 8}; + final double[] z = new double[] {9}; + Assert.assertArrayEquals(u, MathArrays.concatenate(x, y, z), 0); + } + + @Test + public void testConcatentateSingle() { + final double[] x = new double[] {0, 1, 2}; + Assert.assertArrayEquals(x, MathArrays.concatenate(x), 0); + } + + public void testConcatenateEmptyArguments() { + final double[] x = new double[] {0, 1, 2}; + final double[] y = new double[] {3}; + final double[] z = new double[] {}; + final double[] u = new double[] {0, 1, 2, 3}; + Assert.assertArrayEquals(u, MathArrays.concatenate(x, z, y), 0); + Assert.assertArrayEquals(u, MathArrays.concatenate(x, y, z), 0); + Assert.assertArrayEquals(u, MathArrays.concatenate(z, x, y), 0); + Assert.assertEquals(0, MathArrays.concatenate(z, z, z).length); + } + + @Test(expected=NullPointerException.class) + public void testConcatenateNullArguments() { + final double[] x = new double[] {0, 1, 2}; + MathArrays.concatenate(x, null); + } + + @Test + public void testUnique() { + final double[] x = {0, 9, 3, 0, 11, 7, 3, 5, -1, -2}; + final double[] values = {11, 9, 7, 5, 3, 0, -1, -2}; + Assert.assertArrayEquals(values, MathArrays.unique(x), 0); + } + + @Test + public void testUniqueInfiniteValues() { + final double [] x = {0, Double.NEGATIVE_INFINITY, 3, Double.NEGATIVE_INFINITY, + 3, Double.POSITIVE_INFINITY, Double.POSITIVE_INFINITY}; + final double[] u = {Double.POSITIVE_INFINITY, 3, 0, Double.NEGATIVE_INFINITY}; + Assert.assertArrayEquals(u , MathArrays.unique(x), 0); + } + + @Test + public void testUniqueNaNValues() { + final double[] x = new double[] {10, 2, Double.NaN, Double.NaN, Double.NaN, + Double.POSITIVE_INFINITY, Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY}; + final double[] u = MathArrays.unique(x); + Assert.assertEquals(5, u.length); + Assert.assertTrue(Double.isNaN(u[0])); + Assert.assertEquals(Double.POSITIVE_INFINITY, u[1], 0); + Assert.assertEquals(10, u[2], 0); + Assert.assertEquals(2, u[3], 0); + Assert.assertEquals(Double.NEGATIVE_INFINITY, u[4], 0); + } + + @Test(expected=NullPointerException.class) + public void testUniqueNullArgument() { + MathArrays.unique(null); + } }