Modified KolmogororSmirnovTest 2-sample test to use random jitter to break ties in input data. JIRA: MATH-1246.

This commit is contained in:
Phil Steitz 2015-11-26 14:43:20 -07:00
parent 5f9cfa6ebf
commit f7ab3a70ec
6 changed files with 323 additions and 20 deletions

View File

@ -60,7 +60,9 @@ If the output is not quite correct, check for invisible trailing spaces!
synchronization is now performed on the coefficient list instead of the class. synchronization is now performed on the coefficient list instead of the class.
</action> </action>
<action dev="psteitz" type="update" issue="MATH-1246"> <action dev="psteitz" type="update" issue="MATH-1246">
Added bootstrap method to 2-sample KolmogorovSmirnovTest. Modified 2-sample KolmogorovSmirnovTest to handle ties in sample data. By default,
ties are broken by adding random jitter to input data. Also added bootstrap method
analogous to ks.boot in R Matching package.
</action> </action>
<action dev="psteitz" type="update" issue="MATH-1287"> <!-- backported to 3.6 --> <action dev="psteitz" type="update" issue="MATH-1287"> <!-- backported to 3.6 -->
Added constructors taking sample data as arguments to enumerated real and integer distributions. Added constructors taking sample data as arguments to enumerated real and integer distributions.

View File

@ -29,6 +29,23 @@ public class JDKRandomGenerator extends Random implements RandomGenerator {
/** Serializable version identifier. */ /** Serializable version identifier. */
private static final long serialVersionUID = -7745277476784028798L; private static final long serialVersionUID = -7745277476784028798L;
/**
* Create a new JDKRandomGenerator with a default seed.
*/
public JDKRandomGenerator() {
super();
}
/**
* Create a new JDKRandomGenerator with the given seed.
*
* @param seed initial seed
* @since 3.6
*/
public JDKRandomGenerator(int seed) {
setSeed(seed);
}
/** {@inheritDoc} */ /** {@inheritDoc} */
@Override @Override
public void setSeed(int seed) { public void setSeed(int seed) {

View File

@ -24,8 +24,10 @@ import java.util.Iterator;
import org.apache.commons.math4.distribution.EnumeratedRealDistribution; import org.apache.commons.math4.distribution.EnumeratedRealDistribution;
import org.apache.commons.math4.distribution.RealDistribution; import org.apache.commons.math4.distribution.RealDistribution;
import org.apache.commons.math4.distribution.UniformRealDistribution;
import org.apache.commons.math4.exception.InsufficientDataException; import org.apache.commons.math4.exception.InsufficientDataException;
import org.apache.commons.math4.exception.MathArithmeticException; import org.apache.commons.math4.exception.MathArithmeticException;
import org.apache.commons.math4.exception.MathInternalError;
import org.apache.commons.math4.exception.NullArgumentException; import org.apache.commons.math4.exception.NullArgumentException;
import org.apache.commons.math4.exception.NumberIsTooLargeException; import org.apache.commons.math4.exception.NumberIsTooLargeException;
import org.apache.commons.math4.exception.OutOfRangeException; import org.apache.commons.math4.exception.OutOfRangeException;
@ -38,6 +40,7 @@ import org.apache.commons.math4.linear.Array2DRowFieldMatrix;
import org.apache.commons.math4.linear.FieldMatrix; import org.apache.commons.math4.linear.FieldMatrix;
import org.apache.commons.math4.linear.MatrixUtils; import org.apache.commons.math4.linear.MatrixUtils;
import org.apache.commons.math4.linear.RealMatrix; import org.apache.commons.math4.linear.RealMatrix;
import org.apache.commons.math4.random.JDKRandomGenerator;
import org.apache.commons.math4.random.RandomGenerator; import org.apache.commons.math4.random.RandomGenerator;
import org.apache.commons.math4.random.Well19937c; import org.apache.commons.math4.random.Well19937c;
import org.apache.commons.math4.util.CombinatoricsUtils; import org.apache.commons.math4.util.CombinatoricsUtils;
@ -244,11 +247,21 @@ public class KolmogorovSmirnovTest {
*/ */
public double kolmogorovSmirnovTest(double[] x, double[] y, boolean strict) { public double kolmogorovSmirnovTest(double[] x, double[] y, boolean strict) {
final long lengthProduct = (long) x.length * y.length; final long lengthProduct = (long) x.length * y.length;
double[] xa = null;
double[] ya = null;
if (lengthProduct < LARGE_SAMPLE_PRODUCT && hasTies(x,y)) {
xa = MathArrays.copyOf(x);
ya = MathArrays.copyOf(y);
fixTies(xa, ya);
} else {
xa = x;
ya = y;
}
if (lengthProduct < SMALL_SAMPLE_PRODUCT) { if (lengthProduct < SMALL_SAMPLE_PRODUCT) {
return integralExactP(integralKolmogorovSmirnovStatistic(x, y) + (strict?1l:0l), x.length, y.length); return integralExactP(integralKolmogorovSmirnovStatistic(xa, ya) + (strict?1l:0l), x.length, y.length);
} }
if (lengthProduct < LARGE_SAMPLE_PRODUCT) { if (lengthProduct < LARGE_SAMPLE_PRODUCT) {
return integralMonteCarloP(integralKolmogorovSmirnovStatistic(x, y) + (strict?1l:0l), x.length, y.length, MONTE_CARLO_ITERATIONS); return integralMonteCarloP(integralKolmogorovSmirnovStatistic(xa, ya) + (strict?1l:0l), x.length, y.length, MONTE_CARLO_ITERATIONS);
} }
return approximateP(kolmogorovSmirnovStatistic(x, y), x.length, y.length); return approximateP(kolmogorovSmirnovStatistic(x, y), x.length, y.length);
} }
@ -1149,6 +1162,59 @@ public class KolmogorovSmirnovTest {
return (double) tail / iterations; return (double) tail / iterations;
} }
/**
* If there are no ties in the combined dataset formed from x and y, this
* method is a no-op. If there are ties, a uniform random deviate in
* (-minDelta / 2, minDelta / 2) - {0} is added to each value in x and y, where
* minDelta is the minimum difference between unequal values in the combined
* sample. A fixed seed is used to generate the jitter, so repeated activations
* with the same input arrays result in the same values.
*
* NOTE: if there are ties in the data, this method overwrites the data in
* x and y with the jittered values.
*
* @param x first sample
* @param y second sample
*/
private static void fixTies(double[] x, double[] y) {
final double[] values = MathArrays.unique(MathArrays.concatenate(x,y));
if (values.length == x.length + y.length) {
return; // There are no ties
}
// Find the smallest difference between values, or 1 if all values are the same
double minDelta = 1;
double prev = values[0];
double delta = 1;
for (int i = 1; i < values.length; i++) {
delta = prev - values[i];
if (delta < minDelta) {
minDelta = delta;
}
prev = values[i];
}
minDelta /= 2;
// Add jitter using a fixed seed (so same arguments always give same results),
// low-initialization-overhead generator
final RealDistribution dist =
new UniformRealDistribution(new JDKRandomGenerator(100), -minDelta, minDelta);
// It is theoretically possible that jitter does not break ties, so repeat
// until all ties are gone. Bound the loop and throw MIE if bound is exceeded.
int ct = 0;
boolean ties = true;
do {
jitter(x, dist);
jitter(y, dist);
ties = hasTies(x, y);
ct++;
} while (ties && ct < 1000);
if (ties) {
throw new MathInternalError(); // Should never happen
}
}
/** /**
* Returns true iff there are ties in the combined sample * Returns true iff there are ties in the combined sample
* formed from x and y. * formed from x and y.
@ -1157,8 +1223,8 @@ public class KolmogorovSmirnovTest {
* @param y second sample * @param y second sample
* @return true if x and y together contain ties * @return true if x and y together contain ties
*/ */
private boolean hasTies(double[] x, double[] y) { private static boolean hasTies(double[] x, double[] y) {
HashSet<Double> values = new HashSet<Double>(); final HashSet<Double> values = new HashSet<Double>();
for (int i = 0; i < x.length; i++) { for (int i = 0; i < x.length; i++) {
if (!values.add(x[i])) { if (!values.add(x[i])) {
return true; return true;
@ -1171,4 +1237,20 @@ public class KolmogorovSmirnovTest {
} }
return false; return false;
} }
/**
* Adds random jitter to {@code data} using deviates sampled from {@code dist}.
* <p>
* Note that jitter is applied in-place - i.e., the array
* values are overwritten with the result of applying jitter.</p>
*
* @param data input/output data array - entries overwritten by the method
* @param dist probability distribution to sample for jitter values
* @throws NullPointerException if either of the parameters is null
*/
private static void jitter(double[] data, RealDistribution dist) {
for (int i = 0; i < data.length; i++) {
data[i] = data[i] + dist.sample();
}
}
} }

View File

@ -22,7 +22,9 @@ import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collections; import java.util.Collections;
import java.util.Comparator; import java.util.Comparator;
import java.util.Iterator;
import java.util.List; import java.util.List;
import java.util.TreeSet;
import org.apache.commons.math4.Field; import org.apache.commons.math4.Field;
import org.apache.commons.math4.distribution.UniformIntegerDistribution; import org.apache.commons.math4.distribution.UniformIntegerDistribution;
@ -1876,4 +1878,60 @@ public class MathArrays {
return verifyValues(values, begin, length, allowEmpty); return verifyValues(values, begin, length, allowEmpty);
} }
/**
* Concatenates a sequence of arrays. The return array consists of the
* entries of the input arrays concatenated in the order they appear in
* the argument list. Null arrays cause NullPointerExceptions; zero
* length arrays are allowed (contributing nothing to the output array).
*
* @param x list of double[] arrays to concatenate
* @return a new array consisting of the entries of the argument arrays
* @throws NullPointerException if any of the arrays are null
* @since 3.6
*/
public static double[] concatenate(double[] ...x) {
int combinedLength = 0;
for (double[] a : x) {
combinedLength += a.length;
}
int offset = 0;
int curLength = 0;
final double[] combined = new double[combinedLength];
for (int i = 0; i < x.length; i++) {
curLength = x[i].length;
System.arraycopy(x[i], 0, combined, offset, curLength);
offset += curLength;
}
return combined;
}
/**
* Returns an array consisting of the unique values in {@code data}.
* The return array is sorted in descending order. Empty arrays
* are allowed, but null arrays result in NullPointerException.
* Infinities are allowed. NaN values are allowed with maximum
* sort order - i.e., if there are NaN values in {@code data},
* {@code Double.NaN} will be the first element of the output array,
* even if the array also contains {@code Double.POSITIVE_INFINITY}.
*
* @param data array to scan
* @return descending list of values included in the input array
* @throws NullPointerException if data is null
* @since 3.6
*/
public static double[] unique(double[] data) {
TreeSet<Double> values = new TreeSet<Double>();
for (int i = 0; i < data.length; i++) {
values.add(data[i]);
}
final int count = values.size();
final double[] out = new double[count];
Iterator<Double> iterator = values.descendingIterator();
int i = 0;
while (iterator.hasNext()) {
out[i++] = iterator.next();
}
return out;
}
} }

View File

@ -17,6 +17,7 @@
package org.apache.commons.math4.stat.inference; package org.apache.commons.math4.stat.inference;
import java.lang.reflect.Method;
import java.util.Arrays; import java.util.Arrays;
import org.apache.commons.math4.TestUtils; import org.apache.commons.math4.TestUtils;
@ -27,6 +28,7 @@ import org.apache.commons.math4.random.Well19937c;
import org.apache.commons.math4.stat.inference.KolmogorovSmirnovTest; import org.apache.commons.math4.stat.inference.KolmogorovSmirnovTest;
import org.apache.commons.math4.util.CombinatoricsUtils; import org.apache.commons.math4.util.CombinatoricsUtils;
import org.apache.commons.math4.util.FastMath; import org.apache.commons.math4.util.FastMath;
import org.apache.commons.math4.util.MathArrays;
import org.junit.Assert; import org.junit.Assert;
import org.junit.Test; import org.junit.Test;
@ -567,6 +569,63 @@ public class KolmogorovSmirnovTestTest {
Assert.assertEquals(0.06303, test.bootstrap(x, y, 10000, false), 1E-2); Assert.assertEquals(0.06303, test.bootstrap(x, y, 10000, false), 1E-2);
} }
@Test
public void testFixTiesNoOp() throws Exception {
final double[] x = {0, 1, 2, 3, 4};
final double[] y = {5, 6, 7, 8};
final double[] origX = MathArrays.copyOf(x);
final double[] origY = MathArrays.copyOf(y);
fixTies(x,y);
Assert.assertArrayEquals(origX, x, 0);
Assert.assertArrayEquals(origY, y, 0);
}
/**
* Verify that fixTies is deterministic, i.e,
* x = x', y = y' => fixTies(x,y) = fixTies(x', y')
*/
@Test
public void testFixTiesConsistency() throws Exception {
final double[] x = {0, 1, 2, 3, 4, 2};
final double[] y = {5, 6, 7, 8, 1, 2};
final double[] xP = MathArrays.copyOf(x);
final double[] yP = MathArrays.copyOf(y);
checkFixTies(x, y);
final double[] fixedX = MathArrays.copyOf(x);
final double[] fixedY = MathArrays.copyOf(y);
checkFixTies(xP, yP);
Assert.assertArrayEquals(fixedX, xP, 0);
Assert.assertArrayEquals(fixedY, yP, 0);
}
@Test
public void testFixTies() throws Exception {
checkFixTies(new double[] {0, 1, 1, 4, 0}, new double[] {0, 5, 0.5, 0.55, 7});
checkFixTies(new double[] {1, 1, 1, 1, 1}, new double[] {1, 1});
checkFixTies(new double[] {1, 2, 3}, new double[] {1});
checkFixTies(new double[] {1, 1, 0, 1, 0}, new double[] {});
}
/**
* Checks that fixTies eliminates ties in the data but does not otherwise
* perturb the ordering.
*/
private void checkFixTies(double[] x, double[] y) throws Exception {
final double[] origCombined = MathArrays.concatenate(x, y);
fixTies(x, y);
Assert.assertFalse(hasTies(x, y));
final double[] combined = MathArrays.concatenate(x, y);
for (int i = 0; i < combined.length; i++) {
for (int j = 0; j < i; j++) {
Assert.assertTrue(combined[i] != combined[j]);
if (combined[i] < combined[j])
Assert.assertTrue(origCombined[i] < origCombined[j]
|| origCombined[i] == origCombined[j]);
}
}
}
/** /**
* Verifies the inequality exactP(criticalValue, n, m, true) < alpha < exactP(criticalValue, n, * Verifies the inequality exactP(criticalValue, n, m, true) < alpha < exactP(criticalValue, n,
* m, false). * m, false).
@ -601,4 +660,24 @@ public class KolmogorovSmirnovTestTest {
Assert.assertEquals(alpha, test.approximateP(criticalValue, n, m), epsilon); Assert.assertEquals(alpha, test.approximateP(criticalValue, n, m), epsilon);
} }
/**
* Reflection hack to expose private fixTies method for testing.
*/
private static void fixTies(double[] x, double[] y) throws Exception {
Method method = KolmogorovSmirnovTest.class.getDeclaredMethod("fixTies",
double[].class, double[].class);
method.setAccessible(true);
method.invoke(KolmogorovSmirnovTest.class, x, y);
}
/**
* Reflection hack to expose private hasTies method.
*/
private static boolean hasTies(double[] x, double[] y) throws Exception {
Method method = KolmogorovSmirnovTest.class.getDeclaredMethod("hasTies",
double[].class, double[].class);
method.setAccessible(true);
return (boolean) method.invoke(KolmogorovSmirnovTest.class, x, y);
}
} }

View File

@ -1252,4 +1252,69 @@ public class MathArraysTest {
// expected // expected
} }
} }
@Test
public void testConcatenate() {
final double[] u = new double[] {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
final double[] x = new double[] {0, 1, 2};
final double[] y = new double[] {3, 4, 5, 6, 7, 8};
final double[] z = new double[] {9};
Assert.assertArrayEquals(u, MathArrays.concatenate(x, y, z), 0);
}
@Test
public void testConcatentateSingle() {
final double[] x = new double[] {0, 1, 2};
Assert.assertArrayEquals(x, MathArrays.concatenate(x), 0);
}
public void testConcatenateEmptyArguments() {
final double[] x = new double[] {0, 1, 2};
final double[] y = new double[] {3};
final double[] z = new double[] {};
final double[] u = new double[] {0, 1, 2, 3};
Assert.assertArrayEquals(u, MathArrays.concatenate(x, z, y), 0);
Assert.assertArrayEquals(u, MathArrays.concatenate(x, y, z), 0);
Assert.assertArrayEquals(u, MathArrays.concatenate(z, x, y), 0);
Assert.assertEquals(0, MathArrays.concatenate(z, z, z).length);
}
@Test(expected=NullPointerException.class)
public void testConcatenateNullArguments() {
final double[] x = new double[] {0, 1, 2};
MathArrays.concatenate(x, null);
}
@Test
public void testUnique() {
final double[] x = {0, 9, 3, 0, 11, 7, 3, 5, -1, -2};
final double[] values = {11, 9, 7, 5, 3, 0, -1, -2};
Assert.assertArrayEquals(values, MathArrays.unique(x), 0);
}
@Test
public void testUniqueInfiniteValues() {
final double [] x = {0, Double.NEGATIVE_INFINITY, 3, Double.NEGATIVE_INFINITY,
3, Double.POSITIVE_INFINITY, Double.POSITIVE_INFINITY};
final double[] u = {Double.POSITIVE_INFINITY, 3, 0, Double.NEGATIVE_INFINITY};
Assert.assertArrayEquals(u , MathArrays.unique(x), 0);
}
@Test
public void testUniqueNaNValues() {
final double[] x = new double[] {10, 2, Double.NaN, Double.NaN, Double.NaN,
Double.POSITIVE_INFINITY, Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY};
final double[] u = MathArrays.unique(x);
Assert.assertEquals(5, u.length);
Assert.assertTrue(Double.isNaN(u[0]));
Assert.assertEquals(Double.POSITIVE_INFINITY, u[1], 0);
Assert.assertEquals(10, u[2], 0);
Assert.assertEquals(2, u[3], 0);
Assert.assertEquals(Double.NEGATIVE_INFINITY, u[4], 0);
}
@Test(expected=NullPointerException.class)
public void testUniqueNullArgument() {
MathArrays.unique(null);
}
} }