Modified KolmogororSmirnovTest 2-sample test to use random jitter to break ties in input data. JIRA: MATH-1246.

This commit is contained in:
Phil Steitz 2015-11-26 14:43:20 -07:00
parent 5f9cfa6ebf
commit f7ab3a70ec
6 changed files with 323 additions and 20 deletions

View File

@ -60,7 +60,9 @@ If the output is not quite correct, check for invisible trailing spaces!
synchronization is now performed on the coefficient list instead of the class.
</action>
<action dev="psteitz" type="update" issue="MATH-1246">
Added bootstrap method to 2-sample KolmogorovSmirnovTest.
Modified 2-sample KolmogorovSmirnovTest to handle ties in sample data. By default,
ties are broken by adding random jitter to input data. Also added bootstrap method
analogous to ks.boot in R Matching package.
</action>
<action dev="psteitz" type="update" issue="MATH-1287"> <!-- backported to 3.6 -->
Added constructors taking sample data as arguments to enumerated real and integer distributions.

View File

@ -29,6 +29,23 @@ public class JDKRandomGenerator extends Random implements RandomGenerator {
/** Serializable version identifier. */
private static final long serialVersionUID = -7745277476784028798L;
/**
* Create a new JDKRandomGenerator with a default seed.
*/
public JDKRandomGenerator() {
super();
}
/**
* Create a new JDKRandomGenerator with the given seed.
*
* @param seed initial seed
* @since 3.6
*/
public JDKRandomGenerator(int seed) {
setSeed(seed);
}
/** {@inheritDoc} */
@Override
public void setSeed(int seed) {

View File

@ -24,8 +24,10 @@ import java.util.Iterator;
import org.apache.commons.math4.distribution.EnumeratedRealDistribution;
import org.apache.commons.math4.distribution.RealDistribution;
import org.apache.commons.math4.distribution.UniformRealDistribution;
import org.apache.commons.math4.exception.InsufficientDataException;
import org.apache.commons.math4.exception.MathArithmeticException;
import org.apache.commons.math4.exception.MathInternalError;
import org.apache.commons.math4.exception.NullArgumentException;
import org.apache.commons.math4.exception.NumberIsTooLargeException;
import org.apache.commons.math4.exception.OutOfRangeException;
@ -38,6 +40,7 @@ import org.apache.commons.math4.linear.Array2DRowFieldMatrix;
import org.apache.commons.math4.linear.FieldMatrix;
import org.apache.commons.math4.linear.MatrixUtils;
import org.apache.commons.math4.linear.RealMatrix;
import org.apache.commons.math4.random.JDKRandomGenerator;
import org.apache.commons.math4.random.RandomGenerator;
import org.apache.commons.math4.random.Well19937c;
import org.apache.commons.math4.util.CombinatoricsUtils;
@ -244,11 +247,21 @@ public class KolmogorovSmirnovTest {
*/
public double kolmogorovSmirnovTest(double[] x, double[] y, boolean strict) {
final long lengthProduct = (long) x.length * y.length;
double[] xa = null;
double[] ya = null;
if (lengthProduct < LARGE_SAMPLE_PRODUCT && hasTies(x,y)) {
xa = MathArrays.copyOf(x);
ya = MathArrays.copyOf(y);
fixTies(xa, ya);
} else {
xa = x;
ya = y;
}
if (lengthProduct < SMALL_SAMPLE_PRODUCT) {
return integralExactP(integralKolmogorovSmirnovStatistic(x, y) + (strict?1l:0l), x.length, y.length);
return integralExactP(integralKolmogorovSmirnovStatistic(xa, ya) + (strict?1l:0l), x.length, y.length);
}
if (lengthProduct < LARGE_SAMPLE_PRODUCT) {
return integralMonteCarloP(integralKolmogorovSmirnovStatistic(x, y) + (strict?1l:0l), x.length, y.length, MONTE_CARLO_ITERATIONS);
return integralMonteCarloP(integralKolmogorovSmirnovStatistic(xa, ya) + (strict?1l:0l), x.length, y.length, MONTE_CARLO_ITERATIONS);
}
return approximateP(kolmogorovSmirnovStatistic(x, y), x.length, y.length);
}
@ -1149,6 +1162,59 @@ public class KolmogorovSmirnovTest {
return (double) tail / iterations;
}
/**
* If there are no ties in the combined dataset formed from x and y, this
* method is a no-op. If there are ties, a uniform random deviate in
* (-minDelta / 2, minDelta / 2) - {0} is added to each value in x and y, where
* minDelta is the minimum difference between unequal values in the combined
* sample. A fixed seed is used to generate the jitter, so repeated activations
* with the same input arrays result in the same values.
*
* NOTE: if there are ties in the data, this method overwrites the data in
* x and y with the jittered values.
*
* @param x first sample
* @param y second sample
*/
private static void fixTies(double[] x, double[] y) {
final double[] values = MathArrays.unique(MathArrays.concatenate(x,y));
if (values.length == x.length + y.length) {
return; // There are no ties
}
// Find the smallest difference between values, or 1 if all values are the same
double minDelta = 1;
double prev = values[0];
double delta = 1;
for (int i = 1; i < values.length; i++) {
delta = prev - values[i];
if (delta < minDelta) {
minDelta = delta;
}
prev = values[i];
}
minDelta /= 2;
// Add jitter using a fixed seed (so same arguments always give same results),
// low-initialization-overhead generator
final RealDistribution dist =
new UniformRealDistribution(new JDKRandomGenerator(100), -minDelta, minDelta);
// It is theoretically possible that jitter does not break ties, so repeat
// until all ties are gone. Bound the loop and throw MIE if bound is exceeded.
int ct = 0;
boolean ties = true;
do {
jitter(x, dist);
jitter(y, dist);
ties = hasTies(x, y);
ct++;
} while (ties && ct < 1000);
if (ties) {
throw new MathInternalError(); // Should never happen
}
}
/**
* Returns true iff there are ties in the combined sample
* formed from x and y.
@ -1157,8 +1223,8 @@ public class KolmogorovSmirnovTest {
* @param y second sample
* @return true if x and y together contain ties
*/
private boolean hasTies(double[] x, double[] y) {
HashSet<Double> values = new HashSet<Double>();
private static boolean hasTies(double[] x, double[] y) {
final HashSet<Double> values = new HashSet<Double>();
for (int i = 0; i < x.length; i++) {
if (!values.add(x[i])) {
return true;
@ -1171,4 +1237,20 @@ public class KolmogorovSmirnovTest {
}
return false;
}
/**
* Adds random jitter to {@code data} using deviates sampled from {@code dist}.
* <p>
* Note that jitter is applied in-place - i.e., the array
* values are overwritten with the result of applying jitter.</p>
*
* @param data input/output data array - entries overwritten by the method
* @param dist probability distribution to sample for jitter values
* @throws NullPointerException if either of the parameters is null
*/
private static void jitter(double[] data, RealDistribution dist) {
for (int i = 0; i < data.length; i++) {
data[i] = data[i] + dist.sample();
}
}
}

View File

@ -22,7 +22,9 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.TreeSet;
import org.apache.commons.math4.Field;
import org.apache.commons.math4.distribution.UniformIntegerDistribution;
@ -1876,4 +1878,60 @@ public class MathArrays {
return verifyValues(values, begin, length, allowEmpty);
}
/**
* Concatenates a sequence of arrays. The return array consists of the
* entries of the input arrays concatenated in the order they appear in
* the argument list. Null arrays cause NullPointerExceptions; zero
* length arrays are allowed (contributing nothing to the output array).
*
* @param x list of double[] arrays to concatenate
* @return a new array consisting of the entries of the argument arrays
* @throws NullPointerException if any of the arrays are null
* @since 3.6
*/
public static double[] concatenate(double[] ...x) {
int combinedLength = 0;
for (double[] a : x) {
combinedLength += a.length;
}
int offset = 0;
int curLength = 0;
final double[] combined = new double[combinedLength];
for (int i = 0; i < x.length; i++) {
curLength = x[i].length;
System.arraycopy(x[i], 0, combined, offset, curLength);
offset += curLength;
}
return combined;
}
/**
* Returns an array consisting of the unique values in {@code data}.
* The return array is sorted in descending order. Empty arrays
* are allowed, but null arrays result in NullPointerException.
* Infinities are allowed. NaN values are allowed with maximum
* sort order - i.e., if there are NaN values in {@code data},
* {@code Double.NaN} will be the first element of the output array,
* even if the array also contains {@code Double.POSITIVE_INFINITY}.
*
* @param data array to scan
* @return descending list of values included in the input array
* @throws NullPointerException if data is null
* @since 3.6
*/
public static double[] unique(double[] data) {
TreeSet<Double> values = new TreeSet<Double>();
for (int i = 0; i < data.length; i++) {
values.add(data[i]);
}
final int count = values.size();
final double[] out = new double[count];
Iterator<Double> iterator = values.descendingIterator();
int i = 0;
while (iterator.hasNext()) {
out[i++] = iterator.next();
}
return out;
}
}

View File

@ -17,6 +17,7 @@
package org.apache.commons.math4.stat.inference;
import java.lang.reflect.Method;
import java.util.Arrays;
import org.apache.commons.math4.TestUtils;
@ -27,6 +28,7 @@ import org.apache.commons.math4.random.Well19937c;
import org.apache.commons.math4.stat.inference.KolmogorovSmirnovTest;
import org.apache.commons.math4.util.CombinatoricsUtils;
import org.apache.commons.math4.util.FastMath;
import org.apache.commons.math4.util.MathArrays;
import org.junit.Assert;
import org.junit.Test;
@ -567,6 +569,63 @@ public class KolmogorovSmirnovTestTest {
Assert.assertEquals(0.06303, test.bootstrap(x, y, 10000, false), 1E-2);
}
@Test
public void testFixTiesNoOp() throws Exception {
final double[] x = {0, 1, 2, 3, 4};
final double[] y = {5, 6, 7, 8};
final double[] origX = MathArrays.copyOf(x);
final double[] origY = MathArrays.copyOf(y);
fixTies(x,y);
Assert.assertArrayEquals(origX, x, 0);
Assert.assertArrayEquals(origY, y, 0);
}
/**
* Verify that fixTies is deterministic, i.e,
* x = x', y = y' => fixTies(x,y) = fixTies(x', y')
*/
@Test
public void testFixTiesConsistency() throws Exception {
final double[] x = {0, 1, 2, 3, 4, 2};
final double[] y = {5, 6, 7, 8, 1, 2};
final double[] xP = MathArrays.copyOf(x);
final double[] yP = MathArrays.copyOf(y);
checkFixTies(x, y);
final double[] fixedX = MathArrays.copyOf(x);
final double[] fixedY = MathArrays.copyOf(y);
checkFixTies(xP, yP);
Assert.assertArrayEquals(fixedX, xP, 0);
Assert.assertArrayEquals(fixedY, yP, 0);
}
@Test
public void testFixTies() throws Exception {
checkFixTies(new double[] {0, 1, 1, 4, 0}, new double[] {0, 5, 0.5, 0.55, 7});
checkFixTies(new double[] {1, 1, 1, 1, 1}, new double[] {1, 1});
checkFixTies(new double[] {1, 2, 3}, new double[] {1});
checkFixTies(new double[] {1, 1, 0, 1, 0}, new double[] {});
}
/**
* Checks that fixTies eliminates ties in the data but does not otherwise
* perturb the ordering.
*/
private void checkFixTies(double[] x, double[] y) throws Exception {
final double[] origCombined = MathArrays.concatenate(x, y);
fixTies(x, y);
Assert.assertFalse(hasTies(x, y));
final double[] combined = MathArrays.concatenate(x, y);
for (int i = 0; i < combined.length; i++) {
for (int j = 0; j < i; j++) {
Assert.assertTrue(combined[i] != combined[j]);
if (combined[i] < combined[j])
Assert.assertTrue(origCombined[i] < origCombined[j]
|| origCombined[i] == origCombined[j]);
}
}
}
/**
* Verifies the inequality exactP(criticalValue, n, m, true) < alpha < exactP(criticalValue, n,
* m, false).
@ -601,4 +660,24 @@ public class KolmogorovSmirnovTestTest {
Assert.assertEquals(alpha, test.approximateP(criticalValue, n, m), epsilon);
}
/**
* Reflection hack to expose private fixTies method for testing.
*/
private static void fixTies(double[] x, double[] y) throws Exception {
Method method = KolmogorovSmirnovTest.class.getDeclaredMethod("fixTies",
double[].class, double[].class);
method.setAccessible(true);
method.invoke(KolmogorovSmirnovTest.class, x, y);
}
/**
* Reflection hack to expose private hasTies method.
*/
private static boolean hasTies(double[] x, double[] y) throws Exception {
Method method = KolmogorovSmirnovTest.class.getDeclaredMethod("hasTies",
double[].class, double[].class);
method.setAccessible(true);
return (boolean) method.invoke(KolmogorovSmirnovTest.class, x, y);
}
}

View File

@ -37,7 +37,7 @@ import org.junit.Test;
*
*/
public class MathArraysTest {
private double[] testArray = {0, 1, 2, 3, 4, 5};
private double[] testWeightsArray = {0.3, 0.2, 1.3, 1.1, 1.0, 1.8};
private double[] testNegativeWeightsArray = {-0.3, 0.2, -1.3, 1.1, 1.0, 1.8};
@ -49,7 +49,7 @@ public class MathArraysTest {
final double[] test = new double[] { -2.5, -1, 0, 1, 2.5 };
final double[] correctTest = MathArrays.copyOf(test);
final double[] correctScaled = new double[]{5.25, 2.1, 0, -2.1, -5.25};
final double[] scaled = MathArrays.scale(-2.1, test);
// Make sure test has not changed
@ -62,7 +62,7 @@ public class MathArraysTest {
Assert.assertEquals(correctScaled[i], scaled[i], 0);
}
}
@Test
public void testScaleInPlace() {
final double[] test = new double[] { -2.5, -1, 0, 1, 2.5 };
@ -74,7 +74,7 @@ public class MathArraysTest {
Assert.assertEquals(correctScaled[i], test[i], 0);
}
}
@Test(expected=DimensionMismatchException.class)
public void testEbeAddPrecondition() {
MathArrays.ebeAdd(new double[3], new double[4]);
@ -353,7 +353,7 @@ public class MathArraysTest {
new Double(-27.5) },
MathArrays.OrderDirection.DECREASING, false));
}
@Test
public void testCheckRectangular() {
final long[][] rect = new long[][] {{0, 1}, {2, 3}};
@ -373,9 +373,9 @@ public class MathArraysTest {
Assert.fail("Expecting NullArgumentException");
} catch (NullArgumentException ex) {
// Expected
}
}
}
@Test
public void testCheckPositive() {
final double[] positive = new double[] {1, 2, 3};
@ -397,7 +397,7 @@ public class MathArraysTest {
// Expected
}
}
@Test
public void testCheckNonNegative() {
final long[] nonNegative = new long[] {0, 1};
@ -419,7 +419,7 @@ public class MathArraysTest {
// Expected
}
}
@Test
public void testCheckNonNegative2D() {
final long[][] nonNegative = new long[][] {{0, 1}, {1, 0}};
@ -554,7 +554,7 @@ public class MathArraysTest {
Assert.assertEquals(25, x2[0], FastMath.ulp(1d));
Assert.assertEquals(125, x3[0], FastMath.ulp(1d));
}
@Test
/** Example in javadoc */
public void testSortInPlaceExample() {
@ -569,7 +569,7 @@ public class MathArraysTest {
Assert.assertTrue(Arrays.equals(sy, y));
Assert.assertTrue(Arrays.equals(sz, z));
}
@Test
public void testSortInPlaceFailures() {
final double[] nullArray = null;
@ -1047,7 +1047,7 @@ public class MathArraysTest {
Assert.fail("expecting MathIllegalArgumentException");
} catch (MathIllegalArgumentException ex) {}
}
@Test
public void testConvolve() {
/* Test Case (obtained via SciPy)
@ -1066,10 +1066,10 @@ public class MathArraysTest {
double[] x2 = { 1, 2, 3 };
double[] h2 = { 0, 1, 0.5 };
double[] y2 = { 0, 1, 2.5, 4, 1.5 };
yActual = MathArrays.convolve(x2, h2);
Assert.assertArrayEquals(y2, yActual, tolerance);
try {
MathArrays.convolve(new double[]{1, 2}, null);
Assert.fail("an exception should have been thrown");
@ -1187,7 +1187,7 @@ public class MathArraysTest {
final int[] seq = MathArrays.sequence(0, 12345, 6789);
Assert.assertEquals(0, seq.length);
}
@Test
public void testVerifyValuesPositive() {
for (int j = 0; j < 6; j++) {
@ -1252,4 +1252,69 @@ public class MathArraysTest {
// expected
}
}
@Test
public void testConcatenate() {
final double[] u = new double[] {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
final double[] x = new double[] {0, 1, 2};
final double[] y = new double[] {3, 4, 5, 6, 7, 8};
final double[] z = new double[] {9};
Assert.assertArrayEquals(u, MathArrays.concatenate(x, y, z), 0);
}
@Test
public void testConcatentateSingle() {
final double[] x = new double[] {0, 1, 2};
Assert.assertArrayEquals(x, MathArrays.concatenate(x), 0);
}
public void testConcatenateEmptyArguments() {
final double[] x = new double[] {0, 1, 2};
final double[] y = new double[] {3};
final double[] z = new double[] {};
final double[] u = new double[] {0, 1, 2, 3};
Assert.assertArrayEquals(u, MathArrays.concatenate(x, z, y), 0);
Assert.assertArrayEquals(u, MathArrays.concatenate(x, y, z), 0);
Assert.assertArrayEquals(u, MathArrays.concatenate(z, x, y), 0);
Assert.assertEquals(0, MathArrays.concatenate(z, z, z).length);
}
@Test(expected=NullPointerException.class)
public void testConcatenateNullArguments() {
final double[] x = new double[] {0, 1, 2};
MathArrays.concatenate(x, null);
}
@Test
public void testUnique() {
final double[] x = {0, 9, 3, 0, 11, 7, 3, 5, -1, -2};
final double[] values = {11, 9, 7, 5, 3, 0, -1, -2};
Assert.assertArrayEquals(values, MathArrays.unique(x), 0);
}
@Test
public void testUniqueInfiniteValues() {
final double [] x = {0, Double.NEGATIVE_INFINITY, 3, Double.NEGATIVE_INFINITY,
3, Double.POSITIVE_INFINITY, Double.POSITIVE_INFINITY};
final double[] u = {Double.POSITIVE_INFINITY, 3, 0, Double.NEGATIVE_INFINITY};
Assert.assertArrayEquals(u , MathArrays.unique(x), 0);
}
@Test
public void testUniqueNaNValues() {
final double[] x = new double[] {10, 2, Double.NaN, Double.NaN, Double.NaN,
Double.POSITIVE_INFINITY, Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY};
final double[] u = MathArrays.unique(x);
Assert.assertEquals(5, u.length);
Assert.assertTrue(Double.isNaN(u[0]));
Assert.assertEquals(Double.POSITIVE_INFINITY, u[1], 0);
Assert.assertEquals(10, u[2], 0);
Assert.assertEquals(2, u[3], 0);
Assert.assertEquals(Double.NEGATIVE_INFINITY, u[4], 0);
}
@Test(expected=NullPointerException.class)
public void testUniqueNullArgument() {
MathArrays.unique(null);
}
}