Merge branch 'MATH-1405_Iteratively-double-minDelta-for-KSTest-if-too-small'

Completes issue MATH-1405 (see JIRA).
This commit is contained in:
Gilles 2017-03-07 13:15:20 +01:00
commit bddcfd5a57
2 changed files with 139 additions and 89 deletions

View File

@ -31,6 +31,7 @@ import org.apache.commons.math4.exception.NullArgumentException;
import org.apache.commons.math4.exception.NumberIsTooLargeException; import org.apache.commons.math4.exception.NumberIsTooLargeException;
import org.apache.commons.math4.exception.OutOfRangeException; import org.apache.commons.math4.exception.OutOfRangeException;
import org.apache.commons.math4.exception.TooManyIterationsException; import org.apache.commons.math4.exception.TooManyIterationsException;
import org.apache.commons.math4.exception.NotANumberException;
import org.apache.commons.math4.exception.util.LocalizedFormats; import org.apache.commons.math4.exception.util.LocalizedFormats;
import org.apache.commons.math4.fraction.BigFraction; import org.apache.commons.math4.fraction.BigFraction;
import org.apache.commons.math4.fraction.BigFractionField; import org.apache.commons.math4.fraction.BigFractionField;
@ -243,15 +244,16 @@ public class KolmogorovSmirnovTest {
* If ties are known to be present in the data, {@link #bootstrap(double[], double[], int, boolean)} * If ties are known to be present in the data, {@link #bootstrap(double[], double[], int, boolean)}
* may be used as an alternative method for estimating the p-value.</p> * may be used as an alternative method for estimating the p-value.</p>
* *
* @param x first sample dataset * @param x first sample dataset.
* @param y second sample dataset * @param y second sample dataset.
* @param strict whether or not the probability to compute is expressed as a strict inequality * @param strict whether or not the probability to compute is expressed as
* (ignored for large samples) * a strict inequality (ignored for large samples).
* @return p-value associated with the null hypothesis that {@code x} and {@code y} represent * @return p-value associated with the null hypothesis that {@code x} and
* samples from the same distribution * {@code y} represent samples from the same distribution.
* @throws InsufficientDataException if either {@code x} or {@code y} does not have length at * @throws InsufficientDataException if either {@code x} or {@code y} does
* least 2 * not have length at least 2.
* @throws NullArgumentException if either {@code x} or {@code y} is null * @throws NullArgumentException if either {@code x} or {@code y} is null.
* @throws NotANumberException if the input arrays contain NaN values.
* @see #bootstrap(double[], double[], int, boolean) * @see #bootstrap(double[], double[], int, boolean)
*/ */
public double kolmogorovSmirnovTest(double[] x, double[] y, boolean strict) { public double kolmogorovSmirnovTest(double[] x, double[] y, boolean strict) {
@ -1121,80 +1123,61 @@ public class KolmogorovSmirnovTest {
} }
/** /**
* If there are no ties in the combined dataset formed from x and y, this * If there are no ties in the combined dataset formed from x and y,
* method is a no-op. If there are ties, a uniform random deviate in * this method is a no-op.
* (-minDelta / 2, minDelta / 2) - {0} is added to each value in x and y, where * If there are ties, a uniform random deviate in
* minDelta is the minimum difference between unequal values in the combined * is added to each value in x and y, and this method overwrites the
* sample. A fixed seed is used to generate the jitter, so repeated activations * data in x and y with the jittered values.
* with the same input arrays result in the same values.
* *
* NOTE: if there are ties in the data, this method overwrites the data in * @param x First sample.
* x and y with the jittered values. * @param y Second sample.
* * @throw NotANumberException if any of the input arrays contain
* @param x first sample * a NaN value.
* @param y second sample
*/ */
private static void fixTies(double[] x, double[] y) { private static void fixTies(double[] x, double[] y) {
final double[] values = MathArrays.unique(MathArrays.concatenate(x,y)); if (hasTies(x, y)) {
if (values.length == x.length + y.length) { // Add jitter using a fixed seed (so same arguments always give same results),
return; // There are no ties // low-initialization-overhead generator.
} final UniformRandomProvider rng = RandomSource.create(RandomSource.TWO_CMRES, 7654321);
// Find the smallest difference between values, or 1 if all values are the same // It is theoretically possible that jitter does not break ties, so repeat
double minDelta = 1; // until all ties are gone. Bound the loop and throw MIE if bound is exceeded.
double prev = values[0]; int ct = 0;
double delta = 1; boolean ties = true;
for (int i = 1; i < values.length; i++) { do {
delta = prev - values[i]; jitter(x, rng, 10);
if (delta < minDelta) { jitter(y, rng, 10);
minDelta = delta; ties = hasTies(x, y);
} ++ct;
prev = values[i]; } while (ties && ct < 10);
} if (ties) {
minDelta /= 2; throw new MathInternalError(); // Should never happen.
}
// Add jitter using a fixed seed (so same arguments always give same results), }
// low-initialization-overhead generator.
final UniformRandomProvider rng = RandomSource.create(RandomSource.TWO_CMRES, 654321);
// It is theoretically possible that jitter does not break ties, so repeat
// until all ties are gone. Bound the loop and throw MIE if bound is exceeded.
int ct = 0;
boolean ties = true;
do {
jitter(x, rng, minDelta);
jitter(y, rng, minDelta);
ties = hasTies(x, y);
ct++;
// If jittering hasn't resolved ties, "minDelta" may be too small.
minDelta *= 2;
} while (ties && ct < 1000);
if (ties) {
throw new MathInternalError(); // Should never happen.
}
} }
/** /**
* Returns true iff there are ties in the combined sample * Returns true iff there are ties in the combined sample formed from
* formed from x and y. * x and y.
* *
* @param x first sample * @param x First sample.
* @param y second sample * @param y Second sample.
* @return true if x and y together contain ties * @return true if x and y together contain ties.
* @throw NotANumberException if any of the input arrays contain
* a NaN value.
*/ */
private static boolean hasTies(double[] x, double[] y) { private static boolean hasTies(double[] x, double[] y) {
final HashSet<Double> values = new HashSet<>(); final double[] values = MathArrays.unique(MathArrays.concatenate(x, y));
for (int i = 0; i < x.length; i++) {
if (!values.add(x[i])) { // "unique" moves NaN to the head of the output array.
return true; if (Double.isNaN(values[0])) {
} throw new NotANumberException();
} }
for (int i = 0; i < y.length; i++) { if (values.length == x.length + y.length) {
if (!values.add(y[i])) { return false; // There are no ties.
return true; }
}
} return true;
return false;
} }
/** /**
@ -1207,10 +1190,13 @@ public class KolmogorovSmirnovTest {
* @param sampler probability distribution to sample for jitter values * @param sampler probability distribution to sample for jitter values
* @throws NullPointerException if either of the parameters is null * @throws NullPointerException if either of the parameters is null
*/ */
private static void jitter(final double[] data, final UniformRandomProvider rng, final double delta) { private static void jitter(double[] data,
UniformRandomProvider rng,
int ulp) {
final int range = ulp * 2;
for (int i = 0; i < data.length; i++) { for (int i = 0; i < data.length; i++) {
final double d = delta * (2 * rng.nextDouble() - 1); final int rand = rng.nextInt(range) - ulp;
data[i] += d; data[i] += rand * Math.ulp(data[i]);
} }
} }

View File

@ -28,6 +28,7 @@ import org.apache.commons.rng.UniformRandomProvider;
import org.apache.commons.math4.util.CombinatoricsUtils; import org.apache.commons.math4.util.CombinatoricsUtils;
import org.apache.commons.math4.util.FastMath; import org.apache.commons.math4.util.FastMath;
import org.apache.commons.math4.util.MathArrays; import org.apache.commons.math4.util.MathArrays;
import org.apache.commons.math4.exception.NotANumberException;
import org.junit.Assert; import org.junit.Assert;
import org.junit.Test; import org.junit.Test;
@ -440,29 +441,92 @@ public class KolmogorovSmirnovTestTest {
@Test @Test
public void testTwoSampleWithManyTiesAndVerySmallDelta() { public void testTwoSampleWithManyTiesAndVerySmallDelta() {
// Cf. MATH-1405 // Cf. MATH-1405
final double[] x = { final double[] x = {
0.000000, 0.000000, 1.000000, 0.0, 0.0,
1.000000, 1.500000, 1.600000, 1.0, 1.0,
1.700000, 1.800000, 1.900000, 2.000000, 2.000000000000001 1.5,
1.6,
1.7,
1.8,
1.9,
2.0,
2.000000000000001
}; };
final double[] y = { final double[] y = {
0.000000, 0.000000, 10.000000, 0.0, 0.0,
10.000000, 11.000000, 11.000000, 10.0, 10.0,
11.000000, 15.000000, 16.000000, 11.0, 11.0, 11.0,
17.000000, 18.000000, 19.000000, 20.000000, 20.000000000000001 15.0,
16.0,
17.0,
18.0,
19.0,
20.0,
20.000000000000001
}; };
// These values result in an initial calculated minDelta of 4.440892098500626E-16,
// which is too small to jitter the existing values to new ones bc of floating-point
// precision.
// MATH-1405 adds functionality to iteratively increase minDelta until a noticeable
// jitter occurs.
final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest(); final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest();
Assert.assertEquals(1.12173015e-5, test.kolmogorovSmirnovTest(x, y), 1e-6); Assert.assertEquals(1.12173015e-5, test.kolmogorovSmirnovTest(x, y), 1e-6);
} }
@Test
public void testTwoSampleWithManyTiesAndExtremeValues() {
// Cf. MATH-1405
final double[] largeX = {
Double.MAX_VALUE, Double.MAX_VALUE,
1e40, 1e40,
2e40, 2e40,
1e30,
2e30,
3e30,
4e30,
5e10,
6e10,
7e10,
8e10
};
final double[] smallY = {
Double.MIN_VALUE,
2 * Double.MIN_VALUE,
1e-40, 1e-40,
2e-40, 2e-40,
1e-30,
2e-30,
3e-30,
4e-30,
5e-10,
6e-10,
7e-10,
8e-10
};
final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest();
Assert.assertEquals(0, test.kolmogorovSmirnovTest(largeX, smallY), 1e-10);
}
@Test(expected=NotANumberException.class)
public void testTwoSampleWithTiesAndNaN1() {
// Cf. MATH-1405
final double[] x = { 1, Double.NaN, 3, 4 };
final double[] y = { 1, 2, 3, 4 };
new KolmogorovSmirnovTest().kolmogorovSmirnovTest(x, y);
}
@Test(expected=NotANumberException.class)
public void testTwoSampleWithTiesAndNaN2() {
// Cf. MATH-1405
final double[] x = { 1, 2, 3, 4 };
final double[] y = { 1, 2, Double.NaN, 4 };
new KolmogorovSmirnovTest().kolmogorovSmirnovTest(x, y);
}
@Test @Test
public void testTwoSamplesAllEqual() { public void testTwoSamplesAllEqual() {
int iterations = 10_000; int iterations = 10_000;