MATH-1405
As per the discussion on the JIRA page: * Modified method "hasTies" in order to use a single way for detecting ties. * Modified method "jitter": Add a random value that will modify the least significant bits of the number to be jittered. * Simplified method "fixTies". * When NaN values are present in the data, an exception is thrown. Formatting of Javadoc. Added unit tests.
This commit is contained in:
parent
e280c86367
commit
b0b23c179a
|
@ -31,6 +31,7 @@ import org.apache.commons.math4.exception.NullArgumentException;
|
||||||
import org.apache.commons.math4.exception.NumberIsTooLargeException;
|
import org.apache.commons.math4.exception.NumberIsTooLargeException;
|
||||||
import org.apache.commons.math4.exception.OutOfRangeException;
|
import org.apache.commons.math4.exception.OutOfRangeException;
|
||||||
import org.apache.commons.math4.exception.TooManyIterationsException;
|
import org.apache.commons.math4.exception.TooManyIterationsException;
|
||||||
|
import org.apache.commons.math4.exception.NotANumberException;
|
||||||
import org.apache.commons.math4.exception.util.LocalizedFormats;
|
import org.apache.commons.math4.exception.util.LocalizedFormats;
|
||||||
import org.apache.commons.math4.fraction.BigFraction;
|
import org.apache.commons.math4.fraction.BigFraction;
|
||||||
import org.apache.commons.math4.fraction.BigFractionField;
|
import org.apache.commons.math4.fraction.BigFractionField;
|
||||||
|
@ -243,15 +244,16 @@ public class KolmogorovSmirnovTest {
|
||||||
* If ties are known to be present in the data, {@link #bootstrap(double[], double[], int, boolean)}
|
* If ties are known to be present in the data, {@link #bootstrap(double[], double[], int, boolean)}
|
||||||
* may be used as an alternative method for estimating the p-value.</p>
|
* may be used as an alternative method for estimating the p-value.</p>
|
||||||
*
|
*
|
||||||
* @param x first sample dataset
|
* @param x first sample dataset.
|
||||||
* @param y second sample dataset
|
* @param y second sample dataset.
|
||||||
* @param strict whether or not the probability to compute is expressed as a strict inequality
|
* @param strict whether or not the probability to compute is expressed as
|
||||||
* (ignored for large samples)
|
* a strict inequality (ignored for large samples).
|
||||||
* @return p-value associated with the null hypothesis that {@code x} and {@code y} represent
|
* @return p-value associated with the null hypothesis that {@code x} and
|
||||||
* samples from the same distribution
|
* {@code y} represent samples from the same distribution.
|
||||||
* @throws InsufficientDataException if either {@code x} or {@code y} does not have length at
|
* @throws InsufficientDataException if either {@code x} or {@code y} does
|
||||||
* least 2
|
* not have length at least 2.
|
||||||
* @throws NullArgumentException if either {@code x} or {@code y} is null
|
* @throws NullArgumentException if either {@code x} or {@code y} is null.
|
||||||
|
* @throws NotANumberException if the input arrays contain NaN values.
|
||||||
* @see #bootstrap(double[], double[], int, boolean)
|
* @see #bootstrap(double[], double[], int, boolean)
|
||||||
*/
|
*/
|
||||||
public double kolmogorovSmirnovTest(double[] x, double[] y, boolean strict) {
|
public double kolmogorovSmirnovTest(double[] x, double[] y, boolean strict) {
|
||||||
|
@ -1121,80 +1123,61 @@ public class KolmogorovSmirnovTest {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* If there are no ties in the combined dataset formed from x and y, this
|
* If there are no ties in the combined dataset formed from x and y,
|
||||||
* method is a no-op. If there are ties, a uniform random deviate in
|
* this method is a no-op.
|
||||||
* (-minDelta / 2, minDelta / 2) - {0} is added to each value in x and y, where
|
* If there are ties, a uniform random deviate in
|
||||||
* minDelta is the minimum difference between unequal values in the combined
|
* is added to each value in x and y, and this method overwrites the
|
||||||
* sample. A fixed seed is used to generate the jitter, so repeated activations
|
* data in x and y with the jittered values.
|
||||||
* with the same input arrays result in the same values.
|
|
||||||
*
|
*
|
||||||
* NOTE: if there are ties in the data, this method overwrites the data in
|
* @param x First sample.
|
||||||
* x and y with the jittered values.
|
* @param y Second sample.
|
||||||
*
|
* @throw NotANumberException if any of the input arrays contain
|
||||||
* @param x first sample
|
* a NaN value.
|
||||||
* @param y second sample
|
|
||||||
*/
|
*/
|
||||||
private static void fixTies(double[] x, double[] y) {
|
private static void fixTies(double[] x, double[] y) {
|
||||||
final double[] values = MathArrays.unique(MathArrays.concatenate(x,y));
|
if (hasTies(x, y)) {
|
||||||
if (values.length == x.length + y.length) {
|
// Add jitter using a fixed seed (so same arguments always give same results),
|
||||||
return; // There are no ties
|
// low-initialization-overhead generator.
|
||||||
}
|
final UniformRandomProvider rng = RandomSource.create(RandomSource.TWO_CMRES, 7654321);
|
||||||
|
|
||||||
// Find the smallest difference between values, or 1 if all values are the same
|
// It is theoretically possible that jitter does not break ties, so repeat
|
||||||
double minDelta = 1;
|
// until all ties are gone. Bound the loop and throw MIE if bound is exceeded.
|
||||||
double prev = values[0];
|
int ct = 0;
|
||||||
double delta = 1;
|
boolean ties = true;
|
||||||
for (int i = 1; i < values.length; i++) {
|
do {
|
||||||
delta = prev - values[i];
|
jitter(x, rng, 10);
|
||||||
if (delta < minDelta) {
|
jitter(y, rng, 10);
|
||||||
minDelta = delta;
|
ties = hasTies(x, y);
|
||||||
}
|
++ct;
|
||||||
prev = values[i];
|
} while (ties && ct < 10);
|
||||||
}
|
if (ties) {
|
||||||
minDelta /= 2;
|
throw new MathInternalError(); // Should never happen.
|
||||||
|
}
|
||||||
// Add jitter using a fixed seed (so same arguments always give same results),
|
}
|
||||||
// low-initialization-overhead generator.
|
|
||||||
final UniformRandomProvider rng = RandomSource.create(RandomSource.TWO_CMRES, 654321);
|
|
||||||
|
|
||||||
// It is theoretically possible that jitter does not break ties, so repeat
|
|
||||||
// until all ties are gone. Bound the loop and throw MIE if bound is exceeded.
|
|
||||||
int ct = 0;
|
|
||||||
boolean ties = true;
|
|
||||||
do {
|
|
||||||
jitter(x, rng, minDelta);
|
|
||||||
jitter(y, rng, minDelta);
|
|
||||||
ties = hasTies(x, y);
|
|
||||||
ct++;
|
|
||||||
// If jittering hasn't resolved ties, "minDelta" may be too small.
|
|
||||||
minDelta *= 2;
|
|
||||||
} while (ties && ct < 1000);
|
|
||||||
if (ties) {
|
|
||||||
throw new MathInternalError(); // Should never happen.
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns true iff there are ties in the combined sample
|
* Returns true iff there are ties in the combined sample formed from
|
||||||
* formed from x and y.
|
* x and y.
|
||||||
*
|
*
|
||||||
* @param x first sample
|
* @param x First sample.
|
||||||
* @param y second sample
|
* @param y Second sample.
|
||||||
* @return true if x and y together contain ties
|
* @return true if x and y together contain ties.
|
||||||
|
* @throw NotANumberException if any of the input arrays contain
|
||||||
|
* a NaN value.
|
||||||
*/
|
*/
|
||||||
private static boolean hasTies(double[] x, double[] y) {
|
private static boolean hasTies(double[] x, double[] y) {
|
||||||
final HashSet<Double> values = new HashSet<>();
|
final double[] values = MathArrays.unique(MathArrays.concatenate(x, y));
|
||||||
for (int i = 0; i < x.length; i++) {
|
|
||||||
if (!values.add(x[i])) {
|
// "unique" moves NaN to the head of the output array.
|
||||||
return true;
|
if (Double.isNaN(values[0])) {
|
||||||
}
|
throw new NotANumberException();
|
||||||
}
|
}
|
||||||
for (int i = 0; i < y.length; i++) {
|
if (values.length == x.length + y.length) {
|
||||||
if (!values.add(y[i])) {
|
return false; // There are no ties.
|
||||||
return true;
|
}
|
||||||
}
|
|
||||||
}
|
return true;
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -1207,10 +1190,13 @@ public class KolmogorovSmirnovTest {
|
||||||
* @param sampler probability distribution to sample for jitter values
|
* @param sampler probability distribution to sample for jitter values
|
||||||
* @throws NullPointerException if either of the parameters is null
|
* @throws NullPointerException if either of the parameters is null
|
||||||
*/
|
*/
|
||||||
private static void jitter(final double[] data, final UniformRandomProvider rng, final double delta) {
|
private static void jitter(double[] data,
|
||||||
|
UniformRandomProvider rng,
|
||||||
|
int ulp) {
|
||||||
|
final int range = ulp * 2;
|
||||||
for (int i = 0; i < data.length; i++) {
|
for (int i = 0; i < data.length; i++) {
|
||||||
final double d = delta * (2 * rng.nextDouble() - 1);
|
final int rand = rng.nextInt(range) - ulp;
|
||||||
data[i] += d;
|
data[i] += rand * Math.ulp(data[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -28,6 +28,7 @@ import org.apache.commons.rng.UniformRandomProvider;
|
||||||
import org.apache.commons.math4.util.CombinatoricsUtils;
|
import org.apache.commons.math4.util.CombinatoricsUtils;
|
||||||
import org.apache.commons.math4.util.FastMath;
|
import org.apache.commons.math4.util.FastMath;
|
||||||
import org.apache.commons.math4.util.MathArrays;
|
import org.apache.commons.math4.util.MathArrays;
|
||||||
|
import org.apache.commons.math4.exception.NotANumberException;
|
||||||
import org.junit.Assert;
|
import org.junit.Assert;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
|
@ -440,29 +441,92 @@ public class KolmogorovSmirnovTestTest {
|
||||||
@Test
|
@Test
|
||||||
public void testTwoSampleWithManyTiesAndVerySmallDelta() {
|
public void testTwoSampleWithManyTiesAndVerySmallDelta() {
|
||||||
// Cf. MATH-1405
|
// Cf. MATH-1405
|
||||||
|
|
||||||
final double[] x = {
|
final double[] x = {
|
||||||
0.000000, 0.000000, 1.000000,
|
0.0, 0.0,
|
||||||
1.000000, 1.500000, 1.600000,
|
1.0, 1.0,
|
||||||
1.700000, 1.800000, 1.900000, 2.000000, 2.000000000000001
|
1.5,
|
||||||
|
1.6,
|
||||||
|
1.7,
|
||||||
|
1.8,
|
||||||
|
1.9,
|
||||||
|
2.0,
|
||||||
|
2.000000000000001
|
||||||
};
|
};
|
||||||
|
|
||||||
final double[] y = {
|
final double[] y = {
|
||||||
0.000000, 0.000000, 10.000000,
|
0.0, 0.0,
|
||||||
10.000000, 11.000000, 11.000000,
|
10.0, 10.0,
|
||||||
11.000000, 15.000000, 16.000000,
|
11.0, 11.0, 11.0,
|
||||||
17.000000, 18.000000, 19.000000, 20.000000, 20.000000000000001
|
15.0,
|
||||||
|
16.0,
|
||||||
|
17.0,
|
||||||
|
18.0,
|
||||||
|
19.0,
|
||||||
|
20.0,
|
||||||
|
20.000000000000001
|
||||||
};
|
};
|
||||||
|
|
||||||
// These values result in an initial calculated minDelta of 4.440892098500626E-16,
|
|
||||||
// which is too small to jitter the existing values to new ones bc of floating-point
|
|
||||||
// precision.
|
|
||||||
// MATH-1405 adds functionality to iteratively increase minDelta until a noticeable
|
|
||||||
// jitter occurs.
|
|
||||||
|
|
||||||
final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest();
|
final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest();
|
||||||
Assert.assertEquals(1.12173015e-5, test.kolmogorovSmirnovTest(x, y), 1e-6);
|
Assert.assertEquals(1.12173015e-5, test.kolmogorovSmirnovTest(x, y), 1e-6);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testTwoSampleWithManyTiesAndExtremeValues() {
|
||||||
|
// Cf. MATH-1405
|
||||||
|
|
||||||
|
final double[] largeX = {
|
||||||
|
Double.MAX_VALUE, Double.MAX_VALUE,
|
||||||
|
1e40, 1e40,
|
||||||
|
2e40, 2e40,
|
||||||
|
1e30,
|
||||||
|
2e30,
|
||||||
|
3e30,
|
||||||
|
4e30,
|
||||||
|
5e10,
|
||||||
|
6e10,
|
||||||
|
7e10,
|
||||||
|
8e10
|
||||||
|
};
|
||||||
|
|
||||||
|
final double[] smallY = {
|
||||||
|
Double.MIN_VALUE,
|
||||||
|
2 * Double.MIN_VALUE,
|
||||||
|
1e-40, 1e-40,
|
||||||
|
2e-40, 2e-40,
|
||||||
|
1e-30,
|
||||||
|
2e-30,
|
||||||
|
3e-30,
|
||||||
|
4e-30,
|
||||||
|
5e-10,
|
||||||
|
6e-10,
|
||||||
|
7e-10,
|
||||||
|
8e-10
|
||||||
|
};
|
||||||
|
|
||||||
|
final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest();
|
||||||
|
Assert.assertEquals(0, test.kolmogorovSmirnovTest(largeX, smallY), 1e-10);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test(expected=NotANumberException.class)
|
||||||
|
public void testTwoSampleWithTiesAndNaN1() {
|
||||||
|
// Cf. MATH-1405
|
||||||
|
|
||||||
|
final double[] x = { 1, Double.NaN, 3, 4 };
|
||||||
|
final double[] y = { 1, 2, 3, 4 };
|
||||||
|
new KolmogorovSmirnovTest().kolmogorovSmirnovTest(x, y);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test(expected=NotANumberException.class)
|
||||||
|
public void testTwoSampleWithTiesAndNaN2() {
|
||||||
|
// Cf. MATH-1405
|
||||||
|
|
||||||
|
final double[] x = { 1, 2, 3, 4 };
|
||||||
|
final double[] y = { 1, 2, Double.NaN, 4 };
|
||||||
|
|
||||||
|
new KolmogorovSmirnovTest().kolmogorovSmirnovTest(x, y);
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testTwoSamplesAllEqual() {
|
public void testTwoSamplesAllEqual() {
|
||||||
int iterations = 10_000;
|
int iterations = 10_000;
|
||||||
|
|
Loading…
Reference in New Issue