Merge branch 'MATH_3_X' of https://git-wip-us.apache.org/repos/asf/commons-math into MATH_3_X
This commit is contained in:
commit
88cee6d7b3
|
@ -51,6 +51,9 @@ If the output is not quite correct, check for invisible trailing spaces!
|
|||
</properties>
|
||||
<body>
|
||||
<release version="3.6" date="XXXX-XX-XX" description="">
|
||||
<action dev="oertl" type="update" issue="MATH-1274">
|
||||
Representation of Kolmogorov-Smirnov statistic as integral value.
|
||||
</action>
|
||||
<action dev="luc" type="add" issue="MATH-1273" due-to="Qualtagh">
|
||||
Added negative zero support in FastMath.pow.
|
||||
</action>
|
||||
|
|
|
@ -21,7 +21,6 @@ import java.math.BigDecimal;
|
|||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
|
||||
import org.apache.commons.math3.util.Precision;
|
||||
import org.apache.commons.math3.distribution.RealDistribution;
|
||||
import org.apache.commons.math3.exception.InsufficientDataException;
|
||||
import org.apache.commons.math3.exception.MathArithmeticException;
|
||||
|
@ -220,7 +219,10 @@ public class KolmogorovSmirnovTest {
|
|||
* {@value #SMALL_SAMPLE_PRODUCT}), the exact distribution is used to compute the p-value. This
|
||||
* is accomplished by enumerating all partitions of the combined sample into two subsamples of
|
||||
* the respective sample sizes, computing \(D_{n,m}\) for each partition and returning the
|
||||
* proportion of partitions that give \(D\) values exceeding the observed value.</li>
|
||||
* proportion of partitions that give \(D\) values exceeding the observed value. In the very
|
||||
* small sample case, if there are ties in the data, the actual sample values (including ties)
|
||||
* are used in generating the partitions (which are basically multi-set partitions in this
|
||||
* case).</li>
|
||||
* <li>For mid-size samples (product of sample sizes greater than or equal to
|
||||
* {@value #SMALL_SAMPLE_PRODUCT} but less than {@value #LARGE_SAMPLE_PRODUCT}), Monte Carlo
|
||||
* simulation is used to compute the p-value. The simulation randomly generates partitions and
|
||||
|
@ -243,10 +245,10 @@ public class KolmogorovSmirnovTest {
|
|||
public double kolmogorovSmirnovTest(double[] x, double[] y, boolean strict) {
|
||||
final long lengthProduct = (long) x.length * y.length;
|
||||
if (lengthProduct < SMALL_SAMPLE_PRODUCT) {
|
||||
return exactP(kolmogorovSmirnovStatistic(x, y), x.length, y.length, strict);
|
||||
return integralExactP(integralKolmogorovSmirnovStatistic(x, y) + ((strict)?1l:0l), x.length, y.length);
|
||||
}
|
||||
if (lengthProduct < LARGE_SAMPLE_PRODUCT) {
|
||||
return monteCarloP(kolmogorovSmirnovStatistic(x, y), x.length, y.length, strict, MONTE_CARLO_ITERATIONS);
|
||||
return integralMonteCarloP(integralKolmogorovSmirnovStatistic(x, y) + ((strict)?1l:0l), x.length, y.length, MONTE_CARLO_ITERATIONS);
|
||||
}
|
||||
return approximateP(kolmogorovSmirnovStatistic(x, y), x.length, y.length);
|
||||
}
|
||||
|
@ -285,6 +287,25 @@ public class KolmogorovSmirnovTest {
|
|||
* @throws NullArgumentException if either {@code x} or {@code y} is null
|
||||
*/
|
||||
public double kolmogorovSmirnovStatistic(double[] x, double[] y) {
|
||||
return integralKolmogorovSmirnovStatistic(x, y)/((double)(x.length * (long)y.length));
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes the two-sample Kolmogorov-Smirnov test statistic, \(D_{n,m}=\sup_x |F_n(x)-F_m(x)|\)
|
||||
* where \(n\) is the length of {@code x}, \(m\) is the length of {@code y}, \(F_n\) is the
|
||||
* empirical distribution that puts mass \(1/n\) at each of the values in {@code x} and \(F_m\)
|
||||
* is the empirical distribution of the {@code y} values. Finally \(n m D_{n,m}\) is returned
|
||||
* as long value.
|
||||
*
|
||||
* @param x first sample
|
||||
* @param y second sample
|
||||
* @return test statistic \(n m D_{n,m}\) used to evaluate the null hypothesis that {@code x} and
|
||||
* {@code y} represent samples from the same underlying distribution
|
||||
* @throws InsufficientDataException if either {@code x} or {@code y} does not have length at
|
||||
* least 2
|
||||
* @throws NullArgumentException if either {@code x} or {@code y} is null
|
||||
*/
|
||||
private long integralKolmogorovSmirnovStatistic(double[] x, double[] y) {
|
||||
checkArray(x);
|
||||
checkArray(y);
|
||||
// Copy and sort the sample arrays
|
||||
|
@ -297,23 +318,26 @@ public class KolmogorovSmirnovTest {
|
|||
|
||||
int rankX = 0;
|
||||
int rankY = 0;
|
||||
long curD = 0l;
|
||||
|
||||
// Find the max difference between cdf_x and cdf_y
|
||||
double supD = 0d;
|
||||
long supD = 0l;
|
||||
do {
|
||||
double z = Double.compare(sx[rankX], sy[rankY]) <= 0 ? sx[rankX] : sy[rankY];
|
||||
while(rankX < n && Double.compare(sx[rankX], z) == 0) {
|
||||
rankX += 1;
|
||||
curD += m;
|
||||
}
|
||||
while(rankY < m && Double.compare(sy[rankY], z) == 0) {
|
||||
rankY += 1;
|
||||
curD -= n;
|
||||
}
|
||||
final double cdf_x = rankX / (double) n;
|
||||
final double cdf_y = rankY / (double) m;
|
||||
final double curD = FastMath.abs(cdf_x - cdf_y);
|
||||
if (curD > supD) {
|
||||
supD = curD;
|
||||
}
|
||||
else if (-curD > supD) {
|
||||
supD = -curD;
|
||||
}
|
||||
} while(rankX < n && rankY < m);
|
||||
return supD;
|
||||
}
|
||||
|
@ -857,6 +881,32 @@ public class KolmogorovSmirnovTest {
|
|||
return partialSum * 2;
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a d-statistic in the range [0, 1] and the two sample sizes n and m,
|
||||
* an integral d-statistic in the range [0, n*m] is calculated, that can be used for
|
||||
* comparison with other integral d-statistics. Depending whether {@code strict} is
|
||||
* {@code true} or not, the returned value divided by (n*m) is greater than
|
||||
* (resp greater than or equal to) the given d value (allowing some tolerance).
|
||||
*
|
||||
* @param d a d-statistic in the range [0, 1]
|
||||
* @param n first sample size
|
||||
* @param m second sample size
|
||||
* @param strict whether the returned value divided by (n*m) is allowed to be equal to d
|
||||
* @return the integral d-statistic in the range [0, n*m]
|
||||
*/
|
||||
private static long calculateIntegralD(double d, int n, int m, boolean strict) {
|
||||
final double tol = 1e-12; // d-values within tol of one another are considered equal
|
||||
long nm = n * (long)m;
|
||||
long upperBound = (long)FastMath.ceil((d - tol) * nm);
|
||||
long lowerBound = (long)FastMath.floor((d + tol) * nm);
|
||||
if (strict && lowerBound == upperBound) {
|
||||
return upperBound + 1l;
|
||||
}
|
||||
else {
|
||||
return upperBound;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes \(P(D_{n,m} > d)\) if {@code strict} is {@code true}; otherwise \(P(D_{n,m} \ge
|
||||
* d)\), where \(D_{n,m}\) is the 2-sample Kolmogorov-Smirnov statistic. See
|
||||
|
@ -882,11 +932,28 @@ public class KolmogorovSmirnovTest {
|
|||
* greater than (resp. greater than or equal to) {@code d}
|
||||
*/
|
||||
public double exactP(double d, int n, int m, boolean strict) {
|
||||
return integralExactP(calculateIntegralD(d, n, m, strict), n, m);
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes \(P(D_{n,m} >= d/(n*m))\), where \(D_{n,m}\) is the
|
||||
* 2-sample Kolmogorov-Smirnov statistic.
|
||||
* <p>
|
||||
* Here d is the D-statistic represented as long value.
|
||||
* The real D-statistic is obtained by dividing d by n*m.
|
||||
* See also {@link #exactP(double, int, int, boolean)}.
|
||||
*
|
||||
* @param d integral D-statistic
|
||||
* @param n first sample size
|
||||
* @param m second sample size
|
||||
* @return probability that a randomly selected m-n partition of m + n generates \(D_{n,m}\)
|
||||
* greater than or equal to {@code d/(n*m)}
|
||||
*/
|
||||
private double integralExactP(long d, int n, int m) {
|
||||
Iterator<int[]> combinationsIterator = CombinatoricsUtils.combinationsIterator(n + m, n);
|
||||
long tail = 0;
|
||||
final double[] nSet = new double[n];
|
||||
final double[] mSet = new double[m];
|
||||
final double tol = 1e-12; // d-values within tol of one another are considered equal
|
||||
while (combinationsIterator.hasNext()) {
|
||||
// Generate an n-set
|
||||
final int[] nSetI = combinationsIterator.next();
|
||||
|
@ -900,9 +967,67 @@ public class KolmogorovSmirnovTest {
|
|||
mSet[k++] = i;
|
||||
}
|
||||
}
|
||||
final double curD = kolmogorovSmirnovStatistic(nSet, mSet);
|
||||
final int order = Precision.compareTo(curD, d, tol);
|
||||
if (order > 0 || (order == 0 && !strict)) {
|
||||
final long curD = integralKolmogorovSmirnovStatistic(nSet, mSet);
|
||||
if (curD >= d) {
|
||||
tail++;
|
||||
}
|
||||
}
|
||||
return (double) tail / (double) CombinatoricsUtils.binomialCoefficient(n + m, n);
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes the exact p value for a two-sample Kolmogorov-Smirnov test with
|
||||
* {@code x} and {@code y} as samples, possibly containing ties. This method
|
||||
* uses the same implementation as {@link #exactP(double, int, int, boolean)}
|
||||
* with the exception that it examines partitions of the combined sample,
|
||||
* preserving ties in the data. What is returned is the exact probability
|
||||
* that a random partition of the combined dataset into a subset of size
|
||||
* {@code x.length} and another of size {@code y.length} yields a \(D\)
|
||||
* value greater than (resp greater than or equal to) \(D(x,y)\).
|
||||
* <p>
|
||||
* This method should not be used on large samples (a good rule of thumb is
|
||||
* to keep the product of the sample sizes less than
|
||||
* {@link #SMALL_SAMPLE_PRODUCT} when using this method). If the data do
|
||||
* not contain ties, {@link #exactP(double[], double[], boolean)} should be
|
||||
* used instead of this method.</p>
|
||||
*
|
||||
* @param x first sample
|
||||
* @param y second sample
|
||||
* @param strict whether or not the inequality in the null hypothesis is strict
|
||||
* @return p-value
|
||||
*/
|
||||
public double exactP(double[] x, double[] y, boolean strict) {
|
||||
final long d = integralKolmogorovSmirnovStatistic(x, y);
|
||||
final int n = x.length;
|
||||
final int m = y.length;
|
||||
|
||||
// Concatenate x and y into universe, preserving ties in the data
|
||||
final double[] universe = new double[n + m];
|
||||
System.arraycopy(x, 0, universe, 0, n);
|
||||
System.arraycopy(y, 0, universe, n, m);
|
||||
|
||||
// Iterate over all n, m partitions of the n + m elements in the universe,
|
||||
// Computing D for each one
|
||||
Iterator<int[]> combinationsIterator = CombinatoricsUtils.combinationsIterator(n + m, n);
|
||||
long tail = 0;
|
||||
final double[] nSet = new double[n];
|
||||
final double[] mSet = new double[m];
|
||||
while (combinationsIterator.hasNext()) {
|
||||
// Generate an n-set
|
||||
final int[] nSetI = combinationsIterator.next();
|
||||
// Copy the elements of the universe in the n-set to nSet
|
||||
// and the others to mSet
|
||||
int j = 0;
|
||||
int k = 0;
|
||||
for (int i = 0; i < n + m; i++) {
|
||||
if (j < n && nSetI[j] == i) {
|
||||
nSet[j++] = universe[i];
|
||||
} else {
|
||||
mSet[k++] = universe[i];
|
||||
}
|
||||
}
|
||||
final long curD = integralKolmogorovSmirnovStatistic(nSet, mSet);
|
||||
if (curD > d || (curD == d && !strict)) {
|
||||
tail++;
|
||||
}
|
||||
}
|
||||
|
@ -974,35 +1099,49 @@ public class KolmogorovSmirnovTest {
|
|||
*/
|
||||
public double monteCarloP(final double d, final int n, final int m, final boolean strict,
|
||||
final int iterations) {
|
||||
return integralMonteCarloP(calculateIntegralD(d, n, m, strict), n, m, iterations);
|
||||
}
|
||||
|
||||
/**
|
||||
* Uses Monte Carlo simulation to approximate \(P(D_{n,m} >= d/(n*m))\) where \(D_{n,m}\) is the
|
||||
* 2-sample Kolmogorov-Smirnov statistic.
|
||||
* <p>
|
||||
* Here d is the D-statistic represented as long value.
|
||||
* The real D-statistic is obtained by dividing d by n*m.
|
||||
* See also {@link #monteCarloP(double, int, int, boolean, int)}.
|
||||
*
|
||||
* @param d integral D-statistic
|
||||
* @param n first sample size
|
||||
* @param m second sample size
|
||||
* @param iterations number of random partitions to generate
|
||||
* @return proportion of randomly generated m-n partitions of m + n that result in \(D_{n,m}\)
|
||||
* greater than or equal to {@code d/(n*m))}
|
||||
*/
|
||||
private double integralMonteCarloP(final long d, final int n, final int m, final int iterations) {
|
||||
|
||||
// ensure that nn is always the max of (n, m) to require fewer random numbers
|
||||
final int nn = FastMath.max(n, m);
|
||||
final int mm = FastMath.min(n, m);
|
||||
final int sum = nn + mm;
|
||||
final double tol = 1e-12; // d-values within tol of one another are considered equal
|
||||
|
||||
int tail = 0;
|
||||
final boolean b[] = new boolean[sum];
|
||||
for (int i = 0; i < iterations; i++) {
|
||||
fillBooleanArrayRandomlyWithFixedNumberTrueValues(b, nn, rng);
|
||||
int rankN = b[0] ? 1 : 0;
|
||||
int rankM = b[0] ? 0 : 1;
|
||||
boolean previous = b[0];
|
||||
for(int j = 1; j < b.length; ++j) {
|
||||
if (b[j] != previous) {
|
||||
final double cdf_n = rankN / (double) nn;
|
||||
final double cdf_m = rankM / (double) mm;
|
||||
final double curD = FastMath.abs(cdf_n - cdf_m);
|
||||
final int order = Precision.compareTo(curD, d, tol);
|
||||
if (order > 0 || (order == 0 && !strict)) {
|
||||
long curD = 0l;
|
||||
for(int j = 0; j < b.length; ++j) {
|
||||
if (b[j]) {
|
||||
curD += mm;
|
||||
if (curD >= d) {
|
||||
tail++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
previous = b[j];
|
||||
if (b[j]) {
|
||||
rankN++;
|
||||
} else {
|
||||
rankM++;
|
||||
curD -= nn;
|
||||
if (curD <= -d) {
|
||||
tail++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue