Allow direct use of SummaryStatistics in one-way ANOVA.

Patch provided by Peter Andrews.

JIRA: MATH-877

git-svn-id: https://svn.apache.org/repos/asf/commons/proper/math/trunk@1456958 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Luc Maisonobe 2013-03-15 13:55:27 +00:00
parent 2e66239120
commit 606fdac7df
4 changed files with 157 additions and 49 deletions

View File

@ -138,6 +138,9 @@
<contributor>
<name>Mark Anderson</name>
</contributor>
<contributor>
<name>Peter Andrews</name>
</contributor>
<contributor>
<name>R&#233;mi Arntzen</name>
</contributor>

View File

@ -55,6 +55,9 @@ This is a minor release: It combines bug fixes and new features.
Changes to existing features were made in a backwards-compatible
way such as to allow drop-in replacement of the v3.1[.1] JAR file.
">
<action dev="luc" type="update" issue="MATH-877" due-to="Peter Andrews">
Allow direct use of SummaryStatistics in one-way ANOVA.
</action>
<action dev="luc" type="fix" issue="MATH-947" >
Fixed infinite loop when NaN occurs in singular value decomposition.
</action>

View File

@ -16,6 +16,9 @@
*/
package org.apache.commons.math3.stat.inference;
import java.util.ArrayList;
import java.util.Collection;
import org.apache.commons.math3.distribution.FDistribution;
import org.apache.commons.math3.exception.ConvergenceException;
import org.apache.commons.math3.exception.DimensionMismatchException;
@ -23,10 +26,8 @@ import org.apache.commons.math3.exception.MaxCountExceededException;
import org.apache.commons.math3.exception.NullArgumentException;
import org.apache.commons.math3.exception.OutOfRangeException;
import org.apache.commons.math3.exception.util.LocalizedFormats;
import org.apache.commons.math3.stat.descriptive.summary.Sum;
import org.apache.commons.math3.stat.descriptive.summary.SumOfSquares;
import java.util.Collection;
import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
import org.apache.commons.math3.util.MathUtils;
/**
* Implements one-way ANOVA (analysis of variance) statistics.
@ -131,6 +132,82 @@ public class OneWayAnova {
}
/**
 * Computes the ANOVA P-value for a collection of {@link SummaryStatistics}.
 *
 * <p><strong>Preconditions</strong>: <ul>
 * <li>The categoryData <code>Collection</code> must contain
 * {@link SummaryStatistics}.</li>
 * <li>Unless <code>allowOneElementData</code> is true, there must be at
 * least two {@link SummaryStatistics} in the <code>categoryData</code>
 * collection and each of these statistics must contain at least two
 * values.</li></ul></p><p>
 * This implementation uses the
 * {@link org.apache.commons.math3.distribution.FDistribution
 * commons-math F Distribution implementation} to estimate the exact
 * p-value, using the formula<pre>
 *   p = 1 - cumulativeProbability(F)</pre>
 * where <code>F</code> is the F value and <code>cumulativeProbability</code>
 * is the commons-math implementation of the F distribution.</p>
 *
 * @param categoryData <code>Collection</code> of {@link SummaryStatistics}
 * each containing data for one category
 * @param allowOneElementData if true, allow computation for one category
 * only or for one data element per category
 * @return the P-value
 * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
 * @throws DimensionMismatchException if <code>allowOneElementData</code> is
 * false and the <code>categoryData</code> collection holds fewer than two
 * categories or a contained {@link SummaryStatistics} does not have
 * at least two values
 * @throws ConvergenceException if the p-value can not be computed due to a convergence error
 * @throws MaxCountExceededException if the maximum number of iterations is exceeded
 */
public double anovaPValue(final Collection<SummaryStatistics> categoryData,
                          final boolean allowOneElementData)
    throws NullArgumentException, DimensionMismatchException,
           ConvergenceException, MaxCountExceededException {

    // Validation and the F statistic itself are delegated to anovaStats.
    final AnovaStats stats = anovaStats(categoryData, allowOneElementData);

    final double cumulative =
        new FDistribution(stats.dfbg, stats.dfwg).cumulativeProbability(stats.F);
    return 1.0 - cumulative;
}
/**
 * Adapter for raw array input: summarizes every <code>double[]</code>
 * category into a {@link SummaryStatistics} instance and hands the result
 * to the {@link SummaryStatistics}-based computation (everything except
 * the P-value), with one-element data disallowed.
 *
 * @param categoryData
 *            <code>Collection</code> of <code>double[]</code> arrays each
 *            containing data for one category
 * @return computed AnovaStats
 * @throws NullArgumentException
 *             if <code>categoryData</code> is <code>null</code>
 * @throws DimensionMismatchException
 *             if the <code>categoryData</code> collection holds fewer
 *             than 2 categories or a contained <code>double[]</code> array
 *             does not contain at least two values
 */
private AnovaStats anovaStats(final Collection<double[]> categoryData)
    throws NullArgumentException, DimensionMismatchException {

    MathUtils.checkNotNull(categoryData);

    final Collection<SummaryStatistics> summaries =
        new ArrayList<SummaryStatistics>(categoryData.size());

    // one SummaryStatistics per input array, in iteration order
    for (final double[] category : categoryData) {
        final SummaryStatistics summary = new SummaryStatistics();
        for (int i = 0; i < category.length; i++) {
            summary.addValue(category[i]);
        }
        summaries.add(summary);
    }

    return anovaStats(summaries, false);
}
/**
* Performs an ANOVA test, evaluating the null hypothesis that there
* is no difference among the means of the data categories.
@ -184,73 +261,65 @@ public class OneWayAnova {
*
* @param categoryData <code>Collection</code> of <code>double[]</code>
* arrays each containing data for one category
* @param allowOneElementData if true, allow computation for one category
* only or for one data element per category
* @return computed AnovaStats
* @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
* @throws DimensionMismatchException if the length of the <code>categoryData</code>
* array is less than 2 or a contained <code>double[]</code> array does not contain
* @throws DimensionMismatchException if <code>allowOneElementData</code> is false and the number of
* categories is less than 2 or a contained SummaryStatistics does not contain
* at least two values
*/
private AnovaStats anovaStats(final Collection<double[]> categoryData)
private AnovaStats anovaStats(final Collection<SummaryStatistics> categoryData,
final boolean allowOneElementData)
throws NullArgumentException, DimensionMismatchException {
if (categoryData == null) {
throw new NullArgumentException();
}
MathUtils.checkNotNull(categoryData);
// check if we have enough categories
if (categoryData.size() < 2) {
throw new DimensionMismatchException(
LocalizedFormats.TWO_OR_MORE_CATEGORIES_REQUIRED,
categoryData.size(), 2);
}
if (!allowOneElementData) {
// check if we have enough categories
if (categoryData.size() < 2) {
throw new DimensionMismatchException(LocalizedFormats.TWO_OR_MORE_CATEGORIES_REQUIRED,
categoryData.size(), 2);
}
// check if each category has enough data and all is double[]
for (double[] array : categoryData) {
if (array.length <= 1) {
throw new DimensionMismatchException(
LocalizedFormats.TWO_OR_MORE_VALUES_IN_CATEGORY_REQUIRED,
array.length, 2);
// check if each category has enough data
for (final SummaryStatistics array : categoryData) {
if (array.getN() <= 1) {
throw new DimensionMismatchException(LocalizedFormats.TWO_OR_MORE_VALUES_IN_CATEGORY_REQUIRED,
(int) array.getN(), 2);
}
}
}
int dfwg = 0;
double sswg = 0;
Sum totsum = new Sum();
SumOfSquares totsumsq = new SumOfSquares();
double totsum = 0;
double totsumsq = 0;
int totnum = 0;
for (double[] data : categoryData) {
for (final SummaryStatistics data : categoryData) {
Sum sum = new Sum();
SumOfSquares sumsq = new SumOfSquares();
int num = 0;
final double sum = data.getSum();
final double sumsq = data.getSumsq();
final int num = (int) data.getN();
totnum += num;
totsum += sum;
totsumsq += sumsq;
for (int i = 0; i < data.length; i++) {
double val = data[i];
// within category
num++;
sum.increment(val);
sumsq.increment(val);
// for all categories
totnum++;
totsum.increment(val);
totsumsq.increment(val);
}
dfwg += num - 1;
double ss = sumsq.getResult() - sum.getResult() * sum.getResult() / num;
final double ss = sumsq - ((sum * sum) / num);
sswg += ss;
}
double sst = totsumsq.getResult() - totsum.getResult() *
totsum.getResult()/totnum;
double ssbg = sst - sswg;
int dfbg = categoryData.size() - 1;
double msbg = ssbg/dfbg;
double mswg = sswg/dfwg;
double F = msbg/mswg;
final double sst = totsumsq - ((totsum * totsum) / totnum);
final double ssbg = sst - sswg;
final int dfbg = categoryData.size() - 1;
final double msbg = ssbg / dfbg;
final double mswg = sswg / dfwg;
final double F = msbg / mswg;
return new AnovaStats(dfbg, dfwg, F);
}
/**

View File

@ -20,6 +20,7 @@ import java.util.ArrayList;
import java.util.List;
import org.apache.commons.math3.exception.MathIllegalArgumentException;
import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
import org.junit.Assert;
import org.junit.Test;
@ -102,6 +103,38 @@ public class OneWayAnovaTest {
}
@Test
public void testAnovaPValueSummaryStatistics() {
    // Target comparison values computed using R version 2.6.0 (Linux version)
    final double expectedThreeClasses = 6.959446E-06;
    final double expectedTwoClasses = 0.904212960464;
    final double tolerance = 1E-12;

    // Summarize each category first; the summaries are shared by both checks.
    final SummaryStatistics summaryA = new SummaryStatistics();
    for (double value : classA) {
        summaryA.addValue(value);
    }
    final SummaryStatistics summaryB = new SummaryStatistics();
    for (double value : classB) {
        summaryB.addValue(value);
    }
    final SummaryStatistics summaryC = new SummaryStatistics();
    for (double value : classC) {
        summaryC.addValue(value);
    }

    // All three categories, one-element data allowed.
    final List<SummaryStatistics> threeClasses = new ArrayList<SummaryStatistics>();
    threeClasses.add(summaryA);
    threeClasses.add(summaryB);
    threeClasses.add(summaryC);
    Assert.assertEquals("ANOVA P-value", expectedThreeClasses,
                        testStatistic.anovaPValue(threeClasses, true), tolerance);

    // First two categories only, with the strict size checks enabled.
    final List<SummaryStatistics> twoClasses = new ArrayList<SummaryStatistics>();
    twoClasses.add(summaryA);
    twoClasses.add(summaryB);
    Assert.assertEquals("ANOVA P-value", expectedTwoClasses,
                        testStatistic.anovaPValue(twoClasses, false), tolerance);
}
@Test
public void testAnovaTest() {
// Target comparison values computed using R version 2.3.1 (Linux version)