Added one-way ANOVA implementation.

JIRA: MATH-173
Contributed by Bob MacCallum.


git-svn-id: https://svn.apache.org/repos/asf/commons/proper/math/trunk@615531 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Phil Steitz 2008-01-26 22:42:43 +00:00
parent 39c77b17fd
commit 41eb3cb7b7
7 changed files with 533 additions and 1 deletions

View File

@ -118,6 +118,9 @@
<contributor>
<name>Piotr Kochanski</name>
</contributor>
<contributor>
<name>Bob MacCallum</name>
</contributor>
<contributor>
<name>Fredrik Norin</name>
</contributor>

View File

@ -0,0 +1,102 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.math.stat.inference;
import org.apache.commons.math.MathException;
import java.util.Collection;
/**
* An interface for one-way ANOVA (analysis of variance).
*
* <p> Tests for differences between two or more categories of univariate data
* (for example, the body mass index of accountants, lawyers, doctors and
* computer programmers). When two categories are given, this is equivalent to
* the {@link org.apache.commons.math.stat.inference.TTest}.
* </p>
*
* @since 1.2
* @version $Revision$ $Date$
*/
public interface OneWayAnova {
/**
* Computes the ANOVA F-value for a collection of <code>double[]</code>
* arrays.
*
* <p><strong>Preconditions</strong>: <ul>
* <li>The categoryData <code>Collection</code> must contain
* <code>double[]</code> arrays.</li>
* <li> There must be at least two <code>double[]</code> arrays in the
* <code>categoryData</code> collection and each of these arrays must
* contain at least two values.</li></ul></p>
*
* @param categoryData <code>Collection</code> of <code>double[]</code>
* arrays each containing data for one category
* @return Fvalue
* @throws IllegalArgumentException if the preconditions are not met
* @throws MathException if the statistic can not be computed do to a
* convergence or other numerical error.
*/
public double anovaFValue(Collection categoryData)
throws IllegalArgumentException, MathException;
/**
* Computes the ANOVA P-value for a collection of <code>double[]</code>
* arrays.
*
* <p><strong>Preconditions</strong>: <ul>
* <li>The categoryData <code>Collection</code> must contain
* <code>double[]</code> arrays.</li>
* <li> There must be at least two <code>double[]</code> arrays in the
* <code>categoryData</code> collection and each of these arrays must
* contain at least two values.</li></ul></p>
*
* @param categoryData <code>Collection</code> of <code>double[]</code>
* arrays each containing data for one category
* @return Pvalue
* @throws IllegalArgumentException if the preconditions are not met
* @throws MathException if the statistic can not be computed do to a
* convergence or other numerical error.
*/
public double anovaPValue(Collection categoryData)
throws IllegalArgumentException, MathException;
/**
* Performs an ANOVA test, evaluating the null hypothesis that there
* is no difference among the means of the data categories.
*
* <p><strong>Preconditions</strong>: <ul>
* <li>The categoryData <code>Collection</code> must contain
* <code>double[]</code> arrays.</li>
* <li> There must be at least two <code>double[]</code> arrays in the
* <code>categoryData</code> collection and each of these arrays must
* contain at least two values.</li>
* <li>alpha must be strictly greater than 0 and less than or equal to 0.5.
* </li></ul></p>
*
* @param categoryData <code>Collection</code> of <code>double[]</code>
* arrays each containing data for one category
* @param alpha significance level of the test
* @return true if the null hypothesis can be rejected with
* confidence 1 - alpha
* @throws IllegalArgumentException if the preconditions are not met
* @throws MathException if the statistic can not be computed do to a
* convergence or other numerical error.
*/
public boolean anovaTest(Collection categoryData, double alpha)
throws IllegalArgumentException, MathException;
}

View File

@ -0,0 +1,208 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.math.stat.inference;
import org.apache.commons.math.MathException;
import org.apache.commons.math.stat.descriptive.summary.Sum;
import org.apache.commons.math.stat.descriptive.summary.SumOfSquares;
import org.apache.commons.math.distribution.FDistribution;
import org.apache.commons.math.distribution.FDistributionImpl;
import java.util.Collection;
import java.util.Iterator;
/**
* Implements one-way ANOVA statistics defined in the {@link OneWayAnovaImpl}
* interface.
*
* <p>Uses the
* {@link org.apache.commons.math.distribution.FDistribution
* commons-math F Distribution implementation} to estimate exact p-values.</p>
*
* <p>This implementation is based on a description at
* http://faculty.vassar.edu/lowry/ch13pt1.html</p>
* <pre>
* Abbreviations: bg = between groups,
* wg = within groups,
* ss = sum squared deviations
* </pre>
*
* @since 1.2
* @version $Revision$ $Date$
*/
public class OneWayAnovaImpl implements OneWayAnova {
/**
* Default constructor.
*/
public OneWayAnovaImpl() {
}
/**
* {@inheritDoc}<p>
* This implementation computes the F statistic using the definitional
* formula<pre>
* F = msbg/mswg</pre>
* where<pre>
* msbg = between group mean square
* mswg = within group mean square</pre>
* are as defined <a href="http://faculty.vassar.edu/lowry/ch13pt1.html">
* here</a></p>
*/
public double anovaFValue(Collection categoryData)
throws IllegalArgumentException, MathException {
AnovaStats a = anovaStats(categoryData);
return a.F;
}
/**
* {@inheritDoc}<p>
* This implementation uses the
* {@link org.apache.commons.math.distribution.FDistribution
* commons-math F Distribution implementation} to estimate the exact
* p-value, using the formula<pre>
* p = 1 - cumulativeProbability(F)</pre>
* where <code>F</code> is the F value and <code>cumulativeProbability</code>
* is the commons-math implementation of the F distribution.</p>
*/
public double anovaPValue(Collection categoryData)
throws IllegalArgumentException, MathException {
AnovaStats a = anovaStats(categoryData);
FDistribution fdist = new FDistributionImpl(a.dfbg, a.dfwg);
return 1.0 - fdist.cumulativeProbability(a.F);
}
/**
* {@inheritDoc}<p>
* This implementation uses the
* {@link org.apache.commons.math.distribution.FDistribution
* commons-math F Distribution implementation} to estimate the exact
* p-value, using the formula<pre>
* p = 1 - cumulativeProbability(F)</pre>
* where <code>F</code> is the F value and <code>cumulativeProbability</code>
* is the commons-math implementation of the F distribution.</p>
* <p>True is returned iff the estimated p-value is less than alpha.</p>
*/
public boolean anovaTest(Collection categoryData, double alpha)
throws IllegalArgumentException, MathException {
if ((alpha <= 0) || (alpha > 0.5)) {
throw new IllegalArgumentException("bad significance level: " + alpha);
}
return (anovaPValue(categoryData) < alpha);
}
/**
* This method actually does the calculations (except P-value).
*
* @param categoryData <code>Collection</code> of <code>double[]</code>
* arrays each containing data for one category
* @return computed AnovaStats
* @throws IllegalArgumentException if categoryData does not meet
* preconditions specified in the interface definition
* @throws MathException if an error occurs computing the Anova stats
*/
private AnovaStats anovaStats(Collection categoryData)
throws IllegalArgumentException, MathException {
// check if we have enough categories
if (categoryData.size() < 2) {
throw new IllegalArgumentException(
"ANOVA: two or more categories required");
}
// check if each category has enough data and all is double[]
for (Iterator iterator = categoryData.iterator(); iterator.hasNext();) {
double[] array;
try {
array = (double[])iterator.next();
} catch (Exception ex) {
throw new IllegalArgumentException(
"ANOVA: categoryData contains non-double[] elements.", ex);
}
if (array.length <= 1) {
throw new IllegalArgumentException(
"ANOVA: one element of categoryData has fewer than 2 values.");
}
}
int dfwg = 0;
double sswg = 0;
Sum totsum = new Sum();
SumOfSquares totsumsq = new SumOfSquares();
int totnum = 0;
for (Iterator iterator = categoryData.iterator(); iterator.hasNext();) {
double[] data = (double[])iterator.next();
Sum sum = new Sum();
SumOfSquares sumsq = new SumOfSquares();
int num = 0;
for (int i = 0; i < data.length; i++) {
double val = data[i];
// within category
num++;
sum.increment(val);
sumsq.increment(val);
// for all categories
totnum++;
totsum.increment(val);
totsumsq.increment(val);
}
dfwg += num - 1;
double ss = sumsq.getResult() - sum.getResult() * sum.getResult() / num;
sswg += ss;
}
double sst = totsumsq.getResult() - totsum.getResult() *
totsum.getResult()/totnum;
double ssbg = sst - sswg;
int dfbg = categoryData.size() - 1;
double msbg = ssbg/dfbg;
double mswg = sswg/dfwg;
double F = msbg/mswg;
return new AnovaStats(dfbg, dfwg, F);
}
/**
Convenience class to pass dfbg,dfwg,F values around within AnovaImpl.
No get/set methods provided.
*/
private static class AnovaStats {
private int dfbg;
private int dfwg;
private double F;
/**
* Constructor
* @param dfbg degrees of freedom in numerator (between groups)
* @param dfwg degrees of freedom in denominator (within groups)
* @param F statistic
*/
AnovaStats(int dfbg, int dfwg, double F) {
this.dfbg = dfbg;
this.dfwg = dfwg;
this.F = F;
}
}
}

72
src/test/R/anovaTestCases Normal file
View File

@ -0,0 +1,72 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#------------------------------------------------------------------------------
# R source file to validate Binomial distribution tests in
# org.apache.commons.math.distribution.BinomialDistributionTest
#
# To run the test, install R, put this file and testFunctions
# into the same directory, launch R from this directory and then enter
# source("<name-of-this-file>")
#
# R functions used
# anova(model) <- anova
# lm(frame) <- linear model
#------------------------------------------------------------------------------
tol <- 1E-12 # error tolerance for tests
#------------------------------------------------------------------------------
# Function definitions
source("testFunctions") # utility test functions
options(digits=16) # override number of digits displayed
# function to verify anova computations
verifyAnova <- function(frame, expectedP, expectedF, frameName) {
a <- anova(lm(frame))
p <- a$"Pr(>F)"[1]
f <- a$"F value"[1]
output <- c("P-value test frame = ", frameName)
if (assertEquals(expectedP,p,tol,"P value")) {
displayPadded(output, SUCCEEDED, WIDTH)
} else {
displayPadded(output, FAILED, WIDTH)
}
output <- c("F-value test frame = ", frameName)
if (assertEquals(expectedF,f,tol,"F value")) {
displayPadded(output, SUCCEEDED, WIDTH)
} else {
displayPadded(output, FAILED, WIDTH)
}
}
#--------------------------------------------------------------------------
cat("Anova test cases\n")
classA <- c(93.0, 103.0, 95.0, 101.0, 91.0, 105.0, 96.0, 94.0, 101.0)
classB <- c(99.0, 92.0, 102.0, 100.0, 102.0, 89.0)
classC <- c(110.0, 115.0, 111.0, 117.0, 128.0, 117.0)
threeClasses = data.frame(val = c(classA, classB, classC),
class=c(rep("classA", length(classA)),
rep("classB", length(classB)),
rep("classC", length(classC))))
verifyAnova(threeClasses,6.959446e-06, 24.67361709460624, "Three classes")
twoClasses = data.frame(val = c(classA, classB),
class=c(rep("classA", length(classA)), rep("classB", length(classB))))
verifyAnova(twoClasses, 0.904212960464, 0.0150579150579, "Two classes")
displayDashes(WIDTH)

View File

@ -39,6 +39,7 @@ source("regressionTestCases")
# inference
source("chiSquareTestCases")
source("anovaTestCases")
# descriptive
source("descriptiveTestCases")

View File

@ -0,0 +1,143 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.math.stat.inference;
import junit.framework.Test;
import junit.framework.TestCase;
import junit.framework.TestSuite;
import java.util.ArrayList;
import java.util.List;
/**
* Test cases for the OneWayAnovaImpl class.
*
* @version $Revision$ $Date$
*/
public class OneWayAnovaTest extends TestCase {
protected OneWayAnova testStatistic = new OneWayAnovaImpl();
private char[] wrongArray = { 'a', 'b', 'c' };
private double[] emptyArray = {};
private double[] classA =
{93.0, 103.0, 95.0, 101.0, 91.0, 105.0, 96.0, 94.0, 101.0 };
private double[] classB =
{99.0, 92.0, 102.0, 100.0, 102.0, 89.0 };
private double[] classC =
{110.0, 115.0, 111.0, 117.0, 128.0, 117.0 };
public OneWayAnovaTest(String name) {
super(name);
}
public void setUp() {
}
public static Test suite() {
TestSuite suite = new TestSuite(OneWayAnovaTest.class);
suite.setName("TestStatistic Tests");
return suite;
}
public void testAnovaFValue() throws Exception {
// Target comparison values computed using R version 2.6.0 (Linux version)
List threeClasses = new ArrayList();
threeClasses.add(classA);
threeClasses.add(classB);
threeClasses.add(classC);
assertEquals("ANOVA F-value", 24.67361709460624,
testStatistic.anovaFValue(threeClasses), 1E-12);
List twoClasses = new ArrayList();
twoClasses.add(classA);
twoClasses.add(classB);
assertEquals("ANOVA F-value", 0.0150579150579,
testStatistic.anovaFValue(twoClasses), 1E-12);
// now try some input hashes which should fail
List wrongContents = new ArrayList();
wrongContents.add(classC);
wrongContents.add(wrongArray);
try {
double bad = testStatistic.anovaFValue(wrongContents);
fail("non double[] hash value for key classX, IllegalArgumentException expected");
} catch (IllegalArgumentException ex) {
// expected
}
List emptyContents = new ArrayList();
emptyContents.add(emptyArray);
emptyContents.add(classC);
try {
double bad = testStatistic.anovaFValue(emptyContents);
fail("empty array for key classX, IllegalArgumentException expected");
} catch (IllegalArgumentException ex) {
// expected
}
List tooFew = new ArrayList();
tooFew.add(classA);
try {
double bad = testStatistic.anovaFValue(tooFew);
fail("less than two classes, IllegalArgumentException expected");
} catch (IllegalArgumentException ex) {
// expected
}
}
public void testAnovaPValue() throws Exception {
// Target comparison values computed using R version 2.6.0 (Linux version)
List threeClasses = new ArrayList();
threeClasses.add(classA);
threeClasses.add(classB);
threeClasses.add(classC);
assertEquals("ANOVA P-value", 6.959446E-06,
testStatistic.anovaPValue(threeClasses), 1E-12);
List twoClasses = new ArrayList();
twoClasses.add(classA);
twoClasses.add(classB);
assertEquals("ANOVA P-value", 0.904212960464,
testStatistic.anovaPValue(twoClasses), 1E-12);
}
public void testAnovaTest() throws Exception {
// Target comparison values computed using R version 2.3.1 (Linux version)
List threeClasses = new ArrayList();
threeClasses.add(classA);
threeClasses.add(classB);
threeClasses.add(classC);
assertTrue("ANOVA Test P<0.01", testStatistic.anovaTest(threeClasses, 0.01));
List twoClasses = new ArrayList();
twoClasses.add(classA);
twoClasses.add(classB);
assertFalse("ANOVA Test P>0.01", testStatistic.anovaTest(twoClasses, 0.01));
}
}

View File

@ -130,6 +130,9 @@ Commons Math Release Notes</title>
<action dev="luc" type="update" issue="MATH-179" due-to="Niall Pemberton">
Add tests for Fraction constructor using double parameter.
</action>
<action dev="psteitz" type="update" issue="MATH-173" due-to="Bob MacCallum">
Added one-way ANOVA implementation.
</action>
</release>
<release version="1.1" date="2005-12-17"
description="This is a maintenance release containing bug fixes and enhancements.