From 41eb3cb7b7156f70bb254c65e3b798e775c96d4f Mon Sep 17 00:00:00 2001 From: Phil Steitz Date: Sat, 26 Jan 2008 22:42:43 +0000 Subject: [PATCH] Added one-way ANOVA implementation. JIRA: MATH-173 Contributed by Bob MacCallum. git-svn-id: https://svn.apache.org/repos/asf/commons/proper/math/trunk@615531 13f79535-47bb-0310-9956-ffa450edef68 --- pom.xml | 3 + .../math/stat/inference/OneWayAnova.java | 102 +++++++++ .../math/stat/inference/OneWayAnovaImpl.java | 208 ++++++++++++++++++ src/test/R/anovaTestCases | 72 ++++++ src/test/R/testAll | 1 + .../math/stat/inference/OneWayAnovaTest.java | 143 ++++++++++++ xdocs/changes.xml | 5 +- 7 files changed, 533 insertions(+), 1 deletion(-) create mode 100644 src/java/org/apache/commons/math/stat/inference/OneWayAnova.java create mode 100644 src/java/org/apache/commons/math/stat/inference/OneWayAnovaImpl.java create mode 100644 src/test/R/anovaTestCases create mode 100644 src/test/org/apache/commons/math/stat/inference/OneWayAnovaTest.java diff --git a/pom.xml b/pom.xml index 1f1fbe423..c325cb2b8 100644 --- a/pom.xml +++ b/pom.xml @@ -118,6 +118,9 @@ Piotr Kochanski + + Bob MacCallum + Fredrik Norin diff --git a/src/java/org/apache/commons/math/stat/inference/OneWayAnova.java b/src/java/org/apache/commons/math/stat/inference/OneWayAnova.java new file mode 100644 index 000000000..bce9a1603 --- /dev/null +++ b/src/java/org/apache/commons/math/stat/inference/OneWayAnova.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math.stat.inference; + +import org.apache.commons.math.MathException; +import java.util.Collection; + +/** + * An interface for one-way ANOVA (analysis of variance). + * + *

Tests for differences between two or more categories of univariate data + * (for example, the body mass index of accountants, lawyers, doctors and + * computer programmers). When two categories are given, this is equivalent to + * the {@link org.apache.commons.math.stat.inference.TTest}. + *

+ * + * @since 1.2 + * @version $Revision$ $Date$ + */ +public interface OneWayAnova { + /** + * Computes the ANOVA F-value for a collection of double[] + * arrays. + * + *

Preconditions:

+ * + * @param categoryData Collection of double[] + * arrays each containing data for one category + * @return Fvalue + * @throws IllegalArgumentException if the preconditions are not met + * @throws MathException if the statistic can not be computed do to a + * convergence or other numerical error. + */ + public double anovaFValue(Collection categoryData) + throws IllegalArgumentException, MathException; + + /** + * Computes the ANOVA P-value for a collection of double[] + * arrays. + * + *

Preconditions:

+ * + * @param categoryData Collection of double[] + * arrays each containing data for one category + * @return Pvalue + * @throws IllegalArgumentException if the preconditions are not met + * @throws MathException if the statistic can not be computed do to a + * convergence or other numerical error. + */ + public double anovaPValue(Collection categoryData) + throws IllegalArgumentException, MathException; + + /** + * Performs an ANOVA test, evaluating the null hypothesis that there + * is no difference among the means of the data categories. + * + *

Preconditions:

+ * + * @param categoryData Collection of double[] + * arrays each containing data for one category + * @param alpha significance level of the test + * @return true if the null hypothesis can be rejected with + * confidence 1 - alpha + * @throws IllegalArgumentException if the preconditions are not met + * @throws MathException if the statistic can not be computed do to a + * convergence or other numerical error. + */ + public boolean anovaTest(Collection categoryData, double alpha) + throws IllegalArgumentException, MathException; + +} \ No newline at end of file diff --git a/src/java/org/apache/commons/math/stat/inference/OneWayAnovaImpl.java b/src/java/org/apache/commons/math/stat/inference/OneWayAnovaImpl.java new file mode 100644 index 000000000..fc4dc82b4 --- /dev/null +++ b/src/java/org/apache/commons/math/stat/inference/OneWayAnovaImpl.java @@ -0,0 +1,208 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math.stat.inference; + +import org.apache.commons.math.MathException; +import org.apache.commons.math.stat.descriptive.summary.Sum; +import org.apache.commons.math.stat.descriptive.summary.SumOfSquares; + +import org.apache.commons.math.distribution.FDistribution; +import org.apache.commons.math.distribution.FDistributionImpl; + +import java.util.Collection; +import java.util.Iterator; + + +/** + * Implements one-way ANOVA statistics defined in the {@link OneWayAnovaImpl} + * interface. + * + *

Uses the + * {@link org.apache.commons.math.distribution.FDistribution + * commons-math F Distribution implementation} to estimate exact p-values.

+ * + *

This implementation is based on a description at + * http://faculty.vassar.edu/lowry/ch13pt1.html

+ *
+ * Abbreviations: bg = between groups,
+ *                wg = within groups,
+ *                ss = sum squared deviations
+ * 
+ * + * @since 1.2 + * @version $Revision$ $Date$ + */ +public class OneWayAnovaImpl implements OneWayAnova { + + /** + * Default constructor. + */ + public OneWayAnovaImpl() { + } + + /** + * {@inheritDoc}

+ * This implementation computes the F statistic using the definitional + * formula

+     *   F = msbg/mswg
+ * where
+     *  msbg = between group mean square
+     *  mswg = within group mean square
+ * are as defined + * here

+ */ + public double anovaFValue(Collection categoryData) + throws IllegalArgumentException, MathException { + AnovaStats a = anovaStats(categoryData); + return a.F; + } + + /** + * {@inheritDoc}

+ * This implementation uses the + * {@link org.apache.commons.math.distribution.FDistribution + * commons-math F Distribution implementation} to estimate the exact + * p-value, using the formula

+     *   p = 1 - cumulativeProbability(F)
+ * where F is the F value and cumulativeProbability + * is the commons-math implementation of the F distribution.

+ */ + public double anovaPValue(Collection categoryData) + throws IllegalArgumentException, MathException { + AnovaStats a = anovaStats(categoryData); + FDistribution fdist = new FDistributionImpl(a.dfbg, a.dfwg); + return 1.0 - fdist.cumulativeProbability(a.F); + } + + /** + * {@inheritDoc}

+ * This implementation uses the + * {@link org.apache.commons.math.distribution.FDistribution + * commons-math F Distribution implementation} to estimate the exact + * p-value, using the formula

+     *   p = 1 - cumulativeProbability(F)
+ * where F is the F value and cumulativeProbability + * is the commons-math implementation of the F distribution.

+ *

True is returned iff the estimated p-value is less than alpha.

+ */ + public boolean anovaTest(Collection categoryData, double alpha) + throws IllegalArgumentException, MathException { + if ((alpha <= 0) || (alpha > 0.5)) { + throw new IllegalArgumentException("bad significance level: " + alpha); + } + return (anovaPValue(categoryData) < alpha); + } + + + /** + * This method actually does the calculations (except P-value). + * + * @param categoryData Collection of double[] + * arrays each containing data for one category + * @return computed AnovaStats + * @throws IllegalArgumentException if categoryData does not meet + * preconditions specified in the interface definition + * @throws MathException if an error occurs computing the Anova stats + */ + private AnovaStats anovaStats(Collection categoryData) + throws IllegalArgumentException, MathException { + + // check if we have enough categories + if (categoryData.size() < 2) { + throw new IllegalArgumentException( + "ANOVA: two or more categories required"); + } + + // check if each category has enough data and all is double[] + for (Iterator iterator = categoryData.iterator(); iterator.hasNext();) { + double[] array; + try { + array = (double[])iterator.next(); + } catch (Exception ex) { + throw new IllegalArgumentException( + "ANOVA: categoryData contains non-double[] elements.", ex); + } + if (array.length <= 1) { + throw new IllegalArgumentException( + "ANOVA: one element of categoryData has fewer than 2 values."); + } + } + + int dfwg = 0; + double sswg = 0; + Sum totsum = new Sum(); + SumOfSquares totsumsq = new SumOfSquares(); + int totnum = 0; + + for (Iterator iterator = categoryData.iterator(); iterator.hasNext();) { + double[] data = (double[])iterator.next(); + + Sum sum = new Sum(); + SumOfSquares sumsq = new SumOfSquares(); + int num = 0; + + for (int i = 0; i < data.length; i++) { + double val = data[i]; + + // within category + num++; + sum.increment(val); + sumsq.increment(val); + + // for all categories + totnum++; + totsum.increment(val); + totsumsq.increment(val); + } + dfwg += num - 1; + double ss = sumsq.getResult() - sum.getResult() * sum.getResult() / num; + sswg += ss; + } + double sst = totsumsq.getResult() - totsum.getResult() * + totsum.getResult()/totnum; + double ssbg = sst - sswg; + int dfbg = categoryData.size() - 1; + double msbg = ssbg/dfbg; + double mswg = sswg/dfwg; + double F = msbg/mswg; + + return new AnovaStats(dfbg, dfwg, F); + } + + /** + Convenience class to pass dfbg,dfwg,F values around within AnovaImpl. + No get/set methods provided. + */ + private static class AnovaStats { + private int dfbg; + private int dfwg; + private double F; + + /** + * Constructor + * @param dfbg degrees of freedom in numerator (between groups) + * @param dfwg degrees of freedom in denominator (within groups) + * @param F statistic + */ + AnovaStats(int dfbg, int dfwg, double F) { + this.dfbg = dfbg; + this.dfwg = dfwg; + this.F = F; + } + } + +} \ No newline at end of file diff --git a/src/test/R/anovaTestCases b/src/test/R/anovaTestCases new file mode 100644 index 000000000..077ba098a --- /dev/null +++ b/src/test/R/anovaTestCases @@ -0,0 +1,72 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +#------------------------------------------------------------------------------ +# R source file to validate Binomial distribution tests in +# org.apache.commons.math.distribution.BinomialDistributionTest +# +# To run the test, install R, put this file and testFunctions +# into the same directory, launch R from this directory and then enter +# source("") +# +# R functions used +# anova(model) <- anova +# lm(frame) <- linear model +#------------------------------------------------------------------------------ +tol <- 1E-12 # error tolerance for tests +#------------------------------------------------------------------------------ +# Function definitions + +source("testFunctions") # utility test functions +options(digits=16) # override number of digits displayed + +# function to verify anova computations + +verifyAnova <- function(frame, expectedP, expectedF, frameName) { + a <- anova(lm(frame)) + p <- a$"Pr(>F)"[1] + f <- a$"F value"[1] + output <- c("P-value test frame = ", frameName) + if (assertEquals(expectedP,p,tol,"P value")) { + displayPadded(output, SUCCEEDED, WIDTH) + } else { + displayPadded(output, FAILED, WIDTH) + } + output <- c("F-value test frame = ", frameName) + if (assertEquals(expectedF,f,tol,"F value")) { + displayPadded(output, SUCCEEDED, WIDTH) + } else { + displayPadded(output, FAILED, WIDTH) + } +} + +#-------------------------------------------------------------------------- +cat("Anova test cases\n") +classA <- c(93.0, 103.0, 95.0, 101.0, 91.0, 105.0, 96.0, 94.0, 101.0) +classB <- c(99.0, 92.0, 102.0, 100.0, 102.0, 89.0) +classC <- c(110.0, 115.0, 111.0, 117.0, 128.0, 117.0) + +threeClasses = data.frame(val = c(classA, classB, classC), +class=c(rep("classA", length(classA)), + rep("classB", length(classB)), + rep("classC", length(classC)))) + +verifyAnova(threeClasses,6.959446e-06, 24.67361709460624, "Three classes") + +twoClasses = data.frame(val = c(classA, classB), +class=c(rep("classA", length(classA)), rep("classB", length(classB)))) +verifyAnova(twoClasses, 0.904212960464, 0.0150579150579, "Two classes") + +displayDashes(WIDTH) \ No newline at end of file diff --git a/src/test/R/testAll b/src/test/R/testAll index 7c5137b6a..2ac1f5388 100644 --- a/src/test/R/testAll +++ b/src/test/R/testAll @@ -39,6 +39,7 @@ source("regressionTestCases") # inference source("chiSquareTestCases") +source("anovaTestCases") # descriptive source("descriptiveTestCases") diff --git a/src/test/org/apache/commons/math/stat/inference/OneWayAnovaTest.java b/src/test/org/apache/commons/math/stat/inference/OneWayAnovaTest.java new file mode 100644 index 000000000..d4778f1d7 --- /dev/null +++ b/src/test/org/apache/commons/math/stat/inference/OneWayAnovaTest.java @@ -0,0 +1,143 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math.stat.inference; + +import junit.framework.Test; +import junit.framework.TestCase; +import junit.framework.TestSuite; + +import java.util.ArrayList; +import java.util.List; + +/** + * Test cases for the OneWayAnovaImpl class. + * + * @version $Revision$ $Date$ + */ + +public class OneWayAnovaTest extends TestCase { + + protected OneWayAnova testStatistic = new OneWayAnovaImpl(); + + private char[] wrongArray = { 'a', 'b', 'c' }; + private double[] emptyArray = {}; + + private double[] classA = + {93.0, 103.0, 95.0, 101.0, 91.0, 105.0, 96.0, 94.0, 101.0 }; + private double[] classB = + {99.0, 92.0, 102.0, 100.0, 102.0, 89.0 }; + private double[] classC = + {110.0, 115.0, 111.0, 117.0, 128.0, 117.0 }; + + public OneWayAnovaTest(String name) { + super(name); + } + + public void setUp() { + } + + public static Test suite() { + TestSuite suite = new TestSuite(OneWayAnovaTest.class); + suite.setName("TestStatistic Tests"); + return suite; + } + + public void testAnovaFValue() throws Exception { + // Target comparison values computed using R version 2.6.0 (Linux version) + List threeClasses = new ArrayList(); + threeClasses.add(classA); + threeClasses.add(classB); + threeClasses.add(classC); + + assertEquals("ANOVA F-value", 24.67361709460624, + testStatistic.anovaFValue(threeClasses), 1E-12); + + List twoClasses = new ArrayList(); + twoClasses.add(classA); + twoClasses.add(classB); + + assertEquals("ANOVA F-value", 0.0150579150579, + testStatistic.anovaFValue(twoClasses), 1E-12); + + // now try some input hashes which should fail + List wrongContents = new ArrayList(); + wrongContents.add(classC); + wrongContents.add(wrongArray); + try { + double bad = testStatistic.anovaFValue(wrongContents); + fail("non double[] hash value for key classX, IllegalArgumentException expected"); + } catch (IllegalArgumentException ex) { + // expected + } + + List emptyContents = new ArrayList(); + emptyContents.add(emptyArray); + emptyContents.add(classC); + try { + double bad = testStatistic.anovaFValue(emptyContents); + fail("empty array for key classX, IllegalArgumentException expected"); + } catch (IllegalArgumentException ex) { + // expected + } + + List tooFew = new ArrayList(); + tooFew.add(classA); + try { + double bad = testStatistic.anovaFValue(tooFew); + fail("less than two classes, IllegalArgumentException expected"); + } catch (IllegalArgumentException ex) { + // expected + } + } + + + public void testAnovaPValue() throws Exception { + // Target comparison values computed using R version 2.6.0 (Linux version) + List threeClasses = new ArrayList(); + threeClasses.add(classA); + threeClasses.add(classB); + threeClasses.add(classC); + + assertEquals("ANOVA P-value", 6.959446E-06, + testStatistic.anovaPValue(threeClasses), 1E-12); + + List twoClasses = new ArrayList(); + twoClasses.add(classA); + twoClasses.add(classB); + + assertEquals("ANOVA P-value", 0.904212960464, + testStatistic.anovaPValue(twoClasses), 1E-12); + + } + + public void testAnovaTest() throws Exception { + // Target comparison values computed using R version 2.3.1 (Linux version) + List threeClasses = new ArrayList(); + threeClasses.add(classA); + threeClasses.add(classB); + threeClasses.add(classC); + + assertTrue("ANOVA Test P<0.01", testStatistic.anovaTest(threeClasses, 0.01)); + + List twoClasses = new ArrayList(); + twoClasses.add(classA); + twoClasses.add(classB); + + assertFalse("ANOVA Test P>0.01", testStatistic.anovaTest(twoClasses, 0.01)); + } + +} \ No newline at end of file diff --git a/xdocs/changes.xml b/xdocs/changes.xml index 6acb88250..f2501f94f 100644 --- a/xdocs/changes.xml +++ b/xdocs/changes.xml @@ -129,7 +129,10 @@ Commons Math Release Notes Add tests for Fraction constructor using double parameter. - + + + Added one-way ANOVA implementation. +