From 41eb3cb7b7156f70bb254c65e3b798e775c96d4f Mon Sep 17 00:00:00 2001
From: Phil Steitz Tests for differences between two or more categories of univariate data
+ * (for example, the body mass index of accountants, lawyers, doctors and
+ * computer programmers). When two categories are given, this is equivalent to
+ * the {@link org.apache.commons.math.stat.inference.TTest}.
+ * Preconditions: double[]
+ * arrays.
+ *
+ *
+ *
Collection
must contain
+ * double[]
arrays.double[]
arrays in the
+ * categoryData
collection and each of these arrays must
+ * contain at least two values.
Collection
of double[]
+ * arrays each containing data for one category
+ * @return Fvalue
+ * @throws IllegalArgumentException if the preconditions are not met
+ * @throws MathException if the statistic cannot be computed due to a
+ * convergence or other numerical error.
+ */
+ public double anovaFValue(Collection categoryData)
+ throws IllegalArgumentException, MathException;
+
+ /**
+ * Computes the ANOVA P-value for a collection of double[]
+ * arrays.
+ *
+ * Preconditions:
Collection
must contain
+ * double[]
arrays.double[]
arrays in the
+ * categoryData
collection and each of these arrays must
+ * contain at least two values.Collection
of double[]
+ * arrays each containing data for one category
+ * @return Pvalue
+ * @throws IllegalArgumentException if the preconditions are not met
+ * @throws MathException if the statistic cannot be computed due to a
+ * convergence or other numerical error.
+ */
+ public double anovaPValue(Collection categoryData)
+ throws IllegalArgumentException, MathException;
+
+ /**
+ * Performs an ANOVA test, evaluating the null hypothesis that there
+ * is no difference among the means of the data categories.
+ *
+ * Preconditions:
Collection
must contain
+ * double[]
arrays.double[]
arrays in the
+ * categoryData
collection and each of these arrays must
+ * contain at least two values.Collection
of double[]
+ * arrays each containing data for one category
+ * @param alpha significance level of the test
+ * @return true if the null hypothesis can be rejected with
+ * confidence 1 - alpha
+ * @throws IllegalArgumentException if the preconditions are not met
+ * @throws MathException if the statistic cannot be computed due to a
+ * convergence or other numerical error.
+ */
+ public boolean anovaTest(Collection categoryData, double alpha)
+ throws IllegalArgumentException, MathException;
+
+}
\ No newline at end of file
diff --git a/src/java/org/apache/commons/math/stat/inference/OneWayAnovaImpl.java b/src/java/org/apache/commons/math/stat/inference/OneWayAnovaImpl.java
new file mode 100644
index 000000000..fc4dc82b4
--- /dev/null
+++ b/src/java/org/apache/commons/math/stat/inference/OneWayAnovaImpl.java
@@ -0,0 +1,208 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.math.stat.inference;
+
+import org.apache.commons.math.MathException;
+import org.apache.commons.math.stat.descriptive.summary.Sum;
+import org.apache.commons.math.stat.descriptive.summary.SumOfSquares;
+
+import org.apache.commons.math.distribution.FDistribution;
+import org.apache.commons.math.distribution.FDistributionImpl;
+
+import java.util.Collection;
+import java.util.Iterator;
+
+
+/**
+ * Implements one-way ANOVA statistics defined in the {@link OneWayAnova}
+ * interface.
+ *
+ * Uses the + * {@link org.apache.commons.math.distribution.FDistribution + * commons-math F Distribution implementation} to estimate exact p-values.
+ * + *This implementation is based on a description at + * http://faculty.vassar.edu/lowry/ch13pt1.html
+ *+ * Abbreviations: bg = between groups, + * wg = within groups, + * ss = sum squared deviations + *+ * + * @since 1.2 + * @version $Revision$ $Date$ + */ +public class OneWayAnovaImpl implements OneWayAnova { + + /** + * Default constructor. + */ + public OneWayAnovaImpl() { + } + + /** + * {@inheritDoc}
+ * This implementation computes the F statistic using the definitional + * formula
+ * F = msbg/mswg+ * where
+ * msbg = between group mean square + * mswg = within group mean square+ * are as defined + * here + */ + public double anovaFValue(Collection categoryData) + throws IllegalArgumentException, MathException { + AnovaStats a = anovaStats(categoryData); + return a.F; + } + + /** + * {@inheritDoc}
+ * This implementation uses the + * {@link org.apache.commons.math.distribution.FDistribution + * commons-math F Distribution implementation} to estimate the exact + * p-value, using the formula
+ * p = 1 - cumulativeProbability(F)+ * where
F
is the F value and cumulativeProbability
+ * is the commons-math implementation of the F distribution.
+ */
+    public double anovaPValue(Collection categoryData)
+        throws IllegalArgumentException, MathException {
+        // Degrees of freedom and F statistic for the supplied categories.
+        final AnovaStats stats = anovaStats(categoryData);
+        final FDistribution fDistribution =
+            new FDistributionImpl(stats.dfbg, stats.dfwg);
+        // p-value is the complement of the F distribution's cumulative
+        // probability evaluated at the observed F statistic.
+        final double cumulative = fDistribution.cumulativeProbability(stats.F);
+        return 1.0 - cumulative;
+    }
+
+ /**
+ * {@inheritDoc}+ * This implementation uses the + * {@link org.apache.commons.math.distribution.FDistribution + * commons-math F Distribution implementation} to estimate the exact + * p-value, using the formula
+ * p = 1 - cumulativeProbability(F)+ * where
F
is the F value and cumulativeProbability
+ * is the commons-math implementation of the F distribution.
+ * True is returned iff the estimated p-value is less than alpha.
+ */ + public boolean anovaTest(Collection categoryData, double alpha) + throws IllegalArgumentException, MathException { + if ((alpha <= 0) || (alpha > 0.5)) { + throw new IllegalArgumentException("bad significance level: " + alpha); + } + return (anovaPValue(categoryData) < alpha); + } + + + /** + * This method actually does the calculations (except P-value). + * + * @param categoryDataCollection
of double[]
+ * arrays each containing data for one category
+ * @return computed AnovaStats
+ * @throws IllegalArgumentException if categoryData does not meet
+ * preconditions specified in the interface definition
+ * @throws MathException if an error occurs computing the Anova stats
+ */
+    private AnovaStats anovaStats(Collection categoryData)
+        throws IllegalArgumentException, MathException {
+
+        // A null collection violates the documented preconditions; report it
+        // as such instead of failing with a NullPointerException below.
+        if (categoryData == null) {
+            throw new IllegalArgumentException(
+                    "ANOVA: categoryData must not be null");
+        }
+
+        // check if we have enough categories
+        if (categoryData.size() < 2) {
+            throw new IllegalArgumentException(
+                    "ANOVA: two or more categories required");
+        }
+
+        // check if each category has enough data and all is double[]
+        for (Iterator iterator = categoryData.iterator(); iterator.hasNext();) {
+            Object element = iterator.next();
+            // instanceof is false for null, so null elements are rejected here
+            // too rather than triggering a NullPointerException on .length.
+            // An explicit type test also avoids catching unrelated exceptions
+            // thrown by the iterator and mislabelling them as type errors.
+            if (!(element instanceof double[])) {
+                throw new IllegalArgumentException(
+                        "ANOVA: categoryData contains non-double[] elements.");
+            }
+            if (((double[]) element).length <= 1) {
+                throw new IllegalArgumentException(
+                        "ANOVA: one element of categoryData has fewer than 2 values.");
+            }
+        }
+
+        int dfwg = 0;                               // within-groups degrees of freedom
+        double sswg = 0;                            // within-groups sum of squared deviations
+        Sum totsum = new Sum();                     // sum over all categories
+        SumOfSquares totsumsq = new SumOfSquares(); // sum of squares over all categories
+        int totnum = 0;                             // total number of observations
+
+        for (Iterator iterator = categoryData.iterator(); iterator.hasNext();) {
+            // Safe cast: every element was verified to be a double[] above.
+            double[] data = (double[]) iterator.next();
+
+            Sum sum = new Sum();
+            SumOfSquares sumsq = new SumOfSquares();
+
+            for (int i = 0; i < data.length; i++) {
+                double val = data[i];
+
+                // within category
+                sum.increment(val);
+                sumsq.increment(val);
+
+                // for all categories
+                totsum.increment(val);
+                totsumsq.increment(val);
+            }
+            totnum += data.length;
+            dfwg += data.length - 1;
+            // sum of squared deviations for this category:
+            // sum(x^2) - (sum(x))^2 / n
+            double ss = sumsq.getResult() -
+                sum.getResult() * sum.getResult() / data.length;
+            sswg += ss;
+        }
+
+        // total sum of squared deviations over all observations
+        double sst = totsumsq.getResult() - totsum.getResult() *
+            totsum.getResult() / totnum;
+        // between-groups part is what remains after removing the within part
+        double ssbg = sst - sswg;
+        int dfbg = categoryData.size() - 1;
+        double msbg = ssbg / dfbg;
+        double mswg = sswg / dfwg;
+        double F = msbg / mswg;
+
+        return new AnovaStats(dfbg, dfwg, F);
+    }
+
+    /**
+     * Immutable holder used to pass the dfbg, dfwg and F values around within
+     * OneWayAnovaImpl. No get/set methods are provided; the fields are read
+     * directly by the enclosing class.
+     */
+    private static class AnovaStats {
+        /** Degrees of freedom in numerator (between groups). */
+        private final int dfbg;
+        /** Degrees of freedom in denominator (within groups). */
+        private final int dfwg;
+        /** F statistic. */
+        private final double F;
+
+        /**
+         * Constructor
+         * @param dfbg degrees of freedom in numerator (between groups)
+         * @param dfwg degrees of freedom in denominator (within groups)
+         * @param F statistic
+         */
+        AnovaStats(int dfbg, int dfwg, double F) {
+            this.dfbg = dfbg;
+            this.dfwg = dfwg;
+            this.F = F;
+        }
+    }
+
+}
\ No newline at end of file
diff --git a/src/test/R/anovaTestCases b/src/test/R/anovaTestCases
new file mode 100644
index 000000000..077ba098a
--- /dev/null
+++ b/src/test/R/anovaTestCases
@@ -0,0 +1,72 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#------------------------------------------------------------------------------
+# R source file to validate the one-way ANOVA tests for
+# org.apache.commons.math.stat.inference.OneWayAnovaImpl
+#
+# To run the test, install R, put this file and testFunctions
+# into the same directory, launch R from this directory and then enter
+# source("