Changed Mean.evaluate() to use a two-pass algorithm, improving accuracy

by exploiting the fact that this method has access to the full
array of data values.


git-svn-id: https://svn.apache.org/repos/asf/commons/proper/math/trunk@602306 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Phil Steitz 2007-12-08 02:59:53 +00:00
parent 8583cdfe79
commit 458abe99c2
3 changed files with 64 additions and 39 deletions

View File

@ -22,24 +22,32 @@ import org.apache.commons.math.stat.descriptive.AbstractStorelessUnivariateStati
import org.apache.commons.math.stat.descriptive.summary.Sum; import org.apache.commons.math.stat.descriptive.summary.Sum;
/** /**
* Returns the arithmetic mean of the available values. Uses the definitional * <p>Computes the arithmetic mean of a set of values. Uses the definitional
* formula: * formula:</p>
* <p> * <p>
* mean = sum(x_i) / n * mean = sum(x_i) / n
* <p> * </p>
* where <code>n</code> is the number of observations. * <p>where <code>n</code> is the number of observations.
* <p> * </p>
* The value of the statistic is computed using the following recursive * <p>When {@link #increment(double)} is used to add data incrementally from a
* updating algorithm: * stream of (unstored) values, the value of the statistic that
* <p> * {@link #getResult()} returns is computed using the following recursive
* updating algorithm: </p>
* <ol> * <ol>
* <li>Initialize <code>m = </code> the first value</li> * <li>Initialize <code>m = </code> the first value</li>
* <li>For each additional value, update using <br> * <li>For each additional value, update using <br>
* <code>m = m + (new value - m) / (number of observations)</code></li> * <code>m = m + (new value - m) / (number of observations)</code></li>
* </ol> * </ol>
* <p> If {@link #evaluate(double[])} is used to compute the mean of an array
* of stored values, a two-pass, corrected algorithm is used, starting with
* the definitional formula computed using the array of stored values and then
* correcting this by adding the mean deviation of the data values from the
* arithmetic mean. See, e.g. "Comparison of Several Algorithms for Computing
* Sample Means and Variances," Robert F. Ling, Journal of the American
* Statistical Association, Vol. 69, No. 348 (Dec., 1974), pp. 859-866. </p>
* <p> * <p>
* Returns <code>Double.NaN</code> if the dataset is empty. * Returns <code>Double.NaN</code> if the dataset is empty.
* <p> * </p>
* <strong>Note that this implementation is not synchronized.</strong> If * <strong>Note that this implementation is not synchronized.</strong> If
* multiple threads access an instance of this class concurrently, and at least * multiple threads access an instance of this class concurrently, and at least
* one of the threads invokes the <code>increment()</code> or * one of the threads invokes the <code>increment()</code> or
@ -131,7 +139,17 @@ public class Mean extends AbstractStorelessUnivariateStatistic
public double evaluate(final double[] values,final int begin, final int length) { public double evaluate(final double[] values,final int begin, final int length) {
if (test(values, begin, length)) { if (test(values, begin, length)) {
Sum sum = new Sum(); Sum sum = new Sum();
return sum.evaluate(values, begin, length) / ((double) length); double sampleSize = (double) length;
// Compute initial estimate using definitional formula
double xbar = sum.evaluate(values, begin, length) / sampleSize;
// Compute correction factor in second pass
double correction = 0;
for (int i = begin; i < begin + length; i++) {
correction += (values[i] - xbar);
}
return xbar + (correction/sampleSize);
} }
return Double.NaN; return Double.NaN;
} }

View File

@ -61,57 +61,59 @@ public class CertifiedDataTest extends TestCase {
} }
/** /**
* Test StorelessDescriptiveStatistics * Test SummaryStatistics - implementations that do not store the data
* and use single pass algorithms to compute statistics
*/ */
public void testUnivariateImpl() throws Exception { public void testSummaryStatistics() throws Exception {
SummaryStatistics u = SummaryStatistics.newInstance(SummaryStatisticsImpl.class); SummaryStatistics u = SummaryStatistics.newInstance(SummaryStatisticsImpl.class);
loadStats("data/PiDigits.txt", u); loadStats("data/PiDigits.txt", u);
assertEquals("PiDigits: std", std, u.getStandardDeviation(), .0000000000001); assertEquals("PiDigits: std", std, u.getStandardDeviation(), 1E-13);
assertEquals("PiDigits: mean", mean, u.getMean(), .0000000000001); assertEquals("PiDigits: mean", mean, u.getMean(), 1E-13);
loadStats("data/Mavro.txt", u); loadStats("data/Mavro.txt", u);
assertEquals("Mavro: std", std, u.getStandardDeviation(), .00000000000001); assertEquals("Mavro: std", std, u.getStandardDeviation(), 1E-14);
assertEquals("Mavro: mean", mean, u.getMean(), .00000000000001); assertEquals("Mavro: mean", mean, u.getMean(), 1E-14);
//loadStats("data/Michelso.txt"); loadStats("data/Michelso.txt", u);
//assertEquals("Michelso: std", std, u.getStandardDeviation(), .00000000000001); assertEquals("Michelso: std", std, u.getStandardDeviation(), 1E-13);
//assertEquals("Michelso: mean", mean, u.getMean(), .00000000000001); assertEquals("Michelso: mean", mean, u.getMean(), 1E-13);
loadStats("data/NumAcc1.txt", u); loadStats("data/NumAcc1.txt", u);
assertEquals("NumAcc1: std", std, u.getStandardDeviation(), .00000000000001); assertEquals("NumAcc1: std", std, u.getStandardDeviation(), 1E-14);
assertEquals("NumAcc1: mean", mean, u.getMean(), .00000000000001); assertEquals("NumAcc1: mean", mean, u.getMean(), 1E-14);
//loadStats("data/NumAcc2.txt"); loadStats("data/NumAcc2.txt", u);
//assertEquals("NumAcc2: std", std, u.getStandardDeviation(), .000000001); assertEquals("NumAcc2: std", std, u.getStandardDeviation(), 1E-14);
//assertEquals("NumAcc2: mean", mean, u.getMean(), .00000000000001); assertEquals("NumAcc2: mean", mean, u.getMean(), 1E-14);
} }
/** /**
* Test StorelessDescriptiveStatistics * Test DescriptiveStatistics - implementations that store full array of
* values and execute multi-pass algorithms
*/ */
public void testStoredUnivariateImpl() throws Exception { public void testDescriptiveStatistics() throws Exception {
DescriptiveStatistics u = DescriptiveStatistics.newInstance(); DescriptiveStatistics u = DescriptiveStatistics.newInstance();
loadStats("data/PiDigits.txt", u); loadStats("data/PiDigits.txt", u);
assertEquals("PiDigits: std", std, u.getStandardDeviation(), .0000000000001); assertEquals("PiDigits: std", std, u.getStandardDeviation(), 1E-14);
assertEquals("PiDigits: mean", mean, u.getMean(), .0000000000001); assertEquals("PiDigits: mean", mean, u.getMean(), 1E-14);
loadStats("data/Mavro.txt", u); loadStats("data/Mavro.txt", u);
assertEquals("Mavro: std", std, u.getStandardDeviation(), .00000000000001); assertEquals("Mavro: std", std, u.getStandardDeviation(), 1E-14);
assertEquals("Mavro: mean", mean, u.getMean(), .00000000000001); assertEquals("Mavro: mean", mean, u.getMean(), 1E-14);
//loadStats("data/Michelso.txt"); loadStats("data/Michelso.txt", u);
//assertEquals("Michelso: std", std, u.getStandardDeviation(), .00000000000001); assertEquals("Michelso: std", std, u.getStandardDeviation(), 1E-14);
//assertEquals("Michelso: mean", mean, u.getMean(), .00000000000001); assertEquals("Michelso: mean", mean, u.getMean(), 1E-14);
loadStats("data/NumAcc1.txt", u); loadStats("data/NumAcc1.txt", u);
assertEquals("NumAcc1: std", std, u.getStandardDeviation(), .00000000000001); assertEquals("NumAcc1: std", std, u.getStandardDeviation(), 1E-14);
assertEquals("NumAcc1: mean", mean, u.getMean(), .00000000000001); assertEquals("NumAcc1: mean", mean, u.getMean(), 1E-14);
//loadStats("data/NumAcc2.txt"); loadStats("data/NumAcc2.txt", u);
//assertEquals("NumAcc2: std", std, u.getStandardDeviation(), .000000001); assertEquals("NumAcc2: std", std, u.getStandardDeviation(), 1E-14);
//assertEquals("NumAcc2: mean", mean, u.getMean(), .00000000000001); assertEquals("NumAcc2: mean", mean, u.getMean(), 1E-14);
} }
/** /**

View File

@ -111,7 +111,12 @@ Commons Math Release Notes</title>
and SummaryStatistics concrete classes. Pushed implementations up and SummaryStatistics concrete classes. Pushed implementations up
from DescriptiveStatisticsImpl, SummaryStatisticsImpl. Made from DescriptiveStatisticsImpl, SummaryStatisticsImpl. Made
implementations of statistics configurable via setters. implementations of statistics configurable via setters.
</action> </action>
<action dev="psteitz" type="fix" issue="MATH-174">
Changed Mean.evaluate() to use a two-pass algorithm, improving accuracy
by exploiting the fact that this method has access to the full
array of data values.
</action>
</release> </release>
<release version="1.1" date="2005-12-17" <release version="1.1" date="2005-12-17"
description="This is a maintenance release containing bug fixes and enhancements. description="This is a maintenance release containing bug fixes and enhancements.