Changed Mean.evaluate() to use a two-pass algorithm, improving accuracy

by exploiting the fact that this method has access to the full
array of data values.


git-svn-id: https://svn.apache.org/repos/asf/commons/proper/math/trunk@602306 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Phil Steitz 2007-12-08 02:59:53 +00:00
parent 8583cdfe79
commit 458abe99c2
3 changed files with 64 additions and 39 deletions

View File

@ -22,24 +22,32 @@ import org.apache.commons.math.stat.descriptive.AbstractStorelessUnivariateStati
import org.apache.commons.math.stat.descriptive.summary.Sum;
/**
* Returns the arithmetic mean of the available values. Uses the definitional
* formula:
* <p>Computes the arithmetic mean of a set of values. Uses the definitional
* formula:</p>
* <p>
* mean = sum(x_i) / n
* <p>
* where <code>n</code> is the number of observations.
* <p>
* The value of the statistic is computed using the following recursive
* updating algorithm:
* <p>
* </p>
* <p>where <code>n</code> is the number of observations.
* </p>
* <p>When {@link #increment(double)} is used to add data incrementally from a
* stream of (unstored) values, the value of the statistic that
* {@link #getResult()} returns is computed using the following recursive
* updating algorithm: </p>
* <ol>
* <li>Initialize <code>m = </code> the first value</li>
* <li>For each additional value, update using <br>
* <code>m = m + (new value - m) / (number of observations)</code></li>
* </ol>
* <p> If {@link #evaluate(double[])} is used to compute the mean of an array
* of stored values, a two-pass, corrected algorithm is used, starting with
* the definitional formula computed using the array of stored values and then
* correcting this by adding the mean deviation of the data values from the
* arithmetic mean. See, e.g. "Comparison of Several Algorithms for Computing
* Sample Means and Variances," Robert F. Ling, Journal of the American
* Statistical Association, Vol. 69, No. 348 (Dec., 1974), pp. 859-866. </p>
* <p>
* Returns <code>Double.NaN</code> if the dataset is empty.
* <p>
* </p>
* <strong>Note that this implementation is not synchronized.</strong> If
* multiple threads access an instance of this class concurrently, and at least
* one of the threads invokes the <code>increment()</code> or
@ -131,7 +139,17 @@ public class Mean extends AbstractStorelessUnivariateStatistic
public double evaluate(final double[] values,final int begin, final int length) {
if (test(values, begin, length)) {
Sum sum = new Sum();
return sum.evaluate(values, begin, length) / ((double) length);
double sampleSize = (double) length;
// Compute initial estimate using definitional formula
double xbar = sum.evaluate(values, begin, length) / sampleSize;
// Compute correction factor in second pass
double correction = 0;
for (int i = begin; i < begin + length; i++) {
correction += (values[i] - xbar);
}
return xbar + (correction/sampleSize);
}
return Double.NaN;
}

View File

@ -61,57 +61,59 @@ public class CertifiedDataTest extends TestCase {
}
/**
* Test StorelessDescriptiveStatistics
* Test SummaryStatistics - implementations that do not store the data
* and use single pass algorithms to compute statistics
*/
public void testUnivariateImpl() throws Exception {
public void testSummaryStatistics() throws Exception {
SummaryStatistics u = SummaryStatistics.newInstance(SummaryStatisticsImpl.class);
loadStats("data/PiDigits.txt", u);
assertEquals("PiDigits: std", std, u.getStandardDeviation(), .0000000000001);
assertEquals("PiDigits: mean", mean, u.getMean(), .0000000000001);
assertEquals("PiDigits: std", std, u.getStandardDeviation(), 1E-13);
assertEquals("PiDigits: mean", mean, u.getMean(), 1E-13);
loadStats("data/Mavro.txt", u);
assertEquals("Mavro: std", std, u.getStandardDeviation(), .00000000000001);
assertEquals("Mavro: mean", mean, u.getMean(), .00000000000001);
assertEquals("Mavro: std", std, u.getStandardDeviation(), 1E-14);
assertEquals("Mavro: mean", mean, u.getMean(), 1E-14);
//loadStats("data/Michelso.txt");
//assertEquals("Michelso: std", std, u.getStandardDeviation(), .00000000000001);
//assertEquals("Michelso: mean", mean, u.getMean(), .00000000000001);
loadStats("data/Michelso.txt", u);
assertEquals("Michelso: std", std, u.getStandardDeviation(), 1E-13);
assertEquals("Michelso: mean", mean, u.getMean(), 1E-13);
loadStats("data/NumAcc1.txt", u);
assertEquals("NumAcc1: std", std, u.getStandardDeviation(), .00000000000001);
assertEquals("NumAcc1: mean", mean, u.getMean(), .00000000000001);
assertEquals("NumAcc1: std", std, u.getStandardDeviation(), 1E-14);
assertEquals("NumAcc1: mean", mean, u.getMean(), 1E-14);
//loadStats("data/NumAcc2.txt");
//assertEquals("NumAcc2: std", std, u.getStandardDeviation(), .000000001);
//assertEquals("NumAcc2: mean", mean, u.getMean(), .00000000000001);
loadStats("data/NumAcc2.txt", u);
assertEquals("NumAcc2: std", std, u.getStandardDeviation(), 1E-14);
assertEquals("NumAcc2: mean", mean, u.getMean(), 1E-14);
}
/**
* Test StorelessDescriptiveStatistics
* Test DescriptiveStatistics - implementations that store full array of
* values and execute multi-pass algorithms
*/
public void testStoredUnivariateImpl() throws Exception {
public void testDescriptiveStatistics() throws Exception {
DescriptiveStatistics u = DescriptiveStatistics.newInstance();
loadStats("data/PiDigits.txt", u);
assertEquals("PiDigits: std", std, u.getStandardDeviation(), .0000000000001);
assertEquals("PiDigits: mean", mean, u.getMean(), .0000000000001);
assertEquals("PiDigits: std", std, u.getStandardDeviation(), 1E-14);
assertEquals("PiDigits: mean", mean, u.getMean(), 1E-14);
loadStats("data/Mavro.txt", u);
assertEquals("Mavro: std", std, u.getStandardDeviation(), .00000000000001);
assertEquals("Mavro: mean", mean, u.getMean(), .00000000000001);
assertEquals("Mavro: std", std, u.getStandardDeviation(), 1E-14);
assertEquals("Mavro: mean", mean, u.getMean(), 1E-14);
//loadStats("data/Michelso.txt");
//assertEquals("Michelso: std", std, u.getStandardDeviation(), .00000000000001);
//assertEquals("Michelso: mean", mean, u.getMean(), .00000000000001);
loadStats("data/Michelso.txt", u);
assertEquals("Michelso: std", std, u.getStandardDeviation(), 1E-14);
assertEquals("Michelso: mean", mean, u.getMean(), 1E-14);
loadStats("data/NumAcc1.txt", u);
assertEquals("NumAcc1: std", std, u.getStandardDeviation(), .00000000000001);
assertEquals("NumAcc1: mean", mean, u.getMean(), .00000000000001);
assertEquals("NumAcc1: std", std, u.getStandardDeviation(), 1E-14);
assertEquals("NumAcc1: mean", mean, u.getMean(), 1E-14);
//loadStats("data/NumAcc2.txt");
//assertEquals("NumAcc2: std", std, u.getStandardDeviation(), .000000001);
//assertEquals("NumAcc2: mean", mean, u.getMean(), .00000000000001);
loadStats("data/NumAcc2.txt", u);
assertEquals("NumAcc2: std", std, u.getStandardDeviation(), 1E-14);
assertEquals("NumAcc2: mean", mean, u.getMean(), 1E-14);
}
/**

View File

@ -112,6 +112,11 @@ Commons Math Release Notes</title>
from DescriptiveStatisticsImpl, SummaryStatisticsImpl. Made
implementations of statistics configurable via setters.
</action>
<action dev="psteitz" type="fix" issue="MATH-174">
Changed Mean.evaluate() to use a two-pass algorithm, improving accuracy
by exploiting the fact that this method has access to the full
array of data values.
</action>
</release>
<release version="1.1" date="2005-12-17"
description="This is a maintenance release containing bug fixes and enhancements.