HADOOP-13804. MutableStat mean loses accuracy if add(long, long) is used. Contributed by Erik Krogen.

(cherry picked from commit 3dbad5d823)
(cherry picked from commit b245e9ce2f)
This commit is contained in:
Zhe Zhang 2016-11-07 16:08:10 -08:00
parent 5231c527aa
commit 384b7b71a3
3 changed files with 36 additions and 4 deletions

View File

@ -99,6 +99,10 @@ public class MutableStat extends MutableMetric {
/** /**
* Add a number of samples and their sum to the running stat * Add a number of samples and their sum to the running stat
*
* Note that although use of this method will preserve accurate mean values,
* large values for numSamples may result in inaccurate variance values due
* to the use of a single step of the Welford method for variance calculation.
* @param numSamples number of samples * @param numSamples number of samples
* @param sum of the samples * @param sum of the samples
*/ */

View File

@ -27,29 +27,32 @@ import org.apache.hadoop.classification.InterfaceAudience;
public class SampleStat { public class SampleStat {
private final MinMax minmax = new MinMax(); private final MinMax minmax = new MinMax();
private long numSamples = 0; private long numSamples = 0;
private double a0, a1, s0, s1; private double a0, a1, s0, s1, total;
/** /**
* Construct a new running sample stat * Construct a new running sample stat
*/ */
public SampleStat() { public SampleStat() {
a0 = s0 = 0.0; a0 = s0 = 0.0;
total = 0.0;
} }
public void reset() { public void reset() {
numSamples = 0; numSamples = 0;
a0 = s0 = 0.0; a0 = s0 = 0.0;
total = 0.0;
minmax.reset(); minmax.reset();
} }
// We want to reuse the object, sometimes. // We want to reuse the object, sometimes.
void reset(long numSamples, double a0, double a1, double s0, double s1, void reset(long numSamples, double a0, double a1, double s0, double s1,
MinMax minmax) { double total, MinMax minmax) {
this.numSamples = numSamples; this.numSamples = numSamples;
this.a0 = a0; this.a0 = a0;
this.a1 = a1; this.a1 = a1;
this.s0 = s0; this.s0 = s0;
this.s1 = s1; this.s1 = s1;
this.total = total;
this.minmax.reset(minmax); this.minmax.reset(minmax);
} }
@ -58,7 +61,7 @@ public class SampleStat {
* @param other the destination to hold our values * @param other the destination to hold our values
*/ */
public void copyTo(SampleStat other) { public void copyTo(SampleStat other) {
other.reset(numSamples, a0, a1, s0, s1, minmax); other.reset(numSamples, a0, a1, s0, s1, total, minmax);
} }
/** /**
@ -80,6 +83,7 @@ public class SampleStat {
*/ */
public SampleStat add(long nSamples, double x) { public SampleStat add(long nSamples, double x) {
numSamples += nSamples; numSamples += nSamples;
total += x;
if (numSamples == 1) { if (numSamples == 1) {
a0 = a1 = x; a0 = a1 = x;
@ -102,11 +106,18 @@ public class SampleStat {
return numSamples; return numSamples;
} }
/**
* @return the total of all samples added
*/
public double total() {
return total;
}
/** /**
* @return the arithmetic mean of the samples * @return the arithmetic mean of the samples
*/ */
public double mean() { public double mean() {
return numSamples > 0 ? a1 : 0.0; return numSamples > 0 ? (total / numSamples) : 0.0;
} }
/** /**

View File

@ -115,6 +115,23 @@ public class TestMutableMetrics {
assertGauge("BarAvgTime", 0.0, rb); assertGauge("BarAvgTime", 0.0, rb);
} }
/**
* Tests that when using {@link MutableStat#add(long, long)}, even with a high
* sample count, the mean does not lose accuracy.
*/
@Test public void testMutableStatWithBulkAdd() {
MetricsRecordBuilder rb = mockMetricsRecordBuilder();
MetricsRegistry registry = new MetricsRegistry("test");
MutableStat stat = registry.newStat("Test", "Test", "Ops", "Val", false);
stat.add(1000, 1000);
stat.add(1000, 2000);
registry.snapshot(rb, false);
assertCounter("TestNumOps", 2000L, rb);
assertGauge("TestAvgVal", 1.5, rb);
}
/** /**
* Ensure that quantile estimates from {@link MutableQuantiles} are within * Ensure that quantile estimates from {@link MutableQuantiles} are within
* specified error bounds. * specified error bounds.