bug: getQuantiles() returns values that exceed max (#4744)

Fixes https://github.com/druid-io/druid/issues/3972
This commit is contained in:
Gino Ledesma 2017-09-26 10:43:56 -07:00 committed by Gian Merlino
parent bf8fd4c203
commit e60bc0cabc
2 changed files with 55 additions and 1 deletions

View File

@ -1546,6 +1546,20 @@ public class ApproximateHistogram
final long[] bins = this.bins(); final long[] bins = this.bins();
for (int j = 0; j < probabilities.length; ++j) { for (int j = 0; j < probabilities.length; ++j) {
// Adapted from Ben-Haiv/Tom-Tov (Algorithm 4: Uniform Procedure)
// Our histogram has a set of bins {(p1, m1), (p2, m2), ... (pB, mB)}
// and properties of min and max saved during construction, such
// that min <= p1 and pB <= max.
//
// When min < p1 or pB < max, these are special cases of using the
// dummy bins (p0, 0) or (pB+1, 0) respectively, where p0 == min
// and pB+1 == max.
//
// This histogram gives an ordered set of numbers {pi, pi+1, ..., pn},
// such that min <= pi < pn <= max, and n is the sum({m1, ..., mB}).
// We use s to determine which pairs of (pi, mi), (pi+1, mi+1) to
// calculate our quantile from by computing sum([pi,mi]) < s < sum
// ([pi+1, mi+1])
final double s = probabilities[j] * this.count(); final double s = probabilities[j] * this.count();
int i = 0; int i = 0;
@ -1564,6 +1578,7 @@ public class ApproximateHistogram
} }
if (i == 0) { if (i == 0) {
// At the first bin, there are no points to the left of p (min)
quantiles[j] = this.min(); quantiles[j] = this.min();
} else { } else {
final double d = s - sum; final double d = s - sum;
@ -1577,7 +1592,11 @@ public class ApproximateHistogram
z = (-b + Math.sqrt(b * b - 4 * a * c)) / (2 * a); z = (-b + Math.sqrt(b * b - 4 * a * c)) / (2 * a);
} }
final double uj = this.positions[i - 1] + (this.positions[i] - this.positions[i - 1]) * z; final double uj = this.positions[i - 1] + (this.positions[i] - this.positions[i - 1]) * z;
quantiles[j] = (float) uj; // A given bin (p, m) has m/2 points to the left and right of p, and
// uj could be one of those points. However, uj is still subject to:
// [min] (p0, 0) < uj < (p, m) or
// (p, m) < uj < (pB+1, 0) [max]
quantiles[j] = ((float) uj < this.max()) ? (float) uj : this.max();
} }
} }

View File

@ -48,6 +48,12 @@ public class ApproximateHistogramTest
static final float[] VALUES5 = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; static final float[] VALUES5 = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
static final float[] VALUES6 = {1f, 1.5f, 2f, 2.5f, 3f, 3.5f, 4f, 4.5f, 5f, 5.5f, 6f, 6.5f, 7f, 7.5f, 8f, 8.5f, 9f, 9.5f, 10f}; static final float[] VALUES6 = {1f, 1.5f, 2f, 2.5f, 3f, 3.5f, 4f, 4.5f, 5f, 5.5f, 6f, 6.5f, 7f, 7.5f, 8f, 8.5f, 9f, 9.5f, 10f};
// Based on the example from https://metamarkets.com/2013/histograms/
// This dataset can make getQuantiles() return values exceeding max
// for example: q=0.95 returns 25.16 when max=25
static final float[] VALUES7 = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 12, 12,
15, 20, 25, 25, 25};
protected ApproximateHistogram buildHistogram(int size, float[] values) protected ApproximateHistogram buildHistogram(int size, float[] values)
{ {
ApproximateHistogram h = new ApproximateHistogram(size); ApproximateHistogram h = new ApproximateHistogram(size);
@ -393,6 +399,35 @@ public class ApproximateHistogramTest
); );
} }
@Test
public void testQuantileBetweenMinMax()
{
ApproximateHistogram h = buildHistogram(20, VALUES7);
Assert.assertTrue(
"min value incorrect",
VALUES7[0] == h.min()
);
Assert.assertTrue(
"max value incorrect",
VALUES7[VALUES7.length - 1] == h.max()
);
Assert.assertArrayEquals(
"expected quantiles match actual quantiles",
new float[]{1.8f, 3.6f, 5.4f, 7.2f, 9f, 11.05f, 12.37f, 17f, 23.5f},
h.getQuantiles(new float[]{.1f, .2f, .3f, .4f, .5f, .6f, .7f, .8f,
.9f}), 0.1f
);
// Test for outliers (0.05f and 0.95f, which should be min <= value <= max)
Assert.assertArrayEquals(
"expected quantiles match actual quantiles",
new float[]{h.min(), h.max()},
h.getQuantiles(new float[]{.05f, .95f}), 0.1f
);
}
@Test @Test
public void testQuantileBigger() public void testQuantileBigger()
{ {