mirror of https://github.com/apache/druid.git
bug: getQuantiles() returns values that exceed max (#4744)
Fixes https://github.com/druid-io/druid/issues/3972
This commit is contained in:
parent
bf8fd4c203
commit
e60bc0cabc
|
@ -1546,6 +1546,20 @@ public class ApproximateHistogram
|
|||
final long[] bins = this.bins();
|
||||
|
||||
for (int j = 0; j < probabilities.length; ++j) {
|
||||
// Adapted from Ben-Haiv/Tom-Tov (Algorithm 4: Uniform Procedure)
|
||||
// Our histogram has a set of bins {(p1, m1), (p2, m2), ... (pB, mB)}
|
||||
// and properties of min and max saved during construction, such
|
||||
// that min <= p1 and pB <= max.
|
||||
//
|
||||
// When min < p1 or pB < max, these are special cases of using the
|
||||
// dummy bins (p0, 0) or (pB+1, 0) respectively, where p0 == min
|
||||
// and pB+1 == max.
|
||||
//
|
||||
// This histogram gives an ordered set of numbers {pi, pi+1, ..., pn},
|
||||
// such that min <= pi < pn <= max, and n is the sum({m1, ..., mB}).
|
||||
// We use s to determine which pairs of (pi, mi), (pi+1, mi+1) to
|
||||
// calculate our quantile from by computing sum([pi,mi]) < s < sum
|
||||
// ([pi+1, mi+1])
|
||||
final double s = probabilities[j] * this.count();
|
||||
|
||||
int i = 0;
|
||||
|
@ -1564,6 +1578,7 @@ public class ApproximateHistogram
|
|||
}
|
||||
|
||||
if (i == 0) {
|
||||
// At the first bin, there are no points to the left of p (min)
|
||||
quantiles[j] = this.min();
|
||||
} else {
|
||||
final double d = s - sum;
|
||||
|
@ -1577,7 +1592,11 @@ public class ApproximateHistogram
|
|||
z = (-b + Math.sqrt(b * b - 4 * a * c)) / (2 * a);
|
||||
}
|
||||
final double uj = this.positions[i - 1] + (this.positions[i] - this.positions[i - 1]) * z;
|
||||
quantiles[j] = (float) uj;
|
||||
// A given bin (p, m) has m/2 points to the left and right of p, and
|
||||
// uj could be one of those points. However, uj is still subject to:
|
||||
// [min] (p0, 0) < uj < (p, m) or
|
||||
// (p, m) < uj < (pB+1, 0) [max]
|
||||
quantiles[j] = ((float) uj < this.max()) ? (float) uj : this.max();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -48,6 +48,12 @@ public class ApproximateHistogramTest
|
|||
static final float[] VALUES5 = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
|
||||
static final float[] VALUES6 = {1f, 1.5f, 2f, 2.5f, 3f, 3.5f, 4f, 4.5f, 5f, 5.5f, 6f, 6.5f, 7f, 7.5f, 8f, 8.5f, 9f, 9.5f, 10f};
|
||||
|
||||
// Based on the example from https://metamarkets.com/2013/histograms/
|
||||
// This dataset can make getQuantiles() return values exceeding max
|
||||
// for example: q=0.95 returns 25.16 when max=25
|
||||
static final float[] VALUES7 = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 12, 12,
|
||||
15, 20, 25, 25, 25};
|
||||
|
||||
protected ApproximateHistogram buildHistogram(int size, float[] values)
|
||||
{
|
||||
ApproximateHistogram h = new ApproximateHistogram(size);
|
||||
|
@ -393,6 +399,35 @@ public class ApproximateHistogramTest
|
|||
);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testQuantileBetweenMinMax()
|
||||
{
|
||||
ApproximateHistogram h = buildHistogram(20, VALUES7);
|
||||
|
||||
Assert.assertTrue(
|
||||
"min value incorrect",
|
||||
VALUES7[0] == h.min()
|
||||
);
|
||||
Assert.assertTrue(
|
||||
"max value incorrect",
|
||||
VALUES7[VALUES7.length - 1] == h.max()
|
||||
);
|
||||
|
||||
Assert.assertArrayEquals(
|
||||
"expected quantiles match actual quantiles",
|
||||
new float[]{1.8f, 3.6f, 5.4f, 7.2f, 9f, 11.05f, 12.37f, 17f, 23.5f},
|
||||
h.getQuantiles(new float[]{.1f, .2f, .3f, .4f, .5f, .6f, .7f, .8f,
|
||||
.9f}), 0.1f
|
||||
);
|
||||
|
||||
// Test for outliers (0.05f and 0.95f, which should be min <= value <= max)
|
||||
Assert.assertArrayEquals(
|
||||
"expected quantiles match actual quantiles",
|
||||
new float[]{h.min(), h.max()},
|
||||
h.getQuantiles(new float[]{.05f, .95f}), 0.1f
|
||||
);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testQuantileBigger()
|
||||
{
|
||||
|
|
Loading…
Reference in New Issue