mirror of https://github.com/apache/druid.git
bug: getQuantiles() returns values that exceed max (#4744)
Fixes https://github.com/druid-io/druid/issues/3972
This commit is contained in:
parent
bf8fd4c203
commit
e60bc0cabc
|
@ -1546,6 +1546,20 @@ public class ApproximateHistogram
|
||||||
final long[] bins = this.bins();
|
final long[] bins = this.bins();
|
||||||
|
|
||||||
for (int j = 0; j < probabilities.length; ++j) {
|
for (int j = 0; j < probabilities.length; ++j) {
|
||||||
|
// Adapted from Ben-Haiv/Tom-Tov (Algorithm 4: Uniform Procedure)
|
||||||
|
// Our histogram has a set of bins {(p1, m1), (p2, m2), ... (pB, mB)}
|
||||||
|
// and properties of min and max saved during construction, such
|
||||||
|
// that min <= p1 and pB <= max.
|
||||||
|
//
|
||||||
|
// When min < p1 or pB < max, these are special cases of using the
|
||||||
|
// dummy bins (p0, 0) or (pB+1, 0) respectively, where p0 == min
|
||||||
|
// and pB+1 == max.
|
||||||
|
//
|
||||||
|
// This histogram gives an ordered set of numbers {pi, pi+1, ..., pn},
|
||||||
|
// such that min <= pi < pn <= max, and n is the sum({m1, ..., mB}).
|
||||||
|
// We use s to determine which pairs of (pi, mi), (pi+1, mi+1) to
|
||||||
|
// calculate our quantile from by computing sum([pi,mi]) < s < sum
|
||||||
|
// ([pi+1, mi+1])
|
||||||
final double s = probabilities[j] * this.count();
|
final double s = probabilities[j] * this.count();
|
||||||
|
|
||||||
int i = 0;
|
int i = 0;
|
||||||
|
@ -1564,6 +1578,7 @@ public class ApproximateHistogram
|
||||||
}
|
}
|
||||||
|
|
||||||
if (i == 0) {
|
if (i == 0) {
|
||||||
|
// At the first bin, there are no points to the left of p (min)
|
||||||
quantiles[j] = this.min();
|
quantiles[j] = this.min();
|
||||||
} else {
|
} else {
|
||||||
final double d = s - sum;
|
final double d = s - sum;
|
||||||
|
@ -1577,7 +1592,11 @@ public class ApproximateHistogram
|
||||||
z = (-b + Math.sqrt(b * b - 4 * a * c)) / (2 * a);
|
z = (-b + Math.sqrt(b * b - 4 * a * c)) / (2 * a);
|
||||||
}
|
}
|
||||||
final double uj = this.positions[i - 1] + (this.positions[i] - this.positions[i - 1]) * z;
|
final double uj = this.positions[i - 1] + (this.positions[i] - this.positions[i - 1]) * z;
|
||||||
quantiles[j] = (float) uj;
|
// A given bin (p, m) has m/2 points to the left and right of p, and
|
||||||
|
// uj could be one of those points. However, uj is still subject to:
|
||||||
|
// [min] (p0, 0) < uj < (p, m) or
|
||||||
|
// (p, m) < uj < (pB+1, 0) [max]
|
||||||
|
quantiles[j] = ((float) uj < this.max()) ? (float) uj : this.max();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -48,6 +48,12 @@ public class ApproximateHistogramTest
|
||||||
static final float[] VALUES5 = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
|
static final float[] VALUES5 = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
|
||||||
static final float[] VALUES6 = {1f, 1.5f, 2f, 2.5f, 3f, 3.5f, 4f, 4.5f, 5f, 5.5f, 6f, 6.5f, 7f, 7.5f, 8f, 8.5f, 9f, 9.5f, 10f};
|
static final float[] VALUES6 = {1f, 1.5f, 2f, 2.5f, 3f, 3.5f, 4f, 4.5f, 5f, 5.5f, 6f, 6.5f, 7f, 7.5f, 8f, 8.5f, 9f, 9.5f, 10f};
|
||||||
|
|
||||||
|
// Based on the example from https://metamarkets.com/2013/histograms/
|
||||||
|
// This dataset can make getQuantiles() return values exceeding max
|
||||||
|
// for example: q=0.95 returns 25.16 when max=25
|
||||||
|
static final float[] VALUES7 = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 12, 12,
|
||||||
|
15, 20, 25, 25, 25};
|
||||||
|
|
||||||
protected ApproximateHistogram buildHistogram(int size, float[] values)
|
protected ApproximateHistogram buildHistogram(int size, float[] values)
|
||||||
{
|
{
|
||||||
ApproximateHistogram h = new ApproximateHistogram(size);
|
ApproximateHistogram h = new ApproximateHistogram(size);
|
||||||
|
@ -393,6 +399,35 @@ public class ApproximateHistogramTest
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testQuantileBetweenMinMax()
|
||||||
|
{
|
||||||
|
ApproximateHistogram h = buildHistogram(20, VALUES7);
|
||||||
|
|
||||||
|
Assert.assertTrue(
|
||||||
|
"min value incorrect",
|
||||||
|
VALUES7[0] == h.min()
|
||||||
|
);
|
||||||
|
Assert.assertTrue(
|
||||||
|
"max value incorrect",
|
||||||
|
VALUES7[VALUES7.length - 1] == h.max()
|
||||||
|
);
|
||||||
|
|
||||||
|
Assert.assertArrayEquals(
|
||||||
|
"expected quantiles match actual quantiles",
|
||||||
|
new float[]{1.8f, 3.6f, 5.4f, 7.2f, 9f, 11.05f, 12.37f, 17f, 23.5f},
|
||||||
|
h.getQuantiles(new float[]{.1f, .2f, .3f, .4f, .5f, .6f, .7f, .8f,
|
||||||
|
.9f}), 0.1f
|
||||||
|
);
|
||||||
|
|
||||||
|
// Test for outliers (0.05f and 0.95f, which should be min <= value <= max)
|
||||||
|
Assert.assertArrayEquals(
|
||||||
|
"expected quantiles match actual quantiles",
|
||||||
|
new float[]{h.min(), h.max()},
|
||||||
|
h.getQuantiles(new float[]{.05f, .95f}), 0.1f
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testQuantileBigger()
|
public void testQuantileBigger()
|
||||||
{
|
{
|
||||||
|
|
Loading…
Reference in New Issue