mirror of https://github.com/apache/druid.git
added number of bins parameter (#9436)
* added number of bins parameter * addressed review points * test equals Co-authored-by: AlexanderSaydakov <AlexanderSaydakov@users.noreply.github.com>
This commit is contained in:
parent
85dfbb64cb
commit
844d626738
|
@ -87,14 +87,15 @@ This returns an array of quantiles corresponding to a given array of fractions
|
|||
|
||||
#### Histogram
|
||||
|
||||
This returns an approximation to the histogram given an array of split points that define the histogram bins. An array of <i>m</i> unique, monotonically increasing split points divide the real number line into <i>m+1</i> consecutive disjoint intervals. The definition of an interval is inclusive of the left split point and exclusive of the right split point.
|
||||
This returns an approximation to the histogram given either an array of split points that define the histogram bins or a number of bins (not both). An array of <i>m</i> unique, monotonically increasing split points divides the real number line into <i>m+1</i> consecutive disjoint intervals. The definition of an interval is inclusive of the left split point and exclusive of the right split point. If the number of bins is specified instead of split points, the interval between the minimum and maximum values is divided into the given number of equally-spaced bins.
|
||||
|
||||
```json
|
||||
{
|
||||
"type" : "quantilesDoublesSketchToHistogram",
|
||||
"name": <output name>,
|
||||
"field" : <post aggregator that refers to a DoublesSketch (fieldAccess or another post aggregator)>,
|
||||
"splitPoints" : <array of split points>
|
||||
"splitPoints" : <array of split points (optional)>,
|
||||
"numBins" : <number of bins (optional, defaults to 10)>
|
||||
}
|
||||
```
|
||||
|
||||
|
|
|
@ -150,6 +150,11 @@
|
|||
<artifactId>jackson-jaxrs-smile-provider</artifactId>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>nl.jqno.equalsverifier</groupId>
|
||||
<artifactId>equalsverifier</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
|
||||
<!-- Test Dependencies -->
|
||||
<dependency>
|
||||
|
|
|
@ -20,6 +20,7 @@
|
|||
package org.apache.druid.query.aggregation.datasketches.quantiles;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonCreator;
|
||||
import com.fasterxml.jackson.annotation.JsonInclude;
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
import com.google.common.base.Preconditions;
|
||||
import org.apache.datasketches.quantiles.DoublesSketch;
|
||||
|
@ -29,6 +30,7 @@ import org.apache.druid.query.aggregation.AggregatorUtil;
|
|||
import org.apache.druid.query.aggregation.PostAggregator;
|
||||
import org.apache.druid.query.cache.CacheKeyBuilder;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.util.Arrays;
|
||||
import java.util.Comparator;
|
||||
import java.util.Map;
|
||||
|
@ -36,38 +38,63 @@ import java.util.Set;
|
|||
|
||||
public class DoublesSketchToHistogramPostAggregator implements PostAggregator
|
||||
{
|
||||
static final int DEFAULT_NUM_BINS = 10;
|
||||
|
||||
private final String name;
|
||||
private final PostAggregator field;
|
||||
private final double[] splitPoints;
|
||||
private final Integer numBins;
|
||||
|
||||
@JsonCreator
|
||||
public DoublesSketchToHistogramPostAggregator(
|
||||
@JsonProperty("name") final String name,
|
||||
@JsonProperty("field") final PostAggregator field,
|
||||
@JsonProperty("splitPoints") final double[] splitPoints)
|
||||
@JsonProperty("splitPoints") @Nullable final double[] splitPoints,
|
||||
@JsonProperty("numBins") @Nullable final Integer numBins)
|
||||
{
|
||||
this.name = Preconditions.checkNotNull(name, "name is null");
|
||||
this.field = Preconditions.checkNotNull(field, "field is null");
|
||||
this.splitPoints = Preconditions.checkNotNull(splitPoints, "array of split points is null");
|
||||
this.splitPoints = splitPoints;
|
||||
this.numBins = numBins;
|
||||
if (splitPoints != null && numBins != null) {
|
||||
throw new IAE("Cannot accept both 'splitPoints' and 'numBins'");
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object compute(final Map<String, Object> combinedAggregators)
|
||||
{
|
||||
final DoublesSketch sketch = (DoublesSketch) field.compute(combinedAggregators);
|
||||
final int numBins = splitPoints != null ? splitPoints.length + 1 :
|
||||
(this.numBins != null ? this.numBins.intValue() : DEFAULT_NUM_BINS);
|
||||
if (numBins < 2) {
|
||||
throw new IAE("at least 2 bins expected");
|
||||
}
|
||||
if (sketch.isEmpty()) {
|
||||
final double[] histogram = new double[splitPoints.length + 1];
|
||||
final double[] histogram = new double[numBins];
|
||||
Arrays.fill(histogram, Double.NaN);
|
||||
return histogram;
|
||||
}
|
||||
final double[] histogram = sketch.getPMF(splitPoints);
|
||||
final double[] histogram = sketch.getPMF(splitPoints != null ? splitPoints :
|
||||
equallySpacedPoints(numBins, sketch.getMinValue(), sketch.getMaxValue()));
|
||||
for (int i = 0; i < histogram.length; i++) {
|
||||
histogram[i] *= sketch.getN();
|
||||
histogram[i] *= sketch.getN(); // scale fractions to counts
|
||||
}
|
||||
return histogram;
|
||||
}
|
||||
|
||||
// returns num-1 points that split the interval [min, max] into num equally-spaced intervals
|
||||
// num must be at least 2
|
||||
private static double[] equallySpacedPoints(final int num, final double min, final double max)
|
||||
{
|
||||
final double[] points = new double[num - 1];
|
||||
final double delta = (max - min) / num;
|
||||
for (int i = 0; i < num - 1; i++) {
|
||||
points[i] = min + delta * (i + 1);
|
||||
}
|
||||
return points;
|
||||
}
|
||||
|
||||
@Override
|
||||
@JsonProperty
|
||||
public String getName()
|
||||
|
@ -82,11 +109,19 @@ public class DoublesSketchToHistogramPostAggregator implements PostAggregator
|
|||
}
|
||||
|
||||
@JsonProperty
|
||||
@JsonInclude(JsonInclude.Include.NON_NULL)
|
||||
public double[] getSplitPoints()
|
||||
{
|
||||
return splitPoints;
|
||||
}
|
||||
|
||||
@JsonProperty
|
||||
@JsonInclude(JsonInclude.Include.NON_NULL)
|
||||
public Integer getNumBins()
|
||||
{
|
||||
return numBins;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Comparator<double[]> getComparator()
|
||||
{
|
||||
|
@ -106,6 +141,7 @@ public class DoublesSketchToHistogramPostAggregator implements PostAggregator
|
|||
"name='" + name + '\'' +
|
||||
", field=" + field +
|
||||
", splitPoints=" + Arrays.toString(splitPoints) +
|
||||
", numBins=" + numBins +
|
||||
"}";
|
||||
}
|
||||
|
||||
|
@ -125,7 +161,16 @@ public class DoublesSketchToHistogramPostAggregator implements PostAggregator
|
|||
if (!Arrays.equals(splitPoints, that.splitPoints)) {
|
||||
return false;
|
||||
}
|
||||
return field.equals(that.field);
|
||||
if (!field.equals(that.field)) {
|
||||
return false;
|
||||
}
|
||||
if (numBins == null && that.numBins == null) {
|
||||
return true;
|
||||
}
|
||||
if (numBins != null && numBins.equals(that.numBins)) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -133,6 +178,9 @@ public class DoublesSketchToHistogramPostAggregator implements PostAggregator
|
|||
{
|
||||
int hashCode = name.hashCode() * 31 + field.hashCode();
|
||||
hashCode = hashCode * 31 + Arrays.hashCode(splitPoints);
|
||||
if (numBins != null) {
|
||||
hashCode = hashCode * 31 + numBins.hashCode();
|
||||
}
|
||||
return hashCode;
|
||||
}
|
||||
|
||||
|
@ -141,8 +189,13 @@ public class DoublesSketchToHistogramPostAggregator implements PostAggregator
|
|||
{
|
||||
final CacheKeyBuilder builder = new CacheKeyBuilder(
|
||||
AggregatorUtil.QUANTILES_DOUBLES_SKETCH_TO_HISTOGRAM_CACHE_TYPE_ID).appendCacheable(field);
|
||||
for (final double value : splitPoints) {
|
||||
builder.appendDouble(value);
|
||||
if (splitPoints != null) {
|
||||
for (final double value : splitPoints) {
|
||||
builder.appendDouble(value);
|
||||
}
|
||||
}
|
||||
if (numBins != null) {
|
||||
builder.appendInt(numBins);
|
||||
}
|
||||
return builder.build();
|
||||
}
|
||||
|
|
|
@ -33,12 +33,13 @@ public class DoublesSketchToHistogramOperatorConversion extends DoublesSketchLis
|
|||
}
|
||||
|
||||
@Override
|
||||
public PostAggregator makePostAgg(String name, PostAggregator field, double[] args)
|
||||
public PostAggregator makePostAgg(String name, PostAggregator field, double[] points)
|
||||
{
|
||||
return new DoublesSketchToHistogramPostAggregator(
|
||||
name,
|
||||
field,
|
||||
args
|
||||
points,
|
||||
null
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
|
||||
package org.apache.druid.query.aggregation.datasketches.quantiles;
|
||||
|
||||
import nl.jqno.equalsverifier.EqualsVerifier;
|
||||
import org.apache.druid.query.aggregation.Aggregator;
|
||||
import org.apache.druid.query.aggregation.PostAggregator;
|
||||
import org.apache.druid.query.aggregation.TestDoubleColumnSelectorImpl;
|
||||
|
@ -43,7 +44,8 @@ public class DoublesSketchToHistogramPostAggregatorTest
|
|||
final PostAggregator postAgg = new DoublesSketchToHistogramPostAggregator(
|
||||
"histogram",
|
||||
new FieldAccessPostAggregator("field", "sketch"),
|
||||
new double[] {3.5}
|
||||
new double[] {3.5},
|
||||
null
|
||||
);
|
||||
|
||||
final double[] histogram = (double[]) postAgg.compute(fields);
|
||||
|
@ -54,7 +56,7 @@ public class DoublesSketchToHistogramPostAggregatorTest
|
|||
}
|
||||
|
||||
@Test
|
||||
public void normalCase()
|
||||
public void splitPoints()
|
||||
{
|
||||
final double[] values = new double[] {1, 2, 3, 4, 5, 6};
|
||||
final TestDoubleColumnSelectorImpl selector = new TestDoubleColumnSelectorImpl(values);
|
||||
|
@ -72,7 +74,8 @@ public class DoublesSketchToHistogramPostAggregatorTest
|
|||
final PostAggregator postAgg = new DoublesSketchToHistogramPostAggregator(
|
||||
"histogram",
|
||||
new FieldAccessPostAggregator("field", "sketch"),
|
||||
new double[] {3.5} // splits distribution in two buckets of equal mass
|
||||
new double[] {3.5}, // splits distribution into two bins of equal mass
|
||||
null
|
||||
);
|
||||
|
||||
final double[] histogram = (double[]) postAgg.compute(fields);
|
||||
|
@ -81,4 +84,42 @@ public class DoublesSketchToHistogramPostAggregatorTest
|
|||
Assert.assertEquals(3.0, histogram[0], 0);
|
||||
Assert.assertEquals(3.0, histogram[1], 0);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void numBins()
|
||||
{
|
||||
final double[] values = new double[] {1, 2, 3, 4, 5, 6};
|
||||
final TestDoubleColumnSelectorImpl selector = new TestDoubleColumnSelectorImpl(values);
|
||||
|
||||
final Aggregator agg = new DoublesSketchBuildAggregator(selector, 8);
|
||||
//noinspection ForLoopReplaceableByForEach
|
||||
for (int i = 0; i < values.length; i++) {
|
||||
agg.aggregate();
|
||||
selector.increment();
|
||||
}
|
||||
|
||||
final Map<String, Object> fields = new HashMap<>();
|
||||
fields.put("sketch", agg.get());
|
||||
|
||||
final PostAggregator postAgg = new DoublesSketchToHistogramPostAggregator(
|
||||
"histogram",
|
||||
new FieldAccessPostAggregator("field", "sketch"),
|
||||
null,
|
||||
2 // two bins of equal mass
|
||||
);
|
||||
|
||||
final double[] histogram = (double[]) postAgg.compute(fields);
|
||||
Assert.assertNotNull(histogram);
|
||||
Assert.assertEquals(2, histogram.length);
|
||||
Assert.assertEquals(3.0, histogram[0], 0);
|
||||
Assert.assertEquals(3.0, histogram[1], 0);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testEquals()
|
||||
{
|
||||
EqualsVerifier.forClass(DoublesSketchToHistogramPostAggregator.class)
|
||||
.usingGetClass()
|
||||
.verify();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -655,7 +655,8 @@ public class DoublesSketchSqlAggregatorTest extends CalciteTestBase
|
|||
"p12",
|
||||
"a2:agg"
|
||||
),
|
||||
new double[]{0.2d, 0.6d}
|
||||
new double[]{0.2d, 0.6d},
|
||||
null
|
||||
),
|
||||
new DoublesSketchToRankPostAggregator(
|
||||
"p15",
|
||||
|
|
Loading…
Reference in New Issue