From 844d62673888899a4b7be8b50929adcf057e7808 Mon Sep 17 00:00:00 2001 From: Alexander Saydakov <13126686+AlexanderSaydakov@users.noreply.github.com> Date: Mon, 4 May 2020 16:53:09 -0700 Subject: [PATCH] added number of bins parameter (#9436) * added number of bins parameter * addressed review points * test equals Co-authored-by: AlexanderSaydakov --- .../extensions-core/datasketches-quantiles.md | 5 +- extensions-core/datasketches/pom.xml | 5 ++ ...oublesSketchToHistogramPostAggregator.java | 69 ++++++++++++++++--- ...esSketchToHistogramOperatorConversion.java | 5 +- ...esSketchToHistogramPostAggregatorTest.java | 47 ++++++++++++- .../sql/DoublesSketchSqlAggregatorTest.java | 3 +- 6 files changed, 118 insertions(+), 16 deletions(-) diff --git a/docs/development/extensions-core/datasketches-quantiles.md b/docs/development/extensions-core/datasketches-quantiles.md index 88b406c4503..775b9132204 100644 --- a/docs/development/extensions-core/datasketches-quantiles.md +++ b/docs/development/extensions-core/datasketches-quantiles.md @@ -87,14 +87,15 @@ This returns an array of quantiles corresponding to a given array of fractions #### Histogram -This returns an approximation to the histogram given an array of split points that define the histogram bins. An array of m unique, monotonically increasing split points divide the real number line into m+1 consecutive disjoint intervals. The definition of an interval is inclusive of the left split point and exclusive of the right split point. +This returns an approximation to the histogram given an array of split points that define the histogram bins or a number of bins (not both). An array of m unique, monotonically increasing split points divide the real number line into m+1 consecutive disjoint intervals. The definition of an interval is inclusive of the left split point and exclusive of the right split point. 
If the number of bins is specified instead of split points, the interval between the minimum and maximum values is divided into the given number of equally-spaced bins. ```json { "type" : "quantilesDoublesSketchToHistogram", "name": <output name>, "field" : <post aggregator that refers to a DoublesSketch (fieldAccess or another post aggregator)>, - "splitPoints" : <array of split points> + "splitPoints" : <array of split points (optional)>, + "numBins" : <number of bins (optional, defaults to 10)> } ``` diff --git a/extensions-core/datasketches/pom.xml b/extensions-core/datasketches/pom.xml index 42df8aaeefa..48aae4cef47 100644 --- a/extensions-core/datasketches/pom.xml +++ b/extensions-core/datasketches/pom.xml @@ -150,6 +150,11 @@ jackson-jaxrs-smile-provider provided + + nl.jqno.equalsverifier + equalsverifier + test + + diff --git a/extensions-core/datasketches/src/main/java/org/apache/druid/query/aggregation/datasketches/quantiles/DoublesSketchToHistogramPostAggregator.java b/extensions-core/datasketches/src/main/java/org/apache/druid/query/aggregation/datasketches/quantiles/DoublesSketchToHistogramPostAggregator.java index eb135a2656d..16a235c6656 100644 --- a/extensions-core/datasketches/src/main/java/org/apache/druid/query/aggregation/datasketches/quantiles/DoublesSketchToHistogramPostAggregator.java +++ b/extensions-core/datasketches/src/main/java/org/apache/druid/query/aggregation/datasketches/quantiles/DoublesSketchToHistogramPostAggregator.java @@ -20,6 +20,7 @@ package org.apache.druid.query.aggregation.datasketches.quantiles; import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonInclude; import com.fasterxml.jackson.annotation.JsonProperty; import com.google.common.base.Preconditions; import org.apache.datasketches.quantiles.DoublesSketch; @@ -29,6 +30,7 @@ import org.apache.druid.query.aggregation.AggregatorUtil; import org.apache.druid.query.aggregation.PostAggregator; import org.apache.druid.query.cache.CacheKeyBuilder; +import javax.annotation.Nullable; import java.util.Arrays; import java.util.Comparator; import java.util.Map; @@ -36,38 +38,63 @@ import java.util.Set; public class 
DoublesSketchToHistogramPostAggregator implements PostAggregator { + static final int DEFAULT_NUM_BINS = 10; private final String name; private final PostAggregator field; private final double[] splitPoints; + private final Integer numBins; @JsonCreator public DoublesSketchToHistogramPostAggregator( @JsonProperty("name") final String name, @JsonProperty("field") final PostAggregator field, - @JsonProperty("splitPoints") final double[] splitPoints) + @JsonProperty("splitPoints") @Nullable final double[] splitPoints, + @JsonProperty("numBins") @Nullable final Integer numBins) { this.name = Preconditions.checkNotNull(name, "name is null"); this.field = Preconditions.checkNotNull(field, "field is null"); - this.splitPoints = Preconditions.checkNotNull(splitPoints, "array of split points is null"); + this.splitPoints = splitPoints; + this.numBins = numBins; + if (splitPoints != null && numBins != null) { + throw new IAE("Cannot accept both 'splitPoints' and 'numBins'"); + } } @Override public Object compute(final Map combinedAggregators) { final DoublesSketch sketch = (DoublesSketch) field.compute(combinedAggregators); + final int numBins = splitPoints != null ? splitPoints.length + 1 : + (this.numBins != null ? this.numBins.intValue() : DEFAULT_NUM_BINS); + if (numBins < 2) { + throw new IAE("at least 2 bins expected"); + } if (sketch.isEmpty()) { - final double[] histogram = new double[splitPoints.length + 1]; + final double[] histogram = new double[numBins]; Arrays.fill(histogram, Double.NaN); return histogram; } - final double[] histogram = sketch.getPMF(splitPoints); + final double[] histogram = sketch.getPMF(splitPoints != null ? 
splitPoints : + equallySpacedPoints(numBins, sketch.getMinValue(), sketch.getMaxValue())); for (int i = 0; i < histogram.length; i++) { - histogram[i] *= sketch.getN(); + histogram[i] *= sketch.getN(); // scale fractions to counts } return histogram; } + // returns num-1 points that split the interval [min, max] into num equally-spaced intervals + // num must be at least 2 + private static double[] equallySpacedPoints(final int num, final double min, final double max) + { + final double[] points = new double[num - 1]; + final double delta = (max - min) / num; + for (int i = 0; i < num - 1; i++) { + points[i] = min + delta * (i + 1); + } + return points; + } + @Override @JsonProperty public String getName() @@ -82,11 +109,19 @@ public class DoublesSketchToHistogramPostAggregator implements PostAggregator } @JsonProperty + @JsonInclude(JsonInclude.Include.NON_NULL) public double[] getSplitPoints() { return splitPoints; } + @JsonProperty + @JsonInclude(JsonInclude.Include.NON_NULL) + public Integer getNumBins() + { + return numBins; + } + @Override public Comparator getComparator() { @@ -106,6 +141,7 @@ public class DoublesSketchToHistogramPostAggregator implements PostAggregator "name='" + name + '\'' + ", field=" + field + ", splitPoints=" + Arrays.toString(splitPoints) + + ", numBins=" + numBins + "}"; } @@ -125,7 +161,16 @@ public class DoublesSketchToHistogramPostAggregator implements PostAggregator if (!Arrays.equals(splitPoints, that.splitPoints)) { return false; } - return field.equals(that.field); + if (!field.equals(that.field)) { + return false; + } + if (numBins == null && that.numBins == null) { + return true; + } + if (numBins != null && numBins.equals(that.numBins)) { + return true; + } + return false; } @Override @@ -133,6 +178,9 @@ public class DoublesSketchToHistogramPostAggregator implements PostAggregator { int hashCode = name.hashCode() * 31 + field.hashCode(); hashCode = hashCode * 31 + Arrays.hashCode(splitPoints); + if (numBins != null) { + 
hashCode = hashCode * 31 + numBins.hashCode(); + } return hashCode; } @@ -141,8 +189,13 @@ public class DoublesSketchToHistogramPostAggregator implements PostAggregator { final CacheKeyBuilder builder = new CacheKeyBuilder( AggregatorUtil.QUANTILES_DOUBLES_SKETCH_TO_HISTOGRAM_CACHE_TYPE_ID).appendCacheable(field); - for (final double value : splitPoints) { - builder.appendDouble(value); + if (splitPoints != null) { + for (final double value : splitPoints) { + builder.appendDouble(value); + } + } + if (numBins != null) { + builder.appendInt(numBins); } return builder.build(); } diff --git a/extensions-core/datasketches/src/main/java/org/apache/druid/query/aggregation/datasketches/quantiles/sql/DoublesSketchToHistogramOperatorConversion.java b/extensions-core/datasketches/src/main/java/org/apache/druid/query/aggregation/datasketches/quantiles/sql/DoublesSketchToHistogramOperatorConversion.java index a7e73c9e460..a6b30992b88 100644 --- a/extensions-core/datasketches/src/main/java/org/apache/druid/query/aggregation/datasketches/quantiles/sql/DoublesSketchToHistogramOperatorConversion.java +++ b/extensions-core/datasketches/src/main/java/org/apache/druid/query/aggregation/datasketches/quantiles/sql/DoublesSketchToHistogramOperatorConversion.java @@ -33,12 +33,13 @@ public class DoublesSketchToHistogramOperatorConversion extends DoublesSketchLis } @Override - public PostAggregator makePostAgg(String name, PostAggregator field, double[] args) + public PostAggregator makePostAgg(String name, PostAggregator field, double[] points) { return new DoublesSketchToHistogramPostAggregator( name, field, - args + points, + null ); } } diff --git a/extensions-core/datasketches/src/test/java/org/apache/druid/query/aggregation/datasketches/quantiles/DoublesSketchToHistogramPostAggregatorTest.java b/extensions-core/datasketches/src/test/java/org/apache/druid/query/aggregation/datasketches/quantiles/DoublesSketchToHistogramPostAggregatorTest.java index b5aeec985fb..9b33c8a0ef6 100644 --- 
a/extensions-core/datasketches/src/test/java/org/apache/druid/query/aggregation/datasketches/quantiles/DoublesSketchToHistogramPostAggregatorTest.java +++ b/extensions-core/datasketches/src/test/java/org/apache/druid/query/aggregation/datasketches/quantiles/DoublesSketchToHistogramPostAggregatorTest.java @@ -19,6 +19,7 @@ package org.apache.druid.query.aggregation.datasketches.quantiles; +import nl.jqno.equalsverifier.EqualsVerifier; import org.apache.druid.query.aggregation.Aggregator; import org.apache.druid.query.aggregation.PostAggregator; import org.apache.druid.query.aggregation.TestDoubleColumnSelectorImpl; @@ -43,7 +44,8 @@ public class DoublesSketchToHistogramPostAggregatorTest final PostAggregator postAgg = new DoublesSketchToHistogramPostAggregator( "histogram", new FieldAccessPostAggregator("field", "sketch"), - new double[] {3.5} + new double[] {3.5}, + null ); final double[] histogram = (double[]) postAgg.compute(fields); @@ -54,7 +56,7 @@ public class DoublesSketchToHistogramPostAggregatorTest } @Test - public void normalCase() + public void splitPoints() { final double[] values = new double[] {1, 2, 3, 4, 5, 6}; final TestDoubleColumnSelectorImpl selector = new TestDoubleColumnSelectorImpl(values); @@ -72,7 +74,8 @@ public class DoublesSketchToHistogramPostAggregatorTest final PostAggregator postAgg = new DoublesSketchToHistogramPostAggregator( "histogram", new FieldAccessPostAggregator("field", "sketch"), - new double[] {3.5} // splits distribution in two buckets of equal mass + new double[] {3.5}, // splits distribution into two bins of equal mass + null ); final double[] histogram = (double[]) postAgg.compute(fields); @@ -81,4 +84,42 @@ public class DoublesSketchToHistogramPostAggregatorTest Assert.assertEquals(3.0, histogram[0], 0); Assert.assertEquals(3.0, histogram[1], 0); } + + @Test + public void numBins() + { + final double[] values = new double[] {1, 2, 3, 4, 5, 6}; + final TestDoubleColumnSelectorImpl selector = new 
TestDoubleColumnSelectorImpl(values); + + final Aggregator agg = new DoublesSketchBuildAggregator(selector, 8); + //noinspection ForLoopReplaceableByForEach + for (int i = 0; i < values.length; i++) { + agg.aggregate(); + selector.increment(); + } + + final Map fields = new HashMap<>(); + fields.put("sketch", agg.get()); + + final PostAggregator postAgg = new DoublesSketchToHistogramPostAggregator( + "histogram", + new FieldAccessPostAggregator("field", "sketch"), + null, + 2 // two bins of equal mass + ); + + final double[] histogram = (double[]) postAgg.compute(fields); + Assert.assertNotNull(histogram); + Assert.assertEquals(2, histogram.length); + Assert.assertEquals(3.0, histogram[0], 0); + Assert.assertEquals(3.0, histogram[1], 0); + } + + @Test + public void testEquals() + { + EqualsVerifier.forClass(DoublesSketchToHistogramPostAggregator.class) + .usingGetClass() + .verify(); + } } diff --git a/extensions-core/datasketches/src/test/java/org/apache/druid/query/aggregation/datasketches/quantiles/sql/DoublesSketchSqlAggregatorTest.java b/extensions-core/datasketches/src/test/java/org/apache/druid/query/aggregation/datasketches/quantiles/sql/DoublesSketchSqlAggregatorTest.java index c265298a380..ef284651b90 100644 --- a/extensions-core/datasketches/src/test/java/org/apache/druid/query/aggregation/datasketches/quantiles/sql/DoublesSketchSqlAggregatorTest.java +++ b/extensions-core/datasketches/src/test/java/org/apache/druid/query/aggregation/datasketches/quantiles/sql/DoublesSketchSqlAggregatorTest.java @@ -655,7 +655,8 @@ public class DoublesSketchSqlAggregatorTest extends CalciteTestBase "p12", "a2:agg" ), - new double[]{0.2d, 0.6d} + new double[]{0.2d, 0.6d}, + null ), new DoublesSketchToRankPostAggregator( "p15",