added number of bins parameter (#9436)

* added number of bins parameter

* addressed review points

* test equals

Co-authored-by: AlexanderSaydakov <AlexanderSaydakov@users.noreply.github.com>
This commit is contained in:
Alexander Saydakov 2020-05-04 16:53:09 -07:00 committed by GitHub
parent 85dfbb64cb
commit 844d626738
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 118 additions and 16 deletions

View File

@ -87,14 +87,15 @@ This returns an array of quantiles corresponding to a given array of fractions
#### Histogram
This returns an approximation to the histogram given an array of split points that define the histogram bins. An array of <i>m</i> unique, monotonically increasing split points divide the real number line into <i>m+1</i> consecutive disjoint intervals. The definition of an interval is inclusive of the left split point and exclusive of the right split point.
This returns an approximation to the histogram given an array of split points that define the histogram bins or a number of bins (not both). An array of <i>m</i> unique, monotonically increasing split points divides the real number line into <i>m+1</i> consecutive disjoint intervals. The definition of an interval is inclusive of the left split point and exclusive of the right split point. If the number of bins is specified instead of split points, the interval between the minimum and maximum values is divided into the given number of equally-spaced bins.
```json
{
"type" : "quantilesDoublesSketchToHistogram",
"name": <output name>,
"field" : <post aggregator that refers to a DoublesSketch (fieldAccess or another post aggregator)>,
"splitPoints" : <array of split points>
"splitPoints" : <array of split points (optional)>,
"numBins" : <number of bins (optional, defaults to 10)>
}
```

View File

@ -150,6 +150,11 @@
<artifactId>jackson-jaxrs-smile-provider</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>nl.jqno.equalsverifier</groupId>
<artifactId>equalsverifier</artifactId>
<scope>test</scope>
</dependency>
<!-- Test Dependencies -->
<dependency>

View File

@ -20,6 +20,7 @@
package org.apache.druid.query.aggregation.datasketches.quantiles;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.google.common.base.Preconditions;
import org.apache.datasketches.quantiles.DoublesSketch;
@ -29,6 +30,7 @@ import org.apache.druid.query.aggregation.AggregatorUtil;
import org.apache.druid.query.aggregation.PostAggregator;
import org.apache.druid.query.cache.CacheKeyBuilder;
import javax.annotation.Nullable;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Map;
@ -36,38 +38,63 @@ import java.util.Set;
public class DoublesSketchToHistogramPostAggregator implements PostAggregator
{
static final int DEFAULT_NUM_BINS = 10;
private final String name;
private final PostAggregator field;
private final double[] splitPoints;
private final Integer numBins;
@JsonCreator
public DoublesSketchToHistogramPostAggregator(
@JsonProperty("name") final String name,
@JsonProperty("field") final PostAggregator field,
@JsonProperty("splitPoints") final double[] splitPoints)
@JsonProperty("splitPoints") @Nullable final double[] splitPoints,
@JsonProperty("numBins") @Nullable final Integer numBins)
{
this.name = Preconditions.checkNotNull(name, "name is null");
this.field = Preconditions.checkNotNull(field, "field is null");
this.splitPoints = Preconditions.checkNotNull(splitPoints, "array of split points is null");
this.splitPoints = splitPoints;
this.numBins = numBins;
if (splitPoints != null && numBins != null) {
throw new IAE("Cannot accept both 'splitPoints' and 'numBins'");
}
}
@Override
public Object compute(final Map<String, Object> combinedAggregators)
{
final DoublesSketch sketch = (DoublesSketch) field.compute(combinedAggregators);
final int numBins = splitPoints != null ? splitPoints.length + 1 :
(this.numBins != null ? this.numBins.intValue() : DEFAULT_NUM_BINS);
if (numBins < 2) {
throw new IAE("at least 2 bins expected");
}
if (sketch.isEmpty()) {
final double[] histogram = new double[splitPoints.length + 1];
final double[] histogram = new double[numBins];
Arrays.fill(histogram, Double.NaN);
return histogram;
}
final double[] histogram = sketch.getPMF(splitPoints);
final double[] histogram = sketch.getPMF(splitPoints != null ? splitPoints :
equallySpacedPoints(numBins, sketch.getMinValue(), sketch.getMaxValue()));
for (int i = 0; i < histogram.length; i++) {
histogram[i] *= sketch.getN();
histogram[i] *= sketch.getN(); // scale fractions to counts
}
return histogram;
}
/**
 * Returns {@code num - 1} interior points that split the interval [min, max] into {@code num}
 * intervals of equal width. The end points {@code min} and {@code max} themselves are not included.
 *
 * @param num number of intervals; the caller is expected to pass at least 2
 * @param min lower end of the interval
 * @param max upper end of the interval
 * @return the {@code num - 1} split points in increasing index order
 */
private static double[] equallySpacedPoints(final int num, final double min, final double max)
{
  final double width = (max - min) / num;
  final double[] splits = new double[num - 1];
  // the k-th split point (0-based) lies k + 1 widths above min
  Arrays.setAll(splits, k -> min + width * (k + 1));
  return splits;
}
@Override
@JsonProperty
public String getName()
@ -82,11 +109,19 @@ public class DoublesSketchToHistogramPostAggregator implements PostAggregator
}
/**
 * Returns the split points this post aggregator was constructed with, or {@code null} when none
 * were given. The property is left out of the serialized JSON when it is {@code null}.
 */
@JsonProperty
@JsonInclude(JsonInclude.Include.NON_NULL)
public double[] getSplitPoints()
{
  return this.splitPoints;
}
/**
 * Returns the number of bins this post aggregator was constructed with, or {@code null} when none
 * was given. The property is left out of the serialized JSON when it is {@code null}.
 */
@JsonProperty
@JsonInclude(JsonInclude.Include.NON_NULL)
public Integer getNumBins()
{
  return this.numBins;
}
@Override
public Comparator<double[]> getComparator()
{
@ -106,6 +141,7 @@ public class DoublesSketchToHistogramPostAggregator implements PostAggregator
"name='" + name + '\'' +
", field=" + field +
", splitPoints=" + Arrays.toString(splitPoints) +
", numBins=" + numBins +
"}";
}
@ -125,7 +161,16 @@ public class DoublesSketchToHistogramPostAggregator implements PostAggregator
if (!Arrays.equals(splitPoints, that.splitPoints)) {
return false;
}
return field.equals(that.field);
if (!field.equals(that.field)) {
return false;
}
if (numBins == null && that.numBins == null) {
return true;
}
if (numBins != null && numBins.equals(that.numBins)) {
return true;
}
return false;
}
@Override
@ -133,6 +178,9 @@ public class DoublesSketchToHistogramPostAggregator implements PostAggregator
{
int hashCode = name.hashCode() * 31 + field.hashCode();
hashCode = hashCode * 31 + Arrays.hashCode(splitPoints);
if (numBins != null) {
hashCode = hashCode * 31 + numBins.hashCode();
}
return hashCode;
}
@ -141,9 +189,14 @@ public class DoublesSketchToHistogramPostAggregator implements PostAggregator
{
final CacheKeyBuilder builder = new CacheKeyBuilder(
AggregatorUtil.QUANTILES_DOUBLES_SKETCH_TO_HISTOGRAM_CACHE_TYPE_ID).appendCacheable(field);
if (splitPoints != null) {
for (final double value : splitPoints) {
builder.appendDouble(value);
}
}
if (numBins != null) {
builder.appendInt(numBins);
}
return builder.build();
}

View File

@ -33,12 +33,13 @@ public class DoublesSketchToHistogramOperatorConversion extends DoublesSketchLis
}
@Override
public PostAggregator makePostAgg(String name, PostAggregator field, double[] args)
public PostAggregator makePostAgg(String name, PostAggregator field, double[] points)
{
return new DoublesSketchToHistogramPostAggregator(
name,
field,
args
points,
null
);
}
}

View File

@ -19,6 +19,7 @@
package org.apache.druid.query.aggregation.datasketches.quantiles;
import nl.jqno.equalsverifier.EqualsVerifier;
import org.apache.druid.query.aggregation.Aggregator;
import org.apache.druid.query.aggregation.PostAggregator;
import org.apache.druid.query.aggregation.TestDoubleColumnSelectorImpl;
@ -43,7 +44,8 @@ public class DoublesSketchToHistogramPostAggregatorTest
final PostAggregator postAgg = new DoublesSketchToHistogramPostAggregator(
"histogram",
new FieldAccessPostAggregator("field", "sketch"),
new double[] {3.5}
new double[] {3.5},
null
);
final double[] histogram = (double[]) postAgg.compute(fields);
@ -54,7 +56,7 @@ public class DoublesSketchToHistogramPostAggregatorTest
}
@Test
public void normalCase()
public void splitPoints()
{
final double[] values = new double[] {1, 2, 3, 4, 5, 6};
final TestDoubleColumnSelectorImpl selector = new TestDoubleColumnSelectorImpl(values);
@ -72,7 +74,8 @@ public class DoublesSketchToHistogramPostAggregatorTest
final PostAggregator postAgg = new DoublesSketchToHistogramPostAggregator(
"histogram",
new FieldAccessPostAggregator("field", "sketch"),
new double[] {3.5} // splits distribution in two buckets of equal mass
new double[] {3.5}, // splits distribution into two bins of equal mass
null
);
final double[] histogram = (double[]) postAgg.compute(fields);
@ -81,4 +84,42 @@ public class DoublesSketchToHistogramPostAggregatorTest
Assert.assertEquals(3.0, histogram[0], 0);
Assert.assertEquals(3.0, histogram[1], 0);
}
/**
 * Builds a sketch from the values 1..6 and requests a histogram by number of bins rather than by
 * explicit split points; two bins are expected to hold three values each.
 */
@Test
public void numBins()
{
  final double[] inputs = new double[] {1, 2, 3, 4, 5, 6};
  final TestDoubleColumnSelectorImpl columnSelector = new TestDoubleColumnSelectorImpl(inputs);
  final Aggregator aggregator = new DoublesSketchBuildAggregator(columnSelector, 8);
  // one aggregate/increment step per input value
  //noinspection ForLoopReplaceableByForEach
  for (int fed = 0; fed < inputs.length; fed++) {
    aggregator.aggregate();
    columnSelector.increment();
  }

  final Map<String, Object> row = new HashMap<>();
  row.put("sketch", aggregator.get());

  // no split points; only the number of bins is supplied
  final PostAggregator histogramPostAgg = new DoublesSketchToHistogramPostAggregator(
      "histogram",
      new FieldAccessPostAggregator("field", "sketch"),
      null,
      2 // two bins of equal mass
  );

  final double[] counts = (double[]) histogramPostAgg.compute(row);
  Assert.assertNotNull(counts);
  Assert.assertEquals(2, counts.length);
  Assert.assertEquals(3.0, counts[0], 0);
  Assert.assertEquals(3.0, counts[1], 0);
}
/**
 * Runs EqualsVerifier against {@link DoublesSketchToHistogramPostAggregator}, configured with
 * {@code usingGetClass()}.
 */
@Test
public void testEquals()
{
  final Class<DoublesSketchToHistogramPostAggregator> subject =
      DoublesSketchToHistogramPostAggregator.class;
  EqualsVerifier.forClass(subject).usingGetClass().verify();
}
}

View File

@ -655,7 +655,8 @@ public class DoublesSketchSqlAggregatorTest extends CalciteTestBase
"p12",
"a2:agg"
),
new double[]{0.2d, 0.6d}
new double[]{0.2d, 0.6d},
null
),
new DoublesSketchToRankPostAggregator(
"p15",