Add histogram field type support to boxplot aggs (#52265)

Add support for the histogram field type to boxplot aggs.

Closes #52233
Relates to #33112
This commit is contained in:
Igor Motov 2020-02-13 18:09:26 -05:00 committed by GitHub
parent c61124a7b9
commit a66988281f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 76 additions and 27 deletions

View File

@ -45,6 +45,7 @@ include::metrics/valuecount-aggregation.asciidoc[]
include::metrics/median-absolute-deviation-aggregation.asciidoc[]
include::metrics/boxplot-aggregation.asciidoc[]

View File

@ -4,7 +4,8 @@
=== Boxplot Aggregation
A `boxplot` metrics aggregation that computes boxplot of numeric values extracted from the aggregated documents.
These values can be extracted either from specific numeric fields in the documents, or be generated by a provided script.
These values can be generated by a provided script or extracted from specific numeric or
<<histogram,histogram fields>> in the documents.
The `boxplot` aggregation returns essential information for making a https://en.wikipedia.org/wiki/Box_plot[box plot]: minimum, maximum
median, first quartile (25th percentile) and third quartile (75th percentile) values.

View File

@ -285,7 +285,7 @@ GET latency/_search
<1> Compression controls memory usage and approximation error
// tag::[t-digest]
// tag::t-digest[]
The TDigest algorithm uses a number of "nodes" to approximate percentiles -- the
more nodes available, the higher the accuracy (and large memory footprint) proportional
to the volume of data. The `compression` parameter limits the maximum number of
@ -301,7 +301,7 @@ A "node" uses roughly 32 bytes of memory, so under worst-case scenarios (large a
of data which arrives sorted and in-order) the default settings will produce a
TDigest roughly 64KB in size. In practice data tends to be more random and
the TDigest will use less memory.
// tag::[t-digest]
// end::t-digest[]
==== HDR Histogram

View File

@ -37,6 +37,7 @@ following aggregations and queries:
* <<search-aggregations-metrics-percentile-aggregation,percentiles>> aggregation
* <<search-aggregations-metrics-percentile-rank-aggregation,percentile ranks>> aggregation
* <<search-aggregations-metrics-boxplot-aggregation,boxplot>> aggregation
* <<query-dsl-exists-query,exists>> query
[[mapping-types-histogram-building-histogram]]

View File

@ -29,7 +29,7 @@ import java.util.Objects;
import static org.elasticsearch.search.aggregations.metrics.PercentilesAggregationBuilder.COMPRESSION_FIELD;
public class BoxplotAggregationBuilder extends ValuesSourceAggregationBuilder.LeafOnly<ValuesSource.Numeric,
public class BoxplotAggregationBuilder extends ValuesSourceAggregationBuilder.LeafOnly<ValuesSource,
BoxplotAggregationBuilder> {
public static final String NAME = "boxplot";
@ -37,7 +37,7 @@ public class BoxplotAggregationBuilder extends ValuesSourceAggregationBuilder.Le
static {
PARSER = new ObjectParser<>(BoxplotAggregationBuilder.NAME);
ValuesSourceParserHelper.declareNumericFields(PARSER, true, true, false);
ValuesSourceParserHelper.declareAnyFields(PARSER, true, true);
PARSER.declareDouble(BoxplotAggregationBuilder::compression, COMPRESSION_FIELD);
}
@ -98,7 +98,7 @@ public class BoxplotAggregationBuilder extends ValuesSourceAggregationBuilder.Le
@Override
protected BoxplotAggregatorFactory innerBuild(QueryShardContext queryShardContext,
ValuesSourceConfig<ValuesSource.Numeric> config,
ValuesSourceConfig<ValuesSource> config,
AggregatorFactory parent,
AggregatorFactories.Builder subFactoriesBuilder) throws IOException {
return new BoxplotAggregatorFactory(name, config, compression, queryShardContext, parent, subFactoriesBuilder, metaData);

View File

@ -11,6 +11,8 @@ import org.apache.lucene.search.ScoreMode;
import org.elasticsearch.common.lease.Releasables;
import org.elasticsearch.common.util.BigArrays;
import org.elasticsearch.common.util.ObjectArray;
import org.elasticsearch.index.fielddata.HistogramValue;
import org.elasticsearch.index.fielddata.HistogramValues;
import org.elasticsearch.index.fielddata.SortedNumericDoubleValues;
import org.elasticsearch.search.DocValueFormat;
import org.elasticsearch.search.aggregations.Aggregator;
@ -29,12 +31,12 @@ import java.util.Map;
public class BoxplotAggregator extends NumericMetricsAggregator.MultiValue {
private final ValuesSource.Numeric valuesSource;
private final ValuesSource valuesSource;
private final DocValueFormat format;
protected ObjectArray<TDigestState> states;
protected final double compression;
BoxplotAggregator(String name, ValuesSource.Numeric valuesSource, DocValueFormat formatter, double compression,
BoxplotAggregator(String name, ValuesSource valuesSource, DocValueFormat formatter, double compression,
SearchContext context, Aggregator parent, List<PipelineAggregator> pipelineAggregators,
Map<String, Object> metaData) throws IOException {
super(name, context, parent, pipelineAggregators, metaData);
@ -58,23 +60,38 @@ public class BoxplotAggregator extends NumericMetricsAggregator.MultiValue {
return LeafBucketCollector.NO_OP_COLLECTOR;
}
final BigArrays bigArrays = context.bigArrays();
final SortedNumericDoubleValues values = valuesSource.doubleValues(ctx);
return new LeafBucketCollectorBase(sub, values) {
@Override
public void collect(int doc, long bucket) throws IOException {
states = bigArrays.grow(states, bucket + 1);
if (values.advanceExact(doc)) {
if (valuesSource instanceof ValuesSource.Histogram) {
final HistogramValues values = ((ValuesSource.Histogram)valuesSource).getHistogramValues(ctx);
return new LeafBucketCollectorBase(sub, values) {
@Override
public void collect(int doc, long bucket) throws IOException {
TDigestState state = getExistingOrNewHistogram(bigArrays, bucket);
if (values.advanceExact(doc)) {
final int valueCount = values.docValueCount();
for (int i = 0; i < valueCount; i++) {
state.add(values.nextValue());
final HistogramValue sketch = values.histogram();
while(sketch.next()) {
state.add(sketch.value(), sketch.count());
}
}
}
}
};
};
} else {
final SortedNumericDoubleValues values = ((ValuesSource.Numeric)valuesSource).doubleValues(ctx);
return new LeafBucketCollectorBase(sub, values) {
@Override
public void collect(int doc, long bucket) throws IOException {
states = bigArrays.grow(states, bucket + 1);
if (values.advanceExact(doc)) {
TDigestState state = getExistingOrNewHistogram(bigArrays, bucket);
if (values.advanceExact(doc)) {
final int valueCount = values.docValueCount();
for (int i = 0; i < valueCount; i++) {
state.add(values.nextValue());
}
}
}
}
};
}
}
private TDigestState getExistingOrNewHistogram(final BigArrays bigArrays, long bucket) {

View File

@ -20,12 +20,12 @@ import java.io.IOException;
import java.util.List;
import java.util.Map;
public class BoxplotAggregatorFactory extends ValuesSourceAggregatorFactory<ValuesSource.Numeric> {
public class BoxplotAggregatorFactory extends ValuesSourceAggregatorFactory<ValuesSource> {
private final double compression;
BoxplotAggregatorFactory(String name,
ValuesSourceConfig<ValuesSource.Numeric> config,
ValuesSourceConfig<ValuesSource> config,
double compression,
QueryShardContext queryShardContext,
AggregatorFactory parent,
@ -46,7 +46,7 @@ public class BoxplotAggregatorFactory extends ValuesSourceAggregatorFactory<Valu
}
@Override
protected Aggregator doCreateInternal(ValuesSource.Numeric valuesSource,
protected Aggregator doCreateInternal(ValuesSource valuesSource,
SearchContext searchContext,
Aggregator parent,
boolean collectsFromSingleBucket,

View File

@ -27,6 +27,8 @@ import org.elasticsearch.search.aggregations.metrics.PercentilesMethod;
import org.elasticsearch.search.aggregations.metrics.TDigestState;
import org.elasticsearch.test.ESSingleNodeTestCase;
import org.elasticsearch.xpack.analytics.AnalyticsPlugin;
import org.elasticsearch.xpack.analytics.boxplot.Boxplot;
import org.elasticsearch.xpack.analytics.boxplot.BoxplotAggregationBuilder;
import org.elasticsearch.xpack.core.LocalStateCompositeXPackPlugin;
import java.util.ArrayList;
@ -131,8 +133,7 @@ public class HistogramPercentileAggregationTests extends ESSingleNodeTestCase {
}
}
public void testTDigestHistogram() throws Exception {
private void setupTDigestHistogram(int compression) throws Exception {
XContentBuilder xContentBuilder = XContentFactory.jsonBuilder()
.startObject()
.startObject("_doc")
@ -170,8 +171,6 @@ public class HistogramPercentileAggregationTests extends ESSingleNodeTestCase {
PutMappingRequest request2 = new PutMappingRequest("pre_agg").type("_doc").source(xContentBuilder2);
client().admin().indices().putMapping(request2).actionGet();
int compression = TestUtil.nextInt(random(), 200, 300);
TDigestState histogram = new TDigestState(compression);
BulkRequest bulkRequest = new BulkRequest();
@ -218,6 +217,11 @@ public class HistogramPercentileAggregationTests extends ESSingleNodeTestCase {
response = client().prepareSearch("pre_agg").get();
assertEquals(numDocs / frq, response.getHits().getTotalHits().value);
}
public void testTDigestHistogram() throws Exception {
int compression = TestUtil.nextInt(random(), 200, 300);
setupTDigestHistogram(compression);
PercentilesAggregationBuilder builder =
AggregationBuilders.percentiles("agg").field("inner.data").method(PercentilesMethod.TDIGEST)
@ -236,6 +240,31 @@ public class HistogramPercentileAggregationTests extends ESSingleNodeTestCase {
}
}
public void testBoxplotHistogram() throws Exception {
int compression = TestUtil.nextInt(random(), 200, 300);
setupTDigestHistogram(compression);
BoxplotAggregationBuilder bpBuilder = new BoxplotAggregationBuilder("agg").field("inner.data").compression(compression);
SearchResponse bpResponseRaw = client().prepareSearch("raw").addAggregation(bpBuilder).get();
SearchResponse bpResponsePreAgg = client().prepareSearch("pre_agg").addAggregation(bpBuilder).get();
SearchResponse bpResponseBoth = client().prepareSearch("raw", "pre_agg").addAggregation(bpBuilder).get();
Boxplot bpRaw = bpResponseRaw.getAggregations().get("agg");
Boxplot bpPreAgg = bpResponsePreAgg.getAggregations().get("agg");
Boxplot bpBoth = bpResponseBoth.getAggregations().get("agg");
assertEquals(bpRaw.getMax(), bpPreAgg.getMax(), 0.0);
assertEquals(bpRaw.getMax(), bpBoth.getMax(), 0.0);
assertEquals(bpRaw.getMin(), bpPreAgg.getMin(), 0.0);
assertEquals(bpRaw.getMin(), bpBoth.getMin(), 0.0);
assertEquals(bpRaw.getQ1(), bpPreAgg.getQ1(), 1.0);
assertEquals(bpRaw.getQ1(), bpBoth.getQ1(), 1.0);
assertEquals(bpRaw.getQ2(), bpPreAgg.getQ2(), 1.0);
assertEquals(bpRaw.getQ2(), bpBoth.getQ2(), 1.0);
assertEquals(bpRaw.getQ3(), bpPreAgg.getQ3(), 1.0);
assertEquals(bpRaw.getQ3(), bpBoth.getQ3(), 1.0);
}
@Override
protected Collection<Class<? extends Plugin>> getPlugins() {
List<Class<? extends Plugin>> plugins = new ArrayList<>(super.getPlugins());