From ad232aebbea681a9fa910f0da29e38b29a44c62c Mon Sep 17 00:00:00 2001 From: Jim Ferenczi Date: Wed, 8 Jun 2016 12:46:11 +0200 Subject: [PATCH] Set collection mode to breadth_first in the terms aggregation when the cardinality of the field is unknown or smaller than the requested size. closes #9825 --- .../bucket/BestBucketsDeferringCollector.java | 14 +++--- .../bucket/terms/TermsAggregationBuilder.java | 10 +++-- .../bucket/terms/TermsAggregatorFactory.java | 38 ++++++++++++++-- .../terms/TermsAggregatorFactoryTests.java | 44 +++++++++++++++++++ .../bucket/terms-aggregation.asciidoc | 18 ++++---- 5 files changed, 101 insertions(+), 23 deletions(-) create mode 100644 core/src/test/java/org/elasticsearch/search/aggregations/bucket/terms/TermsAggregatorFactoryTests.java diff --git a/core/src/main/java/org/elasticsearch/search/aggregations/bucket/BestBucketsDeferringCollector.java b/core/src/main/java/org/elasticsearch/search/aggregations/bucket/BestBucketsDeferringCollector.java index 43cefdca290..4dba4000da6 100644 --- a/core/src/main/java/org/elasticsearch/search/aggregations/bucket/BestBucketsDeferringCollector.java +++ b/core/src/main/java/org/elasticsearch/search/aggregations/bucket/BestBucketsDeferringCollector.java @@ -166,16 +166,16 @@ public class BestBucketsDeferringCollector extends DeferringBucketCollector { int doc = 0; for (long i = 0, end = entry.docDeltas.size(); i < end; ++i) { doc += docDeltaIterator.next(); - if (needsScores) { - if (docIt.docID() < doc) { - docIt.advance(doc); - } - // aggregations should only be replayed on matching documents - assert docIt.docID() == doc; - } final long bucket = buckets.next(); final long rebasedBucket = hash.find(bucket); if (rebasedBucket != -1) { + if (needsScores) { + if (docIt.docID() < doc) { + docIt.advance(doc); + } + // aggregations should only be replayed on matching documents + assert docIt.docID() == doc; + } leafCollector.collect(doc, rebasedBucket); } } diff --git a/core/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsAggregationBuilder.java b/core/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsAggregationBuilder.java index 1dd59edf6d9..87567d64df4 100644 --- a/core/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsAggregationBuilder.java +++ b/core/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsAggregationBuilder.java @@ -56,7 +56,7 @@ public class TermsAggregationBuilder extends ValuesSourceAggregationBuilder expectedSize) { + // use breadth_first if the cardinality is bigger than the expected size or unknown (-1) + return SubAggCollectionMode.BREADTH_FIRST; + } + return SubAggCollectionMode.DEPTH_FIRST; + } + public enum ExecutionMode { MAP(new ParseField("map")) { diff --git a/core/src/test/java/org/elasticsearch/search/aggregations/bucket/terms/TermsAggregatorFactoryTests.java b/core/src/test/java/org/elasticsearch/search/aggregations/bucket/terms/TermsAggregatorFactoryTests.java new file mode 100644 index 00000000000..7d7ffac4fb3 --- /dev/null +++ b/core/src/test/java/org/elasticsearch/search/aggregations/bucket/terms/TermsAggregatorFactoryTests.java @@ -0,0 +1,44 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.search.aggregations.bucket.terms; + +import org.elasticsearch.search.aggregations.Aggregator; +import org.elasticsearch.test.ESSingleNodeTestCase; + +import static org.hamcrest.Matchers.equalTo; + +public class TermsAggregatorFactoryTests extends ESSingleNodeTestCase { + public void testSubAggCollectMode() throws Exception { + assertThat(TermsAggregatorFactory.subAggCollectionMode(Integer.MAX_VALUE, -1), + equalTo(Aggregator.SubAggCollectionMode.DEPTH_FIRST)); + assertThat(TermsAggregatorFactory.subAggCollectionMode(10, -1), + equalTo(Aggregator.SubAggCollectionMode.BREADTH_FIRST)); + assertThat(TermsAggregatorFactory.subAggCollectionMode(10, 5), + equalTo(Aggregator.SubAggCollectionMode.DEPTH_FIRST)); + assertThat(TermsAggregatorFactory.subAggCollectionMode(10, 10), + equalTo(Aggregator.SubAggCollectionMode.DEPTH_FIRST)); + assertThat(TermsAggregatorFactory.subAggCollectionMode(10, 100), + equalTo(Aggregator.SubAggCollectionMode.BREADTH_FIRST)); + assertThat(TermsAggregatorFactory.subAggCollectionMode(1, 2), + equalTo(Aggregator.SubAggCollectionMode.BREADTH_FIRST)); + assertThat(TermsAggregatorFactory.subAggCollectionMode(1, 100), + equalTo(Aggregator.SubAggCollectionMode.BREADTH_FIRST)); + } +} diff --git a/docs/reference/aggregations/bucket/terms-aggregation.asciidoc b/docs/reference/aggregations/bucket/terms-aggregation.asciidoc index 1c88f0a06d8..3c1f4ae860a 100644 --- a/docs/reference/aggregations/bucket/terms-aggregation.asciidoc +++ b/docs/reference/aggregations/bucket/terms-aggregation.asciidoc @@ -577,7 +577,8 @@ Deferring calculation of child aggregations For fields with many unique terms and a small number of required results it can be more efficient to delay the calculation of child aggregations until the top parent-level aggs have been pruned. Ordinarily, all branches of the aggregation tree -are expanded in one depth-first pass and only then any pruning occurs. In some rare scenarios this can be very wasteful and can hit memory constraints. +are expanded in one depth-first pass and only then any pruning occurs. +In some scenarios this can be very wasteful and can hit memory constraints. An example problem scenario is querying a movie database for the 10 most popular actors and their 5 most common co-stars: [source,js] @@ -602,10 +603,13 @@ An example problem scenario is querying a movie database for the 10 most popular } -------------------------------------------------- -Even though the number of movies may be comparatively small and we want only 50 result buckets there is a combinatorial explosion of buckets -during calculation - a single movie will produce n² buckets where n is the number of actors. The sane option would be to first determine +Even though the number of actors may be comparatively small and we want only 50 result buckets there is a combinatorial explosion of buckets +during calculation - a single actor can produce n² buckets where n is the number of actors. The sane option would be to first determine the 10 most popular actors and only then examine the top co-stars for these 10 actors. This alternative strategy is what we call the `breadth_first` collection -mode as opposed to the default `depth_first` mode: +mode as opposed to the `depth_first` mode. + +NOTE: The `breadth_first` is the default mode for fields with a cardinality bigger than the requested size or when the cardinality is unknown (numeric fields or scripts for instance). +It is possible to override the default heuristic and to provide a collect mode directly in the request: [source,js] -------------------------------------------------- @@ -615,7 +619,7 @@ mode as opposed to the default `depth_first` mode: "terms" : { "field" : "actors", "size" : 10, - "collect_mode" : "breadth_first" + "collect_mode" : "breadth_first" <1> }, "aggs" : { "costars" : { @@ -630,12 +634,10 @@ mode as opposed to the default `depth_first` mode: } -------------------------------------------------- +<1> the possible values are `breadth_first` and `depth_first` When using `breadth_first` mode the set of documents that fall into the uppermost buckets are cached for subsequent replay so there is a memory overhead in doing this which is linear with the number of matching documents. -In most requests the volume of buckets generated is smaller than the number of documents that fall into them so the default `depth_first` -collection mode is normally the best bet but occasionally the `breadth_first` strategy can be significantly more efficient. Currently -elasticsearch will always use the `depth_first` collect_mode unless explicitly instructed to use `breadth_first` as in the above example. Note that the `order` parameter can still be used to refer to data from a child aggregation when using the `breadth_first` setting - the parent aggregation understands that this child aggregation will need to be called first before any of the other child aggregations.