Lower initial sizing of sub aggregations.
We currently compute initial sizings based on the cardinality of our fields. This can be highly exaggerated for sub aggregations, for example if there is a parent terms aggregation that is executed over a field that has a very long tail: most buckets will only collect a couple of documents. Closes #5994
This commit is contained in:
parent
394a3e4332
commit
8cd7811955
|
@ -39,6 +39,19 @@ public abstract class Aggregator extends BucketCollector implements Releasable {
|
|||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* Returns whether any of the parent aggregators has {@link BucketAggregationMode#PER_BUCKET} as a bucket aggregation mode.
|
||||
*/
|
||||
public static boolean hasParentBucketAggregator(Aggregator parent) {
|
||||
if (parent == null) {
|
||||
return false;
|
||||
} else if (parent.bucketAggregationMode() == BucketAggregationMode.PER_BUCKET) {
|
||||
return true;
|
||||
} else {
|
||||
return hasParentBucketAggregator(parent.parent());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Defines the nature of the aggregator's aggregation execution when nested in other aggregators and the buckets they create.
|
||||
*/
|
||||
|
|
|
@ -164,6 +164,10 @@ public class HistogramAggregator extends BucketsAggregator {
|
|||
@Override
|
||||
protected Aggregator create(ValuesSource.Numeric valuesSource, long expectedBucketsCount, AggregationContext aggregationContext, Aggregator parent) {
|
||||
// todo if we'll keep track of min/max values in IndexFieldData, we could use the max here to come up with a better estimation for the buckets count
|
||||
long estimatedBucketCount = 50;
|
||||
if (hasParentBucketAggregator(parent)) {
|
||||
estimatedBucketCount = 8;
|
||||
}
|
||||
|
||||
// we need to round the bounds given by the user and we have to do it for every aggregator we create
|
||||
// as the rounding is not necessarily an idempotent operation.
|
||||
|
@ -174,7 +178,7 @@ public class HistogramAggregator extends BucketsAggregator {
|
|||
extendedBounds.processAndValidate(name, aggregationContext.searchContext(), config.parser());
|
||||
roundedBounds = extendedBounds.round(rounding);
|
||||
}
|
||||
return new HistogramAggregator(name, factories, rounding, order, keyed, minDocCount, roundedBounds, valuesSource, config.formatter(), 50, histogramFactory, aggregationContext, parent);
|
||||
return new HistogramAggregator(name, factories, rounding, order, keyed, minDocCount, roundedBounds, valuesSource, config.formatter(), estimatedBucketCount, histogramFactory, aggregationContext, parent);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -31,7 +31,7 @@ import org.elasticsearch.common.lucene.index.FilterableTermsEnum;
|
|||
import org.elasticsearch.common.lucene.index.FreqTermsEnum;
|
||||
import org.elasticsearch.index.mapper.FieldMapper;
|
||||
import org.elasticsearch.search.aggregations.*;
|
||||
import org.elasticsearch.search.aggregations.Aggregator.BucketAggregationMode;
|
||||
import org.elasticsearch.search.aggregations.bucket.terms.TermsAggregatorFactory;
|
||||
import org.elasticsearch.search.aggregations.bucket.terms.support.IncludeExclude;
|
||||
import org.elasticsearch.search.aggregations.support.AggregationContext;
|
||||
import org.elasticsearch.search.aggregations.support.ValuesSource;
|
||||
|
@ -186,32 +186,11 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
|
|||
};
|
||||
}
|
||||
|
||||
private static boolean hasParentBucketAggregator(Aggregator parent) {
|
||||
if (parent == null) {
|
||||
return false;
|
||||
} else if (parent.bucketAggregationMode() == BucketAggregationMode.PER_BUCKET) {
|
||||
return true;
|
||||
}
|
||||
return hasParentBucketAggregator(parent.parent());
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Aggregator create(ValuesSource valuesSource, long expectedBucketsCount, AggregationContext aggregationContext, Aggregator parent) {
|
||||
numberOfAggregatorsCreated++;
|
||||
|
||||
long estimatedBucketCount = valuesSource.metaData().maxAtomicUniqueValuesCount();
|
||||
if (estimatedBucketCount < 0) {
|
||||
// there isn't an estimation available.. 50 should be a good start
|
||||
estimatedBucketCount = 50;
|
||||
}
|
||||
|
||||
// adding an upper bound on the estimation as some atomic field data in the future (binary doc values) are not
|
||||
// going to know their exact cardinality and will return upper bounds in AtomicFieldData.getNumberUniqueValues()
|
||||
// that may be largely over-estimated.. the value chosen here is arbitrary just to play nice with typical CPU cache
|
||||
//
|
||||
// Another reason is that it may be faster to resize upon growth than to start directly with the appropriate size.
|
||||
// And that all values are not necessarily visited by the matches.
|
||||
estimatedBucketCount = Math.min(estimatedBucketCount, 512);
|
||||
long estimatedBucketCount = TermsAggregatorFactory.estimatedBucketCount(valuesSource, parent);
|
||||
|
||||
if (valuesSource instanceof ValuesSource.Bytes) {
|
||||
ExecutionMode execution = null;
|
||||
|
@ -224,7 +203,7 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
|
|||
execution = ExecutionMode.MAP;
|
||||
}
|
||||
if (execution == null) {
|
||||
if (hasParentBucketAggregator(parent)) {
|
||||
if (Aggregator.hasParentBucketAggregator(parent)) {
|
||||
execution = ExecutionMode.GLOBAL_ORDINALS_HASH;
|
||||
} else {
|
||||
execution = ExecutionMode.GLOBAL_ORDINALS;
|
||||
|
|
|
@ -22,7 +22,6 @@ import org.apache.lucene.search.IndexSearcher;
|
|||
import org.elasticsearch.ElasticsearchIllegalArgumentException;
|
||||
import org.elasticsearch.common.ParseField;
|
||||
import org.elasticsearch.search.aggregations.*;
|
||||
import org.elasticsearch.search.aggregations.Aggregator.BucketAggregationMode;
|
||||
import org.elasticsearch.search.aggregations.bucket.terms.support.IncludeExclude;
|
||||
import org.elasticsearch.search.aggregations.support.AggregationContext;
|
||||
import org.elasticsearch.search.aggregations.support.ValuesSource;
|
||||
|
@ -182,18 +181,7 @@ public class TermsAggregatorFactory extends ValuesSourceAggregatorFactory {
|
|||
};
|
||||
}
|
||||
|
||||
private static boolean hasParentBucketAggregator(Aggregator parent) {
|
||||
if (parent == null) {
|
||||
return false;
|
||||
} else if (parent.bucketAggregationMode() == BucketAggregationMode.PER_BUCKET) {
|
||||
return true;
|
||||
} else {
|
||||
return hasParentBucketAggregator(parent.parent());
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Aggregator create(ValuesSource valuesSource, long expectedBucketsCount, AggregationContext aggregationContext, Aggregator parent) {
|
||||
public static long estimatedBucketCount(ValuesSource valuesSource, Aggregator parent) {
|
||||
long estimatedBucketCount = valuesSource.metaData().maxAtomicUniqueValuesCount();
|
||||
if (estimatedBucketCount < 0) {
|
||||
// there isn't an estimation available.. 50 should be a good start
|
||||
|
@ -208,6 +196,19 @@ public class TermsAggregatorFactory extends ValuesSourceAggregatorFactory {
|
|||
// And that all values are not necessarily visited by the matches.
|
||||
estimatedBucketCount = Math.min(estimatedBucketCount, 512);
|
||||
|
||||
if (Aggregator.hasParentBucketAggregator(parent)) {
|
||||
// There is a parent that creates buckets, potentially with a very long tail of buckets with few documents
|
||||
// Let's be conservative with memory in that case
|
||||
estimatedBucketCount = Math.min(estimatedBucketCount, 8);
|
||||
}
|
||||
|
||||
return estimatedBucketCount;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Aggregator create(ValuesSource valuesSource, long expectedBucketsCount, AggregationContext aggregationContext, Aggregator parent) {
|
||||
long estimatedBucketCount = estimatedBucketCount(valuesSource, parent);
|
||||
|
||||
if (valuesSource instanceof ValuesSource.Bytes) {
|
||||
ExecutionMode execution = null;
|
||||
if (executionHint != null) {
|
||||
|
@ -238,7 +239,7 @@ public class TermsAggregatorFactory extends ValuesSourceAggregatorFactory {
|
|||
// if there is a parent bucket aggregator the number of instances of this aggregator is going
|
||||
// to be unbounded and most instances may only aggregate few documents, so use hashed based
|
||||
// global ordinals to keep the bucket ords dense.
|
||||
if (hasParentBucketAggregator(parent)) {
|
||||
if (Aggregator.hasParentBucketAggregator(parent)) {
|
||||
execution = ExecutionMode.GLOBAL_ORDINALS_HASH;
|
||||
} else {
|
||||
if (factories == AggregatorFactories.EMPTY) {
|
||||
|
|
Loading…
Reference in New Issue