From 8cd7811955ddfc20b615180eef70d0b811a7b057 Mon Sep 17 00:00:00 2001
From: Adrien Grand <jpountz@gmail.com>
Date: Wed, 30 Apr 2014 10:51:09 +0200
Subject: [PATCH] Lower initial sizing of sub aggregations.

We currently compute initial sizings based on the cardinality of our fields.
This can be highly exagerated for sub aggregations, for example if there is a
parent terms aggregation that is executed over a field that has a very long
tail: most buckets will only collect a couple of documents.

Close #5994
---
 .../search/aggregations/Aggregator.java       | 13 +++++++++
 .../bucket/histogram/HistogramAggregator.java |  6 +++-
 .../SignificantTermsAggregatorFactory.java    | 27 ++---------------
 .../bucket/terms/TermsAggregatorFactory.java  | 29 ++++++++++---------
 4 files changed, 36 insertions(+), 39 deletions(-)

diff --git a/src/main/java/org/elasticsearch/search/aggregations/Aggregator.java b/src/main/java/org/elasticsearch/search/aggregations/Aggregator.java
index 3303c42de01..d0393e2fd22 100644
--- a/src/main/java/org/elasticsearch/search/aggregations/Aggregator.java
+++ b/src/main/java/org/elasticsearch/search/aggregations/Aggregator.java
@@ -39,6 +39,19 @@ public abstract class Aggregator extends BucketCollector implements Releasable {
         }
     };
 
+    /**
+     * Returns whether any of the parent aggregators has {@link BucketAggregationMode#PER_BUCKET} as a bucket aggregation mode.
+     */
+    public static boolean hasParentBucketAggregator(Aggregator parent) {
+        if (parent == null) {
+            return false;
+        } else if (parent.bucketAggregationMode() == BucketAggregationMode.PER_BUCKET) {
+            return true;
+        } else {
+            return hasParentBucketAggregator(parent.parent());
+        }
+    }
+
     /**
      * Defines the nature of the aggregator's aggregation execution when nested in other aggregators and the buckets they create.
      */
diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/histogram/HistogramAggregator.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/histogram/HistogramAggregator.java
index 2fb5346f0bf..838b22b7174 100644
--- a/src/main/java/org/elasticsearch/search/aggregations/bucket/histogram/HistogramAggregator.java
+++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/histogram/HistogramAggregator.java
@@ -164,6 +164,10 @@ public class HistogramAggregator extends BucketsAggregator {
         @Override
         protected Aggregator create(ValuesSource.Numeric valuesSource, long expectedBucketsCount, AggregationContext aggregationContext, Aggregator parent) {
             // todo if we'll keep track of min/max values in IndexFieldData, we could use the max here to come up with a better estimation for the buckets count
+            long estimatedBucketCount = 50;
+            if (hasParentBucketAggregator(parent)) {
+                estimatedBucketCount = 8;
+            }
 
             // we need to round the bounds given by the user and we have to do it for every aggregator we crate
             // as the rounding is not necessarily an idempotent operation.
@@ -174,7 +178,7 @@ public class HistogramAggregator extends BucketsAggregator {
                 extendedBounds.processAndValidate(name, aggregationContext.searchContext(), config.parser());
                 roundedBounds = extendedBounds.round(rounding);
             }
-            return new HistogramAggregator(name, factories, rounding, order, keyed, minDocCount, roundedBounds, valuesSource, config.formatter(), 50, histogramFactory, aggregationContext, parent);
+            return new HistogramAggregator(name, factories, rounding, order, keyed, minDocCount, roundedBounds, valuesSource, config.formatter(), estimatedBucketCount, histogramFactory, aggregationContext, parent);
         }
 
     }
diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java
index 27cd3ff29bf..7f4442dcc04 100644
--- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java
+++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java
@@ -31,7 +31,7 @@ import org.elasticsearch.common.lucene.index.FilterableTermsEnum;
 import org.elasticsearch.common.lucene.index.FreqTermsEnum;
 import org.elasticsearch.index.mapper.FieldMapper;
 import org.elasticsearch.search.aggregations.*;
-import org.elasticsearch.search.aggregations.Aggregator.BucketAggregationMode;
+import org.elasticsearch.search.aggregations.bucket.terms.TermsAggregatorFactory;
 import org.elasticsearch.search.aggregations.bucket.terms.support.IncludeExclude;
 import org.elasticsearch.search.aggregations.support.AggregationContext;
 import org.elasticsearch.search.aggregations.support.ValuesSource;
@@ -186,32 +186,11 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
         };
     }
 
-    private static boolean hasParentBucketAggregator(Aggregator parent) {
-        if (parent == null) {
-            return false;
-        } else if (parent.bucketAggregationMode() == BucketAggregationMode.PER_BUCKET) {
-            return true;
-        }
-        return hasParentBucketAggregator(parent.parent());
-    }
-
     @Override
     protected Aggregator create(ValuesSource valuesSource, long expectedBucketsCount, AggregationContext aggregationContext, Aggregator parent) {
         numberOfAggregatorsCreated++;
 
-        long estimatedBucketCount = valuesSource.metaData().maxAtomicUniqueValuesCount();
-        if (estimatedBucketCount < 0) {
-            // there isn't an estimation available.. 50 should be a good start
-            estimatedBucketCount = 50;
-        }
-
-        // adding an upper bound on the estimation as some atomic field data in the future (binary doc values) and not
-        // going to know their exact cardinality and will return upper bounds in AtomicFieldData.getNumberUniqueValues()
-        // that may be largely over-estimated.. the value chosen here is arbitrary just to play nice with typical CPU cache
-        //
-        // Another reason is that it may be faster to resize upon growth than to start directly with the appropriate size.
-        // And that all values are not necessarily visited by the matches.
-        estimatedBucketCount = Math.min(estimatedBucketCount, 512);
+        long estimatedBucketCount = TermsAggregatorFactory.estimatedBucketCount(valuesSource, parent);
 
         if (valuesSource instanceof ValuesSource.Bytes) {
             ExecutionMode execution = null;
@@ -224,7 +203,7 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
                 execution = ExecutionMode.MAP;
             }
             if (execution == null) {
-                if (hasParentBucketAggregator(parent)) {
+                if (Aggregator.hasParentBucketAggregator(parent)) {
                     execution = ExecutionMode.GLOBAL_ORDINALS_HASH;
                 } else {
                     execution = ExecutionMode.GLOBAL_ORDINALS;
diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsAggregatorFactory.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsAggregatorFactory.java
index ba6afb53e3f..2a0834e3918 100644
--- a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsAggregatorFactory.java
+++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsAggregatorFactory.java
@@ -22,7 +22,6 @@ import org.apache.lucene.search.IndexSearcher;
 import org.elasticsearch.ElasticsearchIllegalArgumentException;
 import org.elasticsearch.common.ParseField;
 import org.elasticsearch.search.aggregations.*;
-import org.elasticsearch.search.aggregations.Aggregator.BucketAggregationMode;
 import org.elasticsearch.search.aggregations.bucket.terms.support.IncludeExclude;
 import org.elasticsearch.search.aggregations.support.AggregationContext;
 import org.elasticsearch.search.aggregations.support.ValuesSource;
@@ -182,18 +181,7 @@ public class TermsAggregatorFactory extends ValuesSourceAggregatorFactory {
         };
     }
 
-    private static boolean hasParentBucketAggregator(Aggregator parent) {
-        if (parent == null) {
-            return false;
-        } else if (parent.bucketAggregationMode() == BucketAggregationMode.PER_BUCKET) {
-            return true;
-        } else {
-            return hasParentBucketAggregator(parent.parent());
-        }
-    }
-
-    @Override
-    protected Aggregator create(ValuesSource valuesSource, long expectedBucketsCount, AggregationContext aggregationContext, Aggregator parent) {
+    public static long estimatedBucketCount(ValuesSource valuesSource, Aggregator parent) {
         long estimatedBucketCount = valuesSource.metaData().maxAtomicUniqueValuesCount();
         if (estimatedBucketCount < 0) {
             // there isn't an estimation available.. 50 should be a good start
@@ -208,6 +196,19 @@ public class TermsAggregatorFactory extends ValuesSourceAggregatorFactory {
         // And that all values are not necessarily visited by the matches.
         estimatedBucketCount = Math.min(estimatedBucketCount, 512);
 
+        if (Aggregator.hasParentBucketAggregator(parent)) {
+            // There is a parent that creates buckets, potentially with a very long tail of buckets with few documents
+            // Let's be conservative with memory in that case
+            estimatedBucketCount = Math.min(estimatedBucketCount, 8);
+        }
+
+        return estimatedBucketCount;
+    }
+
+    @Override
+    protected Aggregator create(ValuesSource valuesSource, long expectedBucketsCount, AggregationContext aggregationContext, Aggregator parent) {
+        long estimatedBucketCount = estimatedBucketCount(valuesSource, parent);
+
         if (valuesSource instanceof ValuesSource.Bytes) {
             ExecutionMode execution = null;
             if (executionHint != null) {
@@ -238,7 +239,7 @@ public class TermsAggregatorFactory extends ValuesSourceAggregatorFactory {
                 // if there is a parent bucket aggregator the number of instances of this aggregator is going
                 // to be unbounded and most instances may only aggregate few documents, so use hashed based
                 // global ordinals to keep the bucket ords dense.
-                if (hasParentBucketAggregator(parent)) {
+                if (Aggregator.hasParentBucketAggregator(parent)) {
                     execution = ExecutionMode.GLOBAL_ORDINALS_HASH;
                 } else {
                     if (factories == AggregatorFactories.EMPTY) {