Aggregations enhancement - remove pointless term frequency lookups.

If the user has set shard_min_doc_count, skip the background frequency lookup for any term whose foreground document count on a shard falls below that threshold, since such a term is discarded on that shard anyway. Closes #11093
2015-05-11 16:18:59 +01:00 · 2015-05-11 16:18:59 +01:00 · 89b95dccc8
parent 236f6ccad7
commit 89b95dccc8
3 changed files with 20 additions and 13 deletions
--- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/GlobalOrdinalsSignificantTermsAggregator.java
+++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/GlobalOrdinalsSignificantTermsAggregator.java
@ -99,6 +99,10 @@ public class GlobalOrdinalsSignificantTermsAggregator extends GlobalOrdinalsStri
            if (bucketCountThresholds.getMinDocCount() > 0 && bucketDocCount == 0) {
                continue;
            }
+            if (bucketDocCount < bucketCountThresholds.getShardMinDocCount()) {
+                continue;
+            }
+
            if (spare == null) {
                spare = new SignificantStringTerms.Bucket(new BytesRef(), 0, 0, 0, 0, null);
            }
@ -113,9 +117,7 @@ public class GlobalOrdinalsSignificantTermsAggregator extends GlobalOrdinalsStri
            // Back at the central reducer these properties will be updated with
            // global stats
            spare.updateScore(termsAggFactory.getSignificanceHeuristic());
-            if (spare.subsetDf >= bucketCountThresholds.getShardMinDocCount()) {
-                spare = (SignificantStringTerms.Bucket) ordered.insertWithOverflow(spare);
-            }
+            spare = (SignificantStringTerms.Bucket) ordered.insertWithOverflow(spare);
        }

        final InternalSignificantTerms.Bucket[] list = new InternalSignificantTerms.Bucket[ordered.size()];
--- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantLongTermsAggregator.java
+++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantLongTermsAggregator.java
@ -24,8 +24,8 @@ import org.elasticsearch.common.Nullable;
 import org.elasticsearch.common.lease.Releasables;
 import org.elasticsearch.search.aggregations.Aggregator;
 import org.elasticsearch.search.aggregations.AggregatorFactories;
-import org.elasticsearch.search.aggregations.LeafBucketCollectorBase;
 import org.elasticsearch.search.aggregations.LeafBucketCollector;
+import org.elasticsearch.search.aggregations.LeafBucketCollectorBase;
 import org.elasticsearch.search.aggregations.bucket.terms.LongTermsAggregator;
 import org.elasticsearch.search.aggregations.bucket.terms.support.IncludeExclude;
 import org.elasticsearch.search.aggregations.reducers.Reducer;
@ -82,11 +82,15 @@ public class SignificantLongTermsAggregator extends LongTermsAggregator {
        BucketSignificancePriorityQueue ordered = new BucketSignificancePriorityQueue(size);
        SignificantLongTerms.Bucket spare = null;
        for (long i = 0; i < bucketOrds.size(); i++) {
+            final int docCount = bucketDocCount(i);
+            if (docCount < bucketCountThresholds.getShardMinDocCount()) {
+                continue;
+            }
            if (spare == null) {
                spare = new SignificantLongTerms.Bucket(0, 0, 0, 0, 0, null, formatter);
            }
            spare.term = bucketOrds.get(i);
-            spare.subsetDf = bucketDocCount(i);
+            spare.subsetDf = docCount;
            spare.subsetSize = subsetSize;
            spare.supersetDf = termsAggFactory.getBackgroundFrequency(spare.term);
            spare.supersetSize = supersetSize;
@ -95,9 +99,7 @@ public class SignificantLongTermsAggregator extends LongTermsAggregator {
            spare.updateScore(termsAggFactory.getSignificanceHeuristic());

            spare.bucketOrd = i;
-            if (spare.subsetDf >= bucketCountThresholds.getShardMinDocCount()) {
-                spare = (SignificantLongTerms.Bucket) ordered.insertWithOverflow(spare);
-            }
+            spare = (SignificantLongTerms.Bucket) ordered.insertWithOverflow(spare);
        }

        final InternalSignificantTerms.Bucket[] list = new InternalSignificantTerms.Bucket[ordered.size()];
--- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsAggregator.java
+++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsAggregator.java
@ -24,8 +24,8 @@ import org.apache.lucene.util.BytesRef;
 import org.elasticsearch.common.lease.Releasables;
 import org.elasticsearch.search.aggregations.Aggregator;
 import org.elasticsearch.search.aggregations.AggregatorFactories;
-import org.elasticsearch.search.aggregations.LeafBucketCollectorBase;
 import org.elasticsearch.search.aggregations.LeafBucketCollector;
+import org.elasticsearch.search.aggregations.LeafBucketCollectorBase;
 import org.elasticsearch.search.aggregations.bucket.terms.StringTermsAggregator;
 import org.elasticsearch.search.aggregations.bucket.terms.support.IncludeExclude;
 import org.elasticsearch.search.aggregations.reducers.Reducer;
@ -81,12 +81,17 @@ public class SignificantStringTermsAggregator extends StringTermsAggregator {
        BucketSignificancePriorityQueue ordered = new BucketSignificancePriorityQueue(size);
        SignificantStringTerms.Bucket spare = null;
        for (int i = 0; i < bucketOrds.size(); i++) {
+            final int docCount = bucketDocCount(i);
+            if (docCount < bucketCountThresholds.getShardMinDocCount()) {
+                continue;
+            }
+
            if (spare == null) {
                spare = new SignificantStringTerms.Bucket(new BytesRef(), 0, 0, 0, 0, null);
            }

            bucketOrds.get(i, spare.termBytes);
-            spare.subsetDf = bucketDocCount(i);
+            spare.subsetDf = docCount;
            spare.subsetSize = subsetSize;
            spare.supersetDf = termsAggFactory.getBackgroundFrequency(spare.termBytes);
            spare.supersetSize = supersetSize;
@ -97,9 +102,7 @@ public class SignificantStringTermsAggregator extends StringTermsAggregator {
            spare.updateScore(termsAggFactory.getSignificanceHeuristic());

            spare.bucketOrd = i;
-            if (spare.subsetDf >= bucketCountThresholds.getShardMinDocCount()) {
-                spare = (SignificantStringTerms.Bucket) ordered.insertWithOverflow(spare);
-            }
+            spare = (SignificantStringTerms.Bucket) ordered.insertWithOverflow(spare);
        }

        final InternalSignificantTerms.Bucket[] list = new InternalSignificantTerms.Bucket[ordered.size()];