From c08daf25892c12f7219a097e8b9290dff280c047 Mon Sep 17 00:00:00 2001
From: Jim Ferenczi
Date: Fri, 27 Apr 2018 15:26:46 +0200
Subject: [PATCH] Build global ordinals terms bucket from matching ordinals (#30166)

The global ordinals terms aggregator has an option to remap global ordinals to
dense ordinals that match the request. This mode is picked automatically when
the terms aggregator is a child of another bucket aggregator or when it needs
to defer buckets to an aggregation that is used in the ordering of the terms.
However, when building the final buckets, this aggregator loops over all
possible global ordinals rather than using the hash map that was built to
remap the ordinals. For fields with high cardinality this is highly
inefficient and can lead to slow responses even when the number of terms that
match the query is low.

This change fixes the performance issue by using the hash table of matching
ordinals to prune the final buckets for the terms and significant_terms
aggregations.

I ran a simple benchmark with 1M documents containing 0 to 10 keywords
randomly selected among 1M unique terms. This field was used to run
multi-level terms aggregations, with Rally collecting the response times.
The aggregation below is an example of the two-level terms aggregation used
in the benchmark:

```
"aggregations":{
  "1":{
    "terms":{
      "field":"keyword"
    },
    "aggregations":{
      "2":{
        "terms":{
          "field":"keyword"
        }
      }
    }
  }
}
```

| Levels of aggregation | 50th percentile (master) | 50th percentile (patch) |
| --- | --- | --- |
| 2 | 640.41ms | 577.499ms |
| 3 | 2239.66ms | 600.154ms |
| 4 | 14141.2ms | 703.512ms |

Closes #30117
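To illustrate the change in iteration strategy, the sketch below contrasts the
old full scan over every global ordinal with the new scan over only the
collected ordinals. It is a simplified, self-contained model of the idea only:
it uses a plain java.util.Map in place of LongHash, and the class and method
names (OrdinalPruningSketch, scanAllOrdinals, scanCollectedOrdinals) are
invented for the example rather than taken from the aggregator code:

```
import java.util.LinkedHashMap;
import java.util.Map;

public class OrdinalPruningSketch {

    // Old approach: visit every global ordinal of the field (up to valueCount,
    // i.e. the field cardinality) and look each one up in the collected buckets.
    static long scanAllOrdinals(long valueCount, Map<Long, Long> bucketOrds, long[] docCounts) {
        long buckets = 0;
        for (long globalOrd = 0; globalOrd < valueCount; globalOrd++) {
            Long bucketOrd = bucketOrds.get(globalOrd); // misses for most ordinals
            long docCount = bucketOrd == null ? 0 : docCounts[bucketOrd.intValue()];
            if (docCount > 0) {
                buckets++;
            }
        }
        return buckets;
    }

    // New approach: visit only the ordinals that were actually collected, so the
    // work is proportional to the number of matching terms, not the cardinality.
    static long scanCollectedOrdinals(Map<Long, Long> bucketOrds, long[] docCounts) {
        long buckets = 0;
        for (Map.Entry<Long, Long> entry : bucketOrds.entrySet()) {
            long bucketOrd = entry.getValue();
            long docCount = docCounts[(int) bucketOrd];
            if (docCount > 0) {
                buckets++;
            }
        }
        return buckets;
    }

    public static void main(String[] args) {
        long valueCount = 1_000_000;                        // field cardinality
        Map<Long, Long> bucketOrds = new LinkedHashMap<>(); // stand-in for LongHash: global ord -> dense bucket ord
        bucketOrds.put(42L, 0L);
        bucketOrds.put(99_999L, 1L);
        long[] docCounts = {3, 7};                          // per-bucket doc counts

        // Both calls find the same 2 buckets, but the first loops 1,000,000 times
        // while the second loops only twice.
        System.out.println(scanAllOrdinals(valueCount, bucketOrds, docCounts));
        System.out.println(scanCollectedOrdinals(bucketOrds, docCounts));
    }
}
```

Both loops produce the same buckets; only the amount of work differs, which is
what the benchmark numbers above reflect.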
---
 ...balOrdinalsSignificantTermsAggregator.java | 21 ++++++++++-----
 .../GlobalOrdinalsStringTermsAggregator.java  | 27 ++++++++++++-------
 2 files changed, 32 insertions(+), 16 deletions(-)

diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/GlobalOrdinalsSignificantTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/GlobalOrdinalsSignificantTermsAggregator.java
index 66b8f8d5b15..25f83caa3eb 100644
--- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/GlobalOrdinalsSignificantTermsAggregator.java
+++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/GlobalOrdinalsSignificantTermsAggregator.java
@@ -20,10 +20,8 @@ package org.elasticsearch.search.aggregations.bucket.significant;
 
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.LeafReaderContext;
-import org.apache.lucene.index.SortedSetDocValues;
 import org.apache.lucene.util.BytesRef;
 import org.elasticsearch.common.lease.Releasables;
-import org.elasticsearch.common.util.LongHash;
 import org.elasticsearch.search.DocValueFormat;
 import org.elasticsearch.search.aggregations.Aggregator;
 import org.elasticsearch.search.aggregations.AggregatorFactories;
@@ -103,11 +101,22 @@ public class GlobalOrdinalsSignificantTermsAggregator extends GlobalOrdinalsStri
 
         BucketSignificancePriorityQueue<SignificantStringTerms.Bucket> ordered = new BucketSignificancePriorityQueue<>(size);
         SignificantStringTerms.Bucket spare = null;
-        for (long globalTermOrd = 0; globalTermOrd < valueCount; ++globalTermOrd) {
-            if (includeExclude != null && !acceptedGlobalOrdinals.get(globalTermOrd)) {
+        final boolean needsFullScan = bucketOrds == null || bucketCountThresholds.getMinDocCount() == 0;
+        final long maxId = needsFullScan ? valueCount : bucketOrds.size();
+        for (long ord = 0; ord < maxId; ord++) {
+            final long globalOrd;
+            final long bucketOrd;
+            if (needsFullScan) {
+                bucketOrd = bucketOrds == null ? ord : bucketOrds.find(ord);
+                globalOrd = ord;
+            } else {
+                assert bucketOrds != null;
+                bucketOrd = ord;
+                globalOrd = bucketOrds.get(ord);
+            }
+            if (includeExclude != null && !acceptedGlobalOrdinals.get(globalOrd)) {
                 continue;
             }
-            final long bucketOrd = getBucketOrd(globalTermOrd);
             final int bucketDocCount = bucketOrd < 0 ? 0 : bucketDocCount(bucketOrd);
             if (bucketCountThresholds.getMinDocCount() > 0 && bucketDocCount == 0) {
                 continue;
@@ -120,7 +129,7 @@ public class GlobalOrdinalsSignificantTermsAggregator extends GlobalOrdinalsStri
                 spare = new SignificantStringTerms.Bucket(new BytesRef(), 0, 0, 0, 0, null, format);
             }
             spare.bucketOrd = bucketOrd;
-            copy(lookupGlobalOrd.apply(globalTermOrd), spare.termBytes);
+            copy(lookupGlobalOrd.apply(globalOrd), spare.termBytes);
             spare.subsetDf = bucketDocCount;
             spare.subsetSize = subsetSize;
             spare.supersetDf = termsAggFactory.getBackgroundFrequency(spare.termBytes);
diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java
index 6ad14b8d0f9..03eb00337e9 100644
--- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java
+++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java
@@ -71,7 +71,7 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr
     protected final long valueCount;
     protected final GlobalOrdLookupFunction lookupGlobalOrd;
 
-    private final LongHash bucketOrds;
+    protected final LongHash bucketOrds;
 
     public interface GlobalOrdLookupFunction {
         BytesRef apply(long ord) throws IOException;
@@ -107,10 +107,6 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr
         return bucketOrds != null;
     }
 
-    protected final long getBucketOrd(long globalOrd) {
-        return bucketOrds == null ? globalOrd : bucketOrds.find(globalOrd);
-    }
-
     private void collectGlobalOrd(int doc, long globalOrd, LeafBucketCollector sub) throws IOException {
         if (bucketOrds == null) {
             collectExistingBucket(sub, doc, globalOrd);
@@ -188,17 +184,28 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr
         long otherDocCount = 0;
         BucketPriorityQueue<OrdBucket> ordered = new BucketPriorityQueue<>(size, order.comparator(this));
         OrdBucket spare = new OrdBucket(-1, 0, null, showTermDocCountError, 0);
-        for (long globalTermOrd = 0; globalTermOrd < valueCount; ++globalTermOrd) {
-            if (includeExclude != null && !acceptedGlobalOrdinals.get(globalTermOrd)) {
+        final boolean needsFullScan = bucketOrds == null || bucketCountThresholds.getMinDocCount() == 0;
+        final long maxId = needsFullScan ? valueCount : bucketOrds.size();
+        for (long ord = 0; ord < maxId; ord++) {
+            final long globalOrd;
+            final long bucketOrd;
+            if (needsFullScan) {
+                bucketOrd = bucketOrds == null ? ord : bucketOrds.find(ord);
+                globalOrd = ord;
+            } else {
+                assert bucketOrds != null;
+                bucketOrd = ord;
+                globalOrd = bucketOrds.get(ord);
+            }
+            if (includeExclude != null && !acceptedGlobalOrdinals.get(globalOrd)) {
                 continue;
             }
-            final long bucketOrd = getBucketOrd(globalTermOrd);
             final int bucketDocCount = bucketOrd < 0 ? 0 : bucketDocCount(bucketOrd);
             if (bucketCountThresholds.getMinDocCount() > 0 && bucketDocCount == 0) {
                 continue;
             }
             otherDocCount += bucketDocCount;
-            spare.globalOrd = globalTermOrd;
+            spare.globalOrd = globalOrd;
             spare.bucketOrd = bucketOrd;
             spare.docCount = bucketDocCount;
             if (bucketCountThresholds.getShardMinDocCount() <= spare.docCount) {
@@ -378,7 +385,7 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr
             }
             final long ord = i - 1; // remember we do +1 when counting
             final long globalOrd = mapping.applyAsLong(ord);
-            long bucketOrd = getBucketOrd(globalOrd);
+            long bucketOrd = bucketOrds == null ? globalOrd : bucketOrds.find(globalOrd);
            incrementBucketDocCount(bucketOrd, inc);
         }
     }
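For readers unfamiliar with LongHash, the patch leans on its two-way mapping
between inserted keys (global ordinals) and dense ids (bucket ordinals):
find(globalOrd) returns the dense id or -1, get(denseId) returns the global
ordinal, and size() is the number of collected ordinals. The toy class below
models just those semantics with standard collections; the name ToyLongHash
and its internals are invented for illustration and are not the Elasticsearch
implementation:

```
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Toy model of the LongHash operations used by the patch:
//   add(key)  -> dense id assigned in insertion order (the real LongHash reports
//                an already-present key differently; that detail is ignored here)
//   find(key) -> dense id, or -1 if the key was never added
//   get(id)   -> the key stored under that dense id
//   size()    -> number of distinct keys added so far
final class ToyLongHash {
    private final Map<Long, Long> idsByKey = new HashMap<>();
    private final List<Long> keysById = new ArrayList<>();

    long add(long key) {
        Long existing = idsByKey.get(key);
        if (existing != null) {
            return existing;          // already present, reuse its dense id
        }
        long id = keysById.size();
        idsByKey.put(key, id);
        keysById.add(key);
        return id;
    }

    long find(long key) {
        return idsByKey.getOrDefault(key, -1L);
    }

    long get(long id) {
        return keysById.get((int) id);
    }

    long size() {
        return keysById.size();
    }
}
```

With these semantics, the new build loop can iterate dense ids from 0 to
size() and recover each matching global ordinal via get, which is exactly the
pruning described in the commit message.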