Make significant terms work on fields that are indexed with points. #18031

It keeps using the caching terms enum for keyword/text fields and falls back
to IndexSearcher.count for fields that do not use the inverted index for
searching (such as numbers and IP addresses). Note that this probably makes
significant terms aggregations on these fields less efficient than they used
to be. They should still be acceptable under a sampler aggregation though.
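As a standalone illustration (the class and method names below are invented
for the example; only the BooleanQuery/IndexSearcher.count pattern is taken
from this change), the fallback boils down to:

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;

// Estimates the background frequency of a term for a field that has no
// inverted index by counting matching documents instead of reading postings.
class BackgroundFrequencyFallback {

    private final IndexSearcher searcher;
    private final Query backgroundFilter; // null means "whole index"

    BackgroundFrequencyFallback(IndexReader reader, Query backgroundFilter) {
        this.searcher = new IndexSearcher(reader);
        this.backgroundFilter = backgroundFilter;
    }

    long backgroundFrequency(Query termQuery) throws IOException {
        Query query = termQuery;
        if (backgroundFilter != null) {
            // FILTER clauses do not contribute scores, which lets the
            // searcher use cheap counting strategies where possible.
            query = new BooleanQuery.Builder()
                    .add(query, Occur.FILTER)
                    .add(backgroundFilter, Occur.FILTER)
                    .build();
        }
        return searcher.count(query);
    }
}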

This moves tests back to the state they were in before numbers started using
points, and also adds a new test verifying that significant terms
aggregations fail if a field is not indexed.

In the long term, we might want to follow the approach that Robert initially
proposed, which consists of collecting all documents that match the
background filter in order to compute frequencies using doc values. This
would also mean that significant terms aggregations no longer require fields
to be indexed.
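A rough sketch of that idea (written against more recent Lucene APIs than
this code base, with illustrative names; a real implementation would tally
per-segment ordinals and merge them through global ordinals rather than
materialize a map of terms):

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Weight;
import org.apache.lucene.util.BytesRef;

// Tallies how often each term of `field` occurs in documents matching the
// background filter, reading doc values rather than the inverted index.
class DocValuesBackgroundFrequencies {

    static Map<BytesRef, Long> frequencies(IndexSearcher searcher, Query backgroundFilter,
            String field) throws IOException {
        Map<BytesRef, Long> freqs = new HashMap<>();
        Query rewritten = searcher.rewrite(backgroundFilter);
        Weight weight = searcher.createWeight(rewritten, ScoreMode.COMPLETE_NO_SCORES, 1f);
        for (LeafReaderContext leaf : searcher.getIndexReader().leaves()) {
            Scorer scorer = weight.scorer(leaf);
            if (scorer == null) {
                continue; // no matching documents in this segment
            }
            SortedSetDocValues values = DocValues.getSortedSet(leaf.reader(), field);
            DocIdSetIterator docs = scorer.iterator();
            for (int doc = docs.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = docs.nextDoc()) {
                if (values.advanceExact(doc)) {
                    for (long ord = values.nextOrd(); ord != SortedSetDocValues.NO_MORE_ORDS; ord = values.nextOrd()) {
                        freqs.merge(BytesRef.deepCopyOf(values.lookupOrd(ord)), 1L, Long::sum);
                    }
                }
            }
        }
        return freqs;
    }
}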
Adrien Grand 2016-04-27 20:59:28 +02:00
parent e4edee5d9a
commit 866a5459f0
9 changed files with 111 additions and 89 deletions

View File

@@ -66,18 +66,12 @@ public class FilterableTermsEnum extends TermsEnum {
     protected long currentTotalTermFreq = 0;
     protected BytesRef current;
     protected final int docsEnumFlag;
-    protected int numDocs;

     public FilterableTermsEnum(IndexReader reader, String field, int docsEnumFlag, @Nullable Query filter) throws IOException {
         if ((docsEnumFlag != PostingsEnum.FREQS) && (docsEnumFlag != PostingsEnum.NONE)) {
             throw new IllegalArgumentException("invalid docsEnumFlag of " + docsEnumFlag);
         }
         this.docsEnumFlag = docsEnumFlag;
-        if (filter == null) {
-            // Important - need to use the doc count that includes deleted docs
-            // or we have this issue: https://github.com/elastic/elasticsearch/issues/7951
-            numDocs = reader.maxDoc();
-        }
         List<LeafReaderContext> leaves = reader.leaves();
         List<Holder> enums = new ArrayList<>(leaves.size());
         final Weight weight;
@@ -118,20 +112,12 @@ public class FilterableTermsEnum extends TermsEnum {
                 }
                 bits = BitSet.of(docs, context.reader().maxDoc());
-
-                // Count how many docs are in our filtered set
-                // TODO make this lazy-loaded only for those that need it?
-                numDocs += bits.cardinality();
             }
             enums.add(new Holder(termsEnum, bits));
         }
         this.enums = enums.toArray(new Holder[enums.size()]);
     }
-
-    public int getNumDocs() {
-        return numDocs;
-    }

     @Override
     public BytesRef term() throws IOException {
         return current;

View File

@@ -91,7 +91,7 @@ public class GlobalOrdinalsSignificantTermsAggregator extends GlobalOrdinalsStri
         } else {
             size = (int) Math.min(maxBucketOrd(), bucketCountThresholds.getShardSize());
         }
-        long supersetSize = termsAggFactory.prepareBackground(context);
+        long supersetSize = termsAggFactory.getSupersetNumDocs();
         long subsetSize = numCollectedDocs;

         BucketSignificancePriorityQueue ordered = new BucketSignificancePriorityQueue(size);

View File

@@ -79,7 +79,7 @@ public class SignificantLongTermsAggregator extends LongTermsAggregator {

         final int size = (int) Math.min(bucketOrds.size(), bucketCountThresholds.getShardSize());
-        long supersetSize = termsAggFactory.prepareBackground(context);
+        long supersetSize = termsAggFactory.getSupersetNumDocs();
         long subsetSize = numCollectedDocs;

         BucketSignificancePriorityQueue ordered = new BucketSignificancePriorityQueue(size);

View File

@@ -78,7 +78,7 @@ public class SignificantStringTermsAggregator extends StringTermsAggregator {
         assert owningBucketOrdinal == 0;

         final int size = (int) Math.min(bucketOrds.size(), bucketCountThresholds.getShardSize());
-        long supersetSize = termsAggFactory.prepareBackground(context);
+        long supersetSize = termsAggFactory.getSupersetNumDocs();
         long subsetSize = numCollectedDocs;

         BucketSignificancePriorityQueue ordered = new BucketSignificancePriorityQueue(size);

View File

@@ -19,12 +19,15 @@
 package org.elasticsearch.search.aggregations.bucket.significant;

+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.PostingsEnum;
 import org.apache.lucene.index.Term;
-import org.apache.lucene.search.Query;
+import org.apache.lucene.search.BooleanClause.Occur;
 import org.apache.lucene.util.BytesRef;
-import org.elasticsearch.ElasticsearchException;
 import org.elasticsearch.common.ParseField;
 import org.elasticsearch.common.ParseFieldMatcher;
 import org.elasticsearch.common.lease.Releasable;
@@ -65,7 +68,8 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
     private MappedFieldType fieldType;
     private FilterableTermsEnum termsEnum;
     private int numberOfAggregatorsCreated;
-    private final QueryBuilder filterBuilder;
+    private final Query filter;
+    private final int supersetNumDocs;
     private final TermsAggregator.BucketCountThresholds bucketCountThresholds;
     private final SignificanceHeuristic significanceHeuristic;
@@ -76,7 +80,15 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
         super(name, type, config, context, parent, subFactoriesBuilder, metaData);
         this.includeExclude = includeExclude;
         this.executionHint = executionHint;
-        this.filterBuilder = filterBuilder;
+        this.filter = filterBuilder == null
+                ? null
+                : filterBuilder.toQuery(context.searchContext().getQueryShardContext());
+        IndexSearcher searcher = context.searchContext().searcher();
+        this.supersetNumDocs = filter == null
+                // Important - need to use the doc count that includes deleted docs
+                // or we have this issue: https://github.com/elastic/elasticsearch/issues/7951
+                ? searcher.getIndexReader().maxDoc()
+                : searcher.count(filter);
         this.bucketCountThresholds = bucketCountThresholds;
         this.significanceHeuristic = significanceHeuristic;
         this.significanceHeuristic.initialize(context.searchContext());
@@ -92,65 +104,56 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
     }

     /**
-     * Creates the TermsEnum (if not already created) and must be called before
-     * any calls to getBackgroundFrequency
-     *
-     * @param context
-     *            The aggregation context
-     * @return The number of documents in the index (after an optional filter
-     *         might have been applied)
+     * Get the number of docs in the superset.
      */
-    public long prepareBackground(AggregationContext context) {
+    public long getSupersetNumDocs() {
+        return supersetNumDocs;
+    }
+
+    private FilterableTermsEnum getTermsEnum(String field) throws IOException {
         if (termsEnum != null) {
-            // already prepared - return
-            return termsEnum.getNumDocs();
+            return termsEnum;
         }
-        SearchContext searchContext = context.searchContext();
-        IndexReader reader = searchContext.searcher().getIndexReader();
-        Query filter = null;
-        try {
-            if (filterBuilder != null) {
-                filter = filterBuilder.toFilter(context.searchContext().getQueryShardContext());
-            }
-        } catch (IOException e) {
-            throw new ElasticsearchException("failed to create filter: " + filterBuilder.toString(), e);
+        IndexReader reader = context.searchContext().searcher().getIndexReader();
+        if (numberOfAggregatorsCreated > 1) {
+            termsEnum = new FreqTermsEnum(reader, field, true, false, filter, context.bigArrays());
+        } else {
+            termsEnum = new FilterableTermsEnum(reader, indexedFieldName, PostingsEnum.NONE, filter);
         }
-        try {
-            if (numberOfAggregatorsCreated == 1) {
-                // Setup a termsEnum for sole use by one aggregator
-                termsEnum = new FilterableTermsEnum(reader, indexedFieldName, PostingsEnum.NONE, filter);
+        return termsEnum;
+    }
+
+    private long getBackgroundFrequency(String value) throws IOException {
+        Query query = fieldType.termQuery(value, context.searchContext().getQueryShardContext());
+        if (query instanceof TermQuery) {
+            // for types that use the inverted index, we prefer using a caching terms
+            // enum that will do a better job at reusing index inputs
+            Term term = ((TermQuery) query).getTerm();
+            FilterableTermsEnum termsEnum = getTermsEnum(term.field());
+            if (termsEnum.seekExact(term.bytes())) {
+                return termsEnum.docFreq();
             } else {
-                // When we have > 1 agg we have possibility of duplicate term
-                // frequency lookups
-                // and so use a TermsEnum that caches results of all term
-                // lookups
-                termsEnum = new FreqTermsEnum(reader, indexedFieldName, true, false, filter, searchContext.bigArrays());
+                return 0;
             }
-        } catch (IOException e) {
-            throw new ElasticsearchException("failed to build terms enumeration", e);
         }
-        return termsEnum.getNumDocs();
+        // otherwise do it the naive way
+        if (filter != null) {
+            query = new BooleanQuery.Builder()
+                    .add(query, Occur.FILTER)
+                    .add(filter, Occur.FILTER)
+                    .build();
+        }
+        return context.searchContext().searcher().count(query);
     }

-    public long getBackgroundFrequency(BytesRef termBytes) {
-        assert termsEnum != null; // having failed to find a field in the index
-                                  // we don't expect any calls for frequencies
-        long result = 0;
-        try {
-            if (termsEnum.seekExact(termBytes)) {
-                result = termsEnum.docFreq();
-            }
-        } catch (IOException e) {
-            throw new ElasticsearchException("IOException loading background document frequency info", e);
-        }
-        return result;
+    public long getBackgroundFrequency(BytesRef termBytes) throws IOException {
+        String value = config.format().format(termBytes);
+        return getBackgroundFrequency(value);
     }

-    public long getBackgroundFrequency(long value) {
-        Query query = fieldType.termQuery(value, null);
-        Term term = MappedFieldType.extractTerm(query);
-        BytesRef indexedVal = term.bytes();
-        return getBackgroundFrequency(indexedVal);
+    public long getBackgroundFrequency(long termNum) throws IOException {
+        String value = config.format().format(termNum);
+        return getBackgroundFrequency(value);
     }

     @Override

View File

@@ -19,8 +19,10 @@
 package org.elasticsearch.search.aggregations.bucket;

 import org.elasticsearch.action.admin.indices.refresh.RefreshRequest;
+import org.elasticsearch.action.search.SearchPhaseExecutionException;
 import org.elasticsearch.action.search.SearchResponse;
 import org.elasticsearch.action.search.SearchType;
+import org.elasticsearch.cluster.metadata.IndexMetaData;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.query.QueryBuilders;
 import org.elasticsearch.index.query.TermQueryBuilder;
@@ -47,6 +49,7 @@ import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcke
 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertSearchResponse;
 import static org.elasticsearch.search.aggregations.AggregationBuilders.significantTerms;
 import static org.elasticsearch.search.aggregations.AggregationBuilders.terms;
+import static org.hamcrest.Matchers.containsString;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.hasSize;
 import static org.hamcrest.Matchers.is;
@@ -69,15 +72,15 @@ public class SignificantTermsIT extends ESIntegTestCase {
                 .build();
     }

-    public static final String MUSIC_CATEGORY="1";
-    public static final String OTHER_CATEGORY="2";
-    public static final String SNOWBOARDING_CATEGORY="3";
+    public static final int MUSIC_CATEGORY=1;
+    public static final int OTHER_CATEGORY=2;
+    public static final int SNOWBOARDING_CATEGORY=3;

     @Override
     public void setupSuiteScopeCluster() throws Exception {
         assertAcked(prepareCreate("test").setSettings(SETTING_NUMBER_OF_SHARDS, 5, SETTING_NUMBER_OF_REPLICAS, 0).addMapping("fact",
                 "_routing", "required=true", "routing_id", "type=keyword", "fact_category",
-                "type=keyword,index=true", "description", "type=text,fielddata=true"));
+                "type=integer", "description", "type=text,fielddata=true"));
         createIndex("idx_unmapped");
         ensureGreen();
@@ -110,6 +113,15 @@ public class SignificantTermsIT extends ESIntegTestCase {
                     .setSource("fact_category", parts[1], "description", parts[2]).get();
         }
         client().admin().indices().refresh(new RefreshRequest("test")).get();
+
+        assertAcked(prepareCreate("test_not_indexed")
+                .setSettings(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
+                .addMapping("type",
+                        "my_keyword", "type=keyword,index=false",
+                        "my_long", "type=long,index=false"));
+        indexRandom(true,
+                client().prepareIndex("test_not_indexed", "type", "1").setSource(
+                        "my_keyword", "foo", "my_long", 42));
     }

     public void testStructuredAnalysis() throws Exception {
@@ -123,12 +135,12 @@ public class SignificantTermsIT extends ESIntegTestCase {
                 .actionGet();
         assertSearchResponse(response);
         SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
-        String topCategory = (String) topTerms.getBuckets().iterator().next().getKey();
-        assertTrue(topCategory.equals(SNOWBOARDING_CATEGORY));
+        Number topCategory = (Number) topTerms.getBuckets().iterator().next().getKey();
+        assertTrue(topCategory.equals(Long.valueOf(SNOWBOARDING_CATEGORY)));
     }

     public void testStructuredAnalysisWithIncludeExclude() throws Exception {
-        String[] excludeTerms = { MUSIC_CATEGORY };
+        long[] excludeTerms = { MUSIC_CATEGORY };
         SearchResponse response = client().prepareSearch("test")
                 .setSearchType(SearchType.QUERY_AND_FETCH)
                 .setQuery(new TermQueryBuilder("_all", "paul"))
@@ -139,8 +151,8 @@ public class SignificantTermsIT extends ESIntegTestCase {
                 .actionGet();
         assertSearchResponse(response);
         SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
-        String topCategory = topTerms.getBuckets().iterator().next().getKeyAsString();
-        assertTrue(topCategory.equals(OTHER_CATEGORY));
+        Number topCategory = (Number) topTerms.getBuckets().iterator().next().getKey();
+        assertTrue(topCategory.equals(Long.valueOf(OTHER_CATEGORY)));
     }

     public void testIncludeExclude() throws Exception {
@@ -422,4 +434,18 @@ public class SignificantTermsIT extends ESIntegTestCase {
         SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
         checkExpectedStringTermsFound(topTerms);
     }
+
+    public void testFailIfFieldNotIndexed() {
+        SearchPhaseExecutionException e = expectThrows(SearchPhaseExecutionException.class,
+                () -> client().prepareSearch("test_not_indexed").addAggregation(
+                        significantTerms("mySignificantTerms").field("my_keyword")).get());
+        assertThat(e.toString(),
+                containsString("Cannot search on field [my_keyword] since it is not indexed."));
+
+        e = expectThrows(SearchPhaseExecutionException.class,
+                () -> client().prepareSearch("test_not_indexed").addAggregation(
+                        significantTerms("mySignificantTerms").field("my_long")).get());
+        assertThat(e.toString(),
+                containsString("Cannot search on field [my_long] since it is not indexed."));
+    }
 }

View File

@@ -97,7 +97,7 @@ public class SignificantTermsSignificanceScoreIT extends ESIntegTestCase {
     }

     public void testPlugin() throws Exception {
-        String type = randomBoolean() ? "text" : "keyword";
+        String type = randomBoolean() ? "text" : "long";
         String settings = "{\"index.number_of_shards\": 1, \"index.number_of_replicas\": 0}";
         SharedSignificantTermsTestMethods.index01Docs(type, settings, this);
         SearchResponse response = client().prepareSearch(INDEX_NAME).setTypes(DOC_TYPE)
@@ -250,7 +250,7 @@ public class SignificantTermsSignificanceScoreIT extends ESIntegTestCase {
     }

     public void testXContentResponse() throws Exception {
-        String type = randomBoolean() ? "text" : "keyword";
+        String type = randomBoolean() ? "text" : "long";
         String settings = "{\"index.number_of_shards\": 1, \"index.number_of_replicas\": 0}";
         SharedSignificantTermsTestMethods.index01Docs(type, settings, this);
         SearchResponse response = client().prepareSearch(INDEX_NAME).setTypes(DOC_TYPE)
@@ -272,7 +272,12 @@ public class SignificantTermsSignificanceScoreIT extends ESIntegTestCase {
         XContentBuilder responseBuilder = XContentFactory.jsonBuilder();
         classes.toXContent(responseBuilder, null);
-        String result = "\"class\"{\"doc_count_error_upper_bound\":0,\"sum_other_doc_count\":0,\"buckets\":[{\"key\":\"0\",\"doc_count\":4,\"sig_terms\":{\"doc_count\":4,\"buckets\":[{\"key\":\"0\",\"doc_count\":4,\"score\":0.39999999999999997,\"bg_count\":5}]}},{\"key\":\"1\",\"doc_count\":3,\"sig_terms\":{\"doc_count\":3,\"buckets\":[{\"key\":\"1\",\"doc_count\":3,\"score\":0.75,\"bg_count\":4}]}}]}";
+        String result = null;
+        if (type.equals("long")) {
+            result = "\"class\"{\"doc_count_error_upper_bound\":0,\"sum_other_doc_count\":0,\"buckets\":[{\"key\":\"0\",\"doc_count\":4,\"sig_terms\":{\"doc_count\":4,\"buckets\":[{\"key\":0,\"doc_count\":4,\"score\":0.39999999999999997,\"bg_count\":5}]}},{\"key\":\"1\",\"doc_count\":3,\"sig_terms\":{\"doc_count\":3,\"buckets\":[{\"key\":1,\"doc_count\":3,\"score\":0.75,\"bg_count\":4}]}}]}";
+        } else {
+            result = "\"class\"{\"doc_count_error_upper_bound\":0,\"sum_other_doc_count\":0,\"buckets\":[{\"key\":\"0\",\"doc_count\":4,\"sig_terms\":{\"doc_count\":4,\"buckets\":[{\"key\":\"0\",\"doc_count\":4,\"score\":0.39999999999999997,\"bg_count\":5}]}},{\"key\":\"1\",\"doc_count\":3,\"sig_terms\":{\"doc_count\":3,\"buckets\":[{\"key\":\"1\",\"doc_count\":3,\"score\":0.75,\"bg_count\":4}]}}]}";
+        }
         assertThat(responseBuilder.string(), equalTo(result));
     }
@@ -321,7 +326,7 @@ public class SignificantTermsSignificanceScoreIT extends ESIntegTestCase {
     }

     public void testBackgroundVsSeparateSet() throws Exception {
-        String type = randomBoolean() ? "text" : "keyword";
+        String type = randomBoolean() ? "text" : "long";
         String settings = "{\"index.number_of_shards\": 1, \"index.number_of_replicas\": 0}";
         SharedSignificantTermsTestMethods.index01Docs(type, settings, this);
         testBackgroundVsSeparateSet(new MutualInformation(true, true), new MutualInformation(true, false));
@@ -448,7 +453,7 @@ public class SignificantTermsSignificanceScoreIT extends ESIntegTestCase {
     }

     public void testScriptScore() throws ExecutionException, InterruptedException, IOException {
-        indexRandomFrequencies01(randomBoolean() ? "text" : "keyword");
+        indexRandomFrequencies01(randomBoolean() ? "text" : "long");
         ScriptHeuristic scriptHeuristic = getScriptSignificanceHeuristic();
         ensureYellow();
         SearchResponse response = client().prepareSearch(INDEX_NAME)

View File

@@ -53,7 +53,7 @@ public class TermsShardMinDocCountIT extends ESIntegTestCase {
     public void testShardMinDocCountSignificantTermsTest() throws Exception {
         String textMappings;
         if (randomBoolean()) {
-            textMappings = "type=keyword";
+            textMappings = "type=long";
         } else {
             textMappings = "type=text,fielddata=true";
         }

View File

@@ -5,9 +5,11 @@

 Numeric fields have been refactored to use a different data structure that
 performs better for range queries. However, since this data structure does
-not record document frequencies, numeric fields can no longer be used for
-significant terms aggregations. It is recommended to use <<keyword,`keyword`>>
-fields instead, either directly or through a <<multi-fields,multi-field>>
-if the numeric representation is still needed for sorting, range queries or
-numeric aggregations like
+not record document frequencies, numeric fields need to fall back to running
+queries in order to estimate the number of matching documents in the
+background set, which may incur a performance degradation.
+
+It is recommended to use <<keyword,`keyword`>> fields instead, either directly
+or through a <<multi-fields,multi-field>> if the numeric representation is
+still needed for sorting, range queries or numeric aggregations like
 <<search-aggregations-metrics-stats-aggregation,`stats` aggregations>>.
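For example, a multi-field mapping along these lines (index, type and field
names are illustrative) keeps the numeric representation while also indexing
the values as a keyword that significant terms can use efficiently:

PUT my_index
{
  "mappings": {
    "my_type": {
      "properties": {
        "product_id": {
          "type": "long",
          "fields": {
            "keyword": {
              "type": "keyword"
            }
          }
        }
      }
    }
  }
}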