diff --git a/core/src/main/java/org/elasticsearch/common/lucene/index/FilterableTermsEnum.java b/core/src/main/java/org/elasticsearch/common/lucene/index/FilterableTermsEnum.java
index 971cbdafffe..ded912cf9e8 100644
--- a/core/src/main/java/org/elasticsearch/common/lucene/index/FilterableTermsEnum.java
+++ b/core/src/main/java/org/elasticsearch/common/lucene/index/FilterableTermsEnum.java
@@ -66,18 +66,12 @@ public class FilterableTermsEnum extends TermsEnum {
     protected long currentTotalTermFreq = 0;
     protected BytesRef current;
     protected final int docsEnumFlag;
-    protected int numDocs;
 
     public FilterableTermsEnum(IndexReader reader, String field, int docsEnumFlag, @Nullable Query filter) throws IOException {
         if ((docsEnumFlag != PostingsEnum.FREQS) && (docsEnumFlag != PostingsEnum.NONE)) {
             throw new IllegalArgumentException("invalid docsEnumFlag of " + docsEnumFlag);
         }
         this.docsEnumFlag = docsEnumFlag;
-        if (filter == null) {
-            // Important - need to use the doc count that includes deleted docs
-            // or we have this issue: https://github.com/elastic/elasticsearch/issues/7951
-            numDocs = reader.maxDoc();
-        }
         List<LeafReaderContext> leaves = reader.leaves();
         List<Holder> enums = new ArrayList<>(leaves.size());
         final Weight weight;
@@ -118,20 +112,12 @@ public class FilterableTermsEnum extends TermsEnum {
                 }
 
                 bits = BitSet.of(docs, context.reader().maxDoc());
-
-                // Count how many docs are in our filtered set
-                // TODO make this lazy-loaded only for those that need it?
-                numDocs += bits.cardinality();
             }
             enums.add(new Holder(termsEnum, bits));
         }
         this.enums = enums.toArray(new Holder[enums.size()]);
     }
 
-    public int getNumDocs() {
-        return numDocs;
-    }
-
     @Override
     public BytesRef term() throws IOException {
         return current;
diff --git a/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/GlobalOrdinalsSignificantTermsAggregator.java b/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/GlobalOrdinalsSignificantTermsAggregator.java
index 342b07b0007..2372886672e 100644
--- a/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/GlobalOrdinalsSignificantTermsAggregator.java
+++ b/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/GlobalOrdinalsSignificantTermsAggregator.java
@@ -91,7 +91,7 @@ public class GlobalOrdinalsSignificantTermsAggregator extends GlobalOrdinalsStri
         } else {
             size = (int) Math.min(maxBucketOrd(), bucketCountThresholds.getShardSize());
         }
-        long supersetSize = termsAggFactory.prepareBackground(context);
+        long supersetSize = termsAggFactory.getSupersetNumDocs();
         long subsetSize = numCollectedDocs;
 
         BucketSignificancePriorityQueue ordered = new BucketSignificancePriorityQueue(size);
diff --git a/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantLongTermsAggregator.java b/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantLongTermsAggregator.java
index 6c3ce5423c8..91a585812d1 100644
--- a/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantLongTermsAggregator.java
+++ b/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantLongTermsAggregator.java
@@ -79,7 +79,7 @@ public class SignificantLongTermsAggregator extends LongTermsAggregator {
 
         final int size = (int) Math.min(bucketOrds.size(), bucketCountThresholds.getShardSize());
 
-        long supersetSize = termsAggFactory.prepareBackground(context);
+        long supersetSize = termsAggFactory.getSupersetNumDocs();
         long subsetSize = numCollectedDocs;
 
         BucketSignificancePriorityQueue ordered = new BucketSignificancePriorityQueue(size);
diff --git a/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsAggregator.java b/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsAggregator.java
index 320f5165b97..c1d4e2225ea 100644
--- a/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsAggregator.java
+++ b/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsAggregator.java
@@ -78,7 +78,7 @@ public class SignificantStringTermsAggregator extends StringTermsAggregator {
         assert owningBucketOrdinal == 0;
 
         final int size = (int) Math.min(bucketOrds.size(), bucketCountThresholds.getShardSize());
-        long supersetSize = termsAggFactory.prepareBackground(context);
+        long supersetSize = termsAggFactory.getSupersetNumDocs();
         long subsetSize = numCollectedDocs;
 
         BucketSignificancePriorityQueue ordered = new BucketSignificancePriorityQueue(size);
diff --git a/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java b/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java
index ce3f83079ea..13126029b8e 100644
--- a/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java
+++ b/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java
@@ -19,12 +19,15 @@
 
 package org.elasticsearch.search.aggregations.bucket.significant;
 
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.PostingsEnum;
 import org.apache.lucene.index.Term;
-import org.apache.lucene.search.Query;
+import org.apache.lucene.search.BooleanClause.Occur;
 import org.apache.lucene.util.BytesRef;
-import org.elasticsearch.ElasticsearchException;
 import org.elasticsearch.common.ParseField;
 import org.elasticsearch.common.ParseFieldMatcher;
 import org.elasticsearch.common.lease.Releasable;
@@ -65,7 +68,8 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
 
     private MappedFieldType fieldType;
     private FilterableTermsEnum termsEnum;
     private int numberOfAggregatorsCreated;
-    private final QueryBuilder filterBuilder;
+    private final Query filter;
+    private final int supersetNumDocs;
     private final TermsAggregator.BucketCountThresholds bucketCountThresholds;
     private final SignificanceHeuristic significanceHeuristic;
@@ -76,7 +80,15 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
         super(name, type, config, context, parent, subFactoriesBuilder, metaData);
         this.includeExclude = includeExclude;
         this.executionHint = executionHint;
-        this.filterBuilder = filterBuilder;
+        this.filter = filterBuilder == null
+                ? null
+                : filterBuilder.toQuery(context.searchContext().getQueryShardContext());
+        IndexSearcher searcher = context.searchContext().searcher();
+        this.supersetNumDocs = filter == null
+                // Important - need to use the doc count that includes deleted docs
+                // or we have this issue: https://github.com/elastic/elasticsearch/issues/7951
+                ? searcher.getIndexReader().maxDoc()
+                : searcher.count(filter);
         this.bucketCountThresholds = bucketCountThresholds;
         this.significanceHeuristic = significanceHeuristic;
         this.significanceHeuristic.initialize(context.searchContext());
@@ -92,65 +104,56 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
     }
 
     /**
-     * Creates the TermsEnum (if not already created) and must be called before
-     * any calls to getBackgroundFrequency
-     *
-     * @param context
-     *            The aggregation context
-     * @return The number of documents in the index (after an optional filter
-     *         might have been applied)
+     * Get the number of docs in the superset.
      */
-    public long prepareBackground(AggregationContext context) {
+    public long getSupersetNumDocs() {
+        return supersetNumDocs;
+    }
+
+    private FilterableTermsEnum getTermsEnum(String field) throws IOException {
         if (termsEnum != null) {
-            // already prepared - return
-            return termsEnum.getNumDocs();
+            return termsEnum;
         }
-        SearchContext searchContext = context.searchContext();
-        IndexReader reader = searchContext.searcher().getIndexReader();
-        Query filter = null;
-        try {
-            if (filterBuilder != null) {
-                filter = filterBuilder.toFilter(context.searchContext().getQueryShardContext());
-            }
-        } catch (IOException e) {
-            throw new ElasticsearchException("failed to create filter: " + filterBuilder.toString(), e);
+        IndexReader reader = context.searchContext().searcher().getIndexReader();
+        if (numberOfAggregatorsCreated > 1) {
+            termsEnum = new FreqTermsEnum(reader, field, true, false, filter, context.bigArrays());
+        } else {
+            termsEnum = new FilterableTermsEnum(reader, indexedFieldName, PostingsEnum.NONE, filter);
         }
-        try {
-            if (numberOfAggregatorsCreated == 1) {
-                // Setup a termsEnum for sole use by one aggregator
-                termsEnum = new FilterableTermsEnum(reader, indexedFieldName, PostingsEnum.NONE, filter);
+        return termsEnum;
+    }
+
+    private long getBackgroundFrequency(String value) throws IOException {
+        Query query = fieldType.termQuery(value, context.searchContext().getQueryShardContext());
+        if (query instanceof TermQuery) {
+            // for types that use the inverted index, we prefer using a caching terms
+            // enum that will do a better job at reusing index inputs
+            Term term = ((TermQuery) query).getTerm();
+            FilterableTermsEnum termsEnum = getTermsEnum(term.field());
+            if (termsEnum.seekExact(term.bytes())) {
+                return termsEnum.docFreq();
             } else {
-                // When we have > 1 agg we have possibility of duplicate term
-                // frequency lookups
-                // and so use a TermsEnum that caches results of all term
-                // lookups
-                termsEnum = new FreqTermsEnum(reader, indexedFieldName, true, false, filter, searchContext.bigArrays());
+                return 0;
             }
-        } catch (IOException e) {
-            throw new ElasticsearchException("failed to build terms enumeration", e);
         }
-        return termsEnum.getNumDocs();
+        // otherwise do it the naive way
+        if (filter != null) {
+            query = new BooleanQuery.Builder()
+                    .add(query, Occur.FILTER)
+                    .add(filter, Occur.FILTER)
+                    .build();
+        }
+        return context.searchContext().searcher().count(query);
     }
 
-    public long getBackgroundFrequency(BytesRef termBytes) {
-        assert termsEnum != null; // having failed to find a field in the index
-                                  // we don't expect any calls for frequencies
-        long result = 0;
-        try {
-            if (termsEnum.seekExact(termBytes)) {
-                result = termsEnum.docFreq();
-            }
-        } catch (IOException e) {
-            throw new ElasticsearchException("IOException loading background document frequency info", e);
-        }
-        return result;
+    public long getBackgroundFrequency(BytesRef termBytes) throws IOException {
+        String value = config.format().format(termBytes);
+        return getBackgroundFrequency(value);
     }
 
-    public long getBackgroundFrequency(long value) {
-        Query query = fieldType.termQuery(value, null);
-        Term term = MappedFieldType.extractTerm(query);
-        BytesRef indexedVal = term.bytes();
-        return getBackgroundFrequency(indexedVal);
+    public long getBackgroundFrequency(long termNum) throws IOException {
+        String value = config.format().format(termNum);
+        return getBackgroundFrequency(value);
     }
 
     @Override
diff --git a/core/src/test/java/org/elasticsearch/search/aggregations/bucket/SignificantTermsIT.java b/core/src/test/java/org/elasticsearch/search/aggregations/bucket/SignificantTermsIT.java
index f3ca8aa0b7b..587e44b21b2 100644
--- a/core/src/test/java/org/elasticsearch/search/aggregations/bucket/SignificantTermsIT.java
+++ b/core/src/test/java/org/elasticsearch/search/aggregations/bucket/SignificantTermsIT.java
@@ -19,8 +19,10 @@
 package org.elasticsearch.search.aggregations.bucket;
 
 import org.elasticsearch.action.admin.indices.refresh.RefreshRequest;
+import org.elasticsearch.action.search.SearchPhaseExecutionException;
 import org.elasticsearch.action.search.SearchResponse;
 import org.elasticsearch.action.search.SearchType;
+import org.elasticsearch.cluster.metadata.IndexMetaData;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.query.QueryBuilders;
 import org.elasticsearch.index.query.TermQueryBuilder;
@@ -47,6 +49,7 @@ import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcke
 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertSearchResponse;
 import static org.elasticsearch.search.aggregations.AggregationBuilders.significantTerms;
 import static org.elasticsearch.search.aggregations.AggregationBuilders.terms;
+import static org.hamcrest.Matchers.containsString;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.hasSize;
 import static org.hamcrest.Matchers.is;
@@ -69,15 +72,15 @@ public class SignificantTermsIT extends ESIntegTestCase {
                 .build();
     }
 
-    public static final String MUSIC_CATEGORY="1";
-    public static final String OTHER_CATEGORY="2";
-    public static final String SNOWBOARDING_CATEGORY="3";
+    public static final int MUSIC_CATEGORY=1;
+    public static final int OTHER_CATEGORY=2;
+    public static final int SNOWBOARDING_CATEGORY=3;
 
     @Override
     public void setupSuiteScopeCluster() throws Exception {
         assertAcked(prepareCreate("test").setSettings(SETTING_NUMBER_OF_SHARDS, 5, SETTING_NUMBER_OF_REPLICAS, 0).addMapping("fact",
                 "_routing", "required=true", "routing_id", "type=keyword", "fact_category",
-                "type=keyword,index=true", "description", "type=text,fielddata=true"));
+                "type=integer", "description", "type=text,fielddata=true"));
         createIndex("idx_unmapped");
         ensureGreen();
 
@@ -110,6 +113,15 @@ public class SignificantTermsIT extends ESIntegTestCase {
                     .setSource("fact_category", parts[1], "description", parts[2]).get();
         }
         client().admin().indices().refresh(new RefreshRequest("test")).get();
+
+        assertAcked(prepareCreate("test_not_indexed")
+                .setSettings(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
+                .addMapping("type",
+                        "my_keyword", "type=keyword,index=false",
+                        "my_long", "type=long,index=false"));
+        indexRandom(true,
+                client().prepareIndex("test_not_indexed", "type", "1").setSource(
+                        "my_keyword", "foo", "my_long", 42));
     }
 
     public void testStructuredAnalysis() throws Exception {
@@ -123,12 +135,12 @@ public class SignificantTermsIT extends ESIntegTestCase {
                 .actionGet();
         assertSearchResponse(response);
         SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
-        String topCategory = (String) topTerms.getBuckets().iterator().next().getKey();
-        assertTrue(topCategory.equals(SNOWBOARDING_CATEGORY));
+        Number topCategory = (Number) topTerms.getBuckets().iterator().next().getKey();
+        assertTrue(topCategory.equals(Long.valueOf(SNOWBOARDING_CATEGORY)));
     }
 
     public void testStructuredAnalysisWithIncludeExclude() throws Exception {
-        String[] excludeTerms = { MUSIC_CATEGORY };
+        long[] excludeTerms = { MUSIC_CATEGORY };
         SearchResponse response = client().prepareSearch("test")
                 .setSearchType(SearchType.QUERY_AND_FETCH)
                 .setQuery(new TermQueryBuilder("_all", "paul"))
@@ -139,8 +151,8 @@ public class SignificantTermsIT extends ESIntegTestCase {
                 .actionGet();
         assertSearchResponse(response);
         SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
-        String topCategory = topTerms.getBuckets().iterator().next().getKeyAsString();
-        assertTrue(topCategory.equals(OTHER_CATEGORY));
+        Number topCategory = (Number) topTerms.getBuckets().iterator().next().getKey();
+        assertTrue(topCategory.equals(Long.valueOf(OTHER_CATEGORY)));
     }
 
     public void testIncludeExclude() throws Exception {
@@ -422,4 +434,18 @@ public class SignificantTermsIT extends ESIntegTestCase {
         SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
         checkExpectedStringTermsFound(topTerms);
     }
+
+    public void testFailIfFieldNotIndexed() {
+        SearchPhaseExecutionException e = expectThrows(SearchPhaseExecutionException.class,
+                () -> client().prepareSearch("test_not_indexed").addAggregation(
+                        significantTerms("mySignificantTerms").field("my_keyword")).get());
+        assertThat(e.toString(),
+                containsString("Cannot search on field [my_keyword] since it is not indexed."));
+
+        e = expectThrows(SearchPhaseExecutionException.class,
+                () -> client().prepareSearch("test_not_indexed").addAggregation(
+                        significantTerms("mySignificantTerms").field("my_long")).get());
+        assertThat(e.toString(),
+                containsString("Cannot search on field [my_long] since it is not indexed."));
+    }
 }
diff --git a/core/src/test/java/org/elasticsearch/search/aggregations/bucket/SignificantTermsSignificanceScoreIT.java b/core/src/test/java/org/elasticsearch/search/aggregations/bucket/SignificantTermsSignificanceScoreIT.java
index 67e6d209ad5..0467992002f 100644
--- a/core/src/test/java/org/elasticsearch/search/aggregations/bucket/SignificantTermsSignificanceScoreIT.java
+++ b/core/src/test/java/org/elasticsearch/search/aggregations/bucket/SignificantTermsSignificanceScoreIT.java
@@ -97,7 +97,7 @@ public class SignificantTermsSignificanceScoreIT extends ESIntegTestCase {
     }
 
     public void testPlugin() throws Exception {
-        String type = randomBoolean() ? "text" : "keyword";
+        String type = randomBoolean() ? "text" : "long";
         String settings = "{\"index.number_of_shards\": 1, \"index.number_of_replicas\": 0}";
         SharedSignificantTermsTestMethods.index01Docs(type, settings, this);
         SearchResponse response = client().prepareSearch(INDEX_NAME).setTypes(DOC_TYPE)
@@ -250,7 +250,7 @@ public class SignificantTermsSignificanceScoreIT extends ESIntegTestCase {
     }
 
     public void testXContentResponse() throws Exception {
-        String type = randomBoolean() ? "text" : "keyword";
+        String type = randomBoolean() ? "text" : "long";
"text" : "long"; String settings = "{\"index.number_of_shards\": 1, \"index.number_of_replicas\": 0}"; SharedSignificantTermsTestMethods.index01Docs(type, settings, this); SearchResponse response = client().prepareSearch(INDEX_NAME).setTypes(DOC_TYPE) @@ -272,7 +272,12 @@ public class SignificantTermsSignificanceScoreIT extends ESIntegTestCase { XContentBuilder responseBuilder = XContentFactory.jsonBuilder(); classes.toXContent(responseBuilder, null); - String result = "\"class\"{\"doc_count_error_upper_bound\":0,\"sum_other_doc_count\":0,\"buckets\":[{\"key\":\"0\",\"doc_count\":4,\"sig_terms\":{\"doc_count\":4,\"buckets\":[{\"key\":\"0\",\"doc_count\":4,\"score\":0.39999999999999997,\"bg_count\":5}]}},{\"key\":\"1\",\"doc_count\":3,\"sig_terms\":{\"doc_count\":3,\"buckets\":[{\"key\":\"1\",\"doc_count\":3,\"score\":0.75,\"bg_count\":4}]}}]}"; + String result = null; + if (type.equals("long")) { + result = "\"class\"{\"doc_count_error_upper_bound\":0,\"sum_other_doc_count\":0,\"buckets\":[{\"key\":\"0\",\"doc_count\":4,\"sig_terms\":{\"doc_count\":4,\"buckets\":[{\"key\":0,\"doc_count\":4,\"score\":0.39999999999999997,\"bg_count\":5}]}},{\"key\":\"1\",\"doc_count\":3,\"sig_terms\":{\"doc_count\":3,\"buckets\":[{\"key\":1,\"doc_count\":3,\"score\":0.75,\"bg_count\":4}]}}]}"; + } else { + result = "\"class\"{\"doc_count_error_upper_bound\":0,\"sum_other_doc_count\":0,\"buckets\":[{\"key\":\"0\",\"doc_count\":4,\"sig_terms\":{\"doc_count\":4,\"buckets\":[{\"key\":\"0\",\"doc_count\":4,\"score\":0.39999999999999997,\"bg_count\":5}]}},{\"key\":\"1\",\"doc_count\":3,\"sig_terms\":{\"doc_count\":3,\"buckets\":[{\"key\":\"1\",\"doc_count\":3,\"score\":0.75,\"bg_count\":4}]}}]}"; + } assertThat(responseBuilder.string(), equalTo(result)); } @@ -321,7 +326,7 @@ public class SignificantTermsSignificanceScoreIT extends ESIntegTestCase { } public void testBackgroundVsSeparateSet() throws Exception { - String type = randomBoolean() ? "text" : "keyword"; + String type = randomBoolean() ? "text" : "long"; String settings = "{\"index.number_of_shards\": 1, \"index.number_of_replicas\": 0}"; SharedSignificantTermsTestMethods.index01Docs(type, settings, this); testBackgroundVsSeparateSet(new MutualInformation(true, true), new MutualInformation(true, false)); @@ -448,7 +453,7 @@ public class SignificantTermsSignificanceScoreIT extends ESIntegTestCase { } public void testScriptScore() throws ExecutionException, InterruptedException, IOException { - indexRandomFrequencies01(randomBoolean() ? "text" : "keyword"); + indexRandomFrequencies01(randomBoolean() ? 
"text" : "long"); ScriptHeuristic scriptHeuristic = getScriptSignificanceHeuristic(); ensureYellow(); SearchResponse response = client().prepareSearch(INDEX_NAME) diff --git a/core/src/test/java/org/elasticsearch/search/aggregations/bucket/TermsShardMinDocCountIT.java b/core/src/test/java/org/elasticsearch/search/aggregations/bucket/TermsShardMinDocCountIT.java index 408a9137eb8..dad55a20828 100644 --- a/core/src/test/java/org/elasticsearch/search/aggregations/bucket/TermsShardMinDocCountIT.java +++ b/core/src/test/java/org/elasticsearch/search/aggregations/bucket/TermsShardMinDocCountIT.java @@ -53,7 +53,7 @@ public class TermsShardMinDocCountIT extends ESIntegTestCase { public void testShardMinDocCountSignificantTermsTest() throws Exception { String textMappings; if (randomBoolean()) { - textMappings = "type=keyword"; + textMappings = "type=long"; } else { textMappings = "type=text,fielddata=true"; } diff --git a/docs/reference/migration/migrate_5_0/aggregations.asciidoc b/docs/reference/migration/migrate_5_0/aggregations.asciidoc index 32a8bbd35d2..60c1060f159 100644 --- a/docs/reference/migration/migrate_5_0/aggregations.asciidoc +++ b/docs/reference/migration/migrate_5_0/aggregations.asciidoc @@ -5,9 +5,11 @@ Numeric fields have been refactored to use a different data structure that performs better for range queries. However, since this data structure does -not record document frequencies, numeric fields can no longer be used for -significant terms aggregations. It is recommended to use <> -fields instead, either directly or through a <> -if the numeric representation is still needed for sorting, range queries or -numeric aggregations like +not record document frequencies, numeric fields need to fall back to running +queries in order to estimate the number of matching documents in the +background set, which may incur a performance degradation. + +It is recommended to use <> fields instead, either directly +or through a <> if the numeric representation is +still needed for sorting, range queries or numeric aggregations like <>.