Make significant terms work on fields that are indexed with points. #18031

It keeps using the caching terms enum for keyword/text fields and falls back
to IndexSearcher.count for fields that do not use the inverted index for
searching (such as numbers and IP addresses). Note that this probably makes
significant terms aggregations on these fields less efficient than they used
to be. They should still be acceptable under a sampler aggregation though.
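As a standalone illustration (the class and method names below are invented
for the example; only the BooleanQuery/IndexSearcher.count pattern is taken
from this change), the fallback boils down to:

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;

// Estimates the background frequency of a term for a field that has no
// inverted index by counting matching documents instead of reading postings.
class BackgroundFrequencyFallback {

    private final IndexSearcher searcher;
    private final Query backgroundFilter; // null means "whole index"

    BackgroundFrequencyFallback(IndexReader reader, Query backgroundFilter) {
        this.searcher = new IndexSearcher(reader);
        this.backgroundFilter = backgroundFilter;
    }

    long backgroundFrequency(Query termQuery) throws IOException {
        Query query = termQuery;
        if (backgroundFilter != null) {
            // FILTER clauses do not contribute scores, which lets the
            // searcher use cheap counting strategies where possible.
            query = new BooleanQuery.Builder()
                    .add(query, Occur.FILTER)
                    .add(backgroundFilter, Occur.FILTER)
                    .build();
        }
        return searcher.count(query);
    }
}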

This moves tests back to the state they were in before numbers started using
points, and also adds a new test verifying that significant terms
aggregations fail if a field is not indexed.

In the long term, we might want to follow the approach that Robert initially
proposed, which consists of collecting all documents that match the
background filter in order to compute frequencies using doc values. This
would also mean that significant terms aggregations no longer require fields
to be indexed.
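A rough sketch of that idea (written against more recent Lucene APIs than
this code base, with illustrative names; a real implementation would tally
per-segment ordinals and merge them through global ordinals rather than
materialize a map of terms):

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Weight;
import org.apache.lucene.util.BytesRef;

// Tallies how often each term of `field` occurs in documents matching the
// background filter, reading doc values rather than the inverted index.
class DocValuesBackgroundFrequencies {

    static Map<BytesRef, Long> frequencies(IndexSearcher searcher, Query backgroundFilter,
            String field) throws IOException {
        Map<BytesRef, Long> freqs = new HashMap<>();
        Query rewritten = searcher.rewrite(backgroundFilter);
        Weight weight = searcher.createWeight(rewritten, ScoreMode.COMPLETE_NO_SCORES, 1f);
        for (LeafReaderContext leaf : searcher.getIndexReader().leaves()) {
            Scorer scorer = weight.scorer(leaf);
            if (scorer == null) {
                continue; // no matching documents in this segment
            }
            SortedSetDocValues values = DocValues.getSortedSet(leaf.reader(), field);
            DocIdSetIterator docs = scorer.iterator();
            for (int doc = docs.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = docs.nextDoc()) {
                if (values.advanceExact(doc)) {
                    for (long ord = values.nextOrd(); ord != SortedSetDocValues.NO_MORE_ORDS; ord = values.nextOrd()) {
                        freqs.merge(BytesRef.deepCopyOf(values.lookupOrd(ord)), 1L, Long::sum);
                    }
                }
            }
        }
        return freqs;
    }
}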
Adrien Grand 2016-04-27 20:59:28 +02:00
parent e4edee5d9a
commit 866a5459f0
9 changed files with 111 additions and 89 deletions

View File

@@ -66,18 +66,12 @@ public class FilterableTermsEnum extends TermsEnum {
     protected long currentTotalTermFreq = 0;
     protected BytesRef current;
     protected final int docsEnumFlag;
-    protected int numDocs;

     public FilterableTermsEnum(IndexReader reader, String field, int docsEnumFlag, @Nullable Query filter) throws IOException {
         if ((docsEnumFlag != PostingsEnum.FREQS) && (docsEnumFlag != PostingsEnum.NONE)) {
             throw new IllegalArgumentException("invalid docsEnumFlag of " + docsEnumFlag);
         }
         this.docsEnumFlag = docsEnumFlag;
-        if (filter == null) {
-            // Important - need to use the doc count that includes deleted docs
-            // or we have this issue: https://github.com/elastic/elasticsearch/issues/7951
-            numDocs = reader.maxDoc();
-        }
         List<LeafReaderContext> leaves = reader.leaves();
         List<Holder> enums = new ArrayList<>(leaves.size());
         final Weight weight;
@@ -118,20 +112,12 @@ public class FilterableTermsEnum extends TermsEnum {
                 }
                 bits = BitSet.of(docs, context.reader().maxDoc());
-
-                // Count how many docs are in our filtered set
-                // TODO make this lazy-loaded only for those that need it?
-                numDocs += bits.cardinality();
             }
             enums.add(new Holder(termsEnum, bits));
         }
         this.enums = enums.toArray(new Holder[enums.size()]);
     }
-
-    public int getNumDocs() {
-        return numDocs;
-    }

     @Override
     public BytesRef term() throws IOException {
         return current;

View File

@@ -91,7 +91,7 @@ public class GlobalOrdinalsSignificantTermsAggregator extends GlobalOrdinalsStri
         } else {
             size = (int) Math.min(maxBucketOrd(), bucketCountThresholds.getShardSize());
         }
-        long supersetSize = termsAggFactory.prepareBackground(context);
+        long supersetSize = termsAggFactory.getSupersetNumDocs();
         long subsetSize = numCollectedDocs;

         BucketSignificancePriorityQueue ordered = new BucketSignificancePriorityQueue(size);

View File

@@ -79,7 +79,7 @@ public class SignificantLongTermsAggregator extends LongTermsAggregator {

         final int size = (int) Math.min(bucketOrds.size(), bucketCountThresholds.getShardSize());
-        long supersetSize = termsAggFactory.prepareBackground(context);
+        long supersetSize = termsAggFactory.getSupersetNumDocs();
         long subsetSize = numCollectedDocs;

         BucketSignificancePriorityQueue ordered = new BucketSignificancePriorityQueue(size);

View File

@@ -78,7 +78,7 @@ public class SignificantStringTermsAggregator extends StringTermsAggregator {
         assert owningBucketOrdinal == 0;

         final int size = (int) Math.min(bucketOrds.size(), bucketCountThresholds.getShardSize());
-        long supersetSize = termsAggFactory.prepareBackground(context);
+        long supersetSize = termsAggFactory.getSupersetNumDocs();
         long subsetSize = numCollectedDocs;

         BucketSignificancePriorityQueue ordered = new BucketSignificancePriorityQueue(size);

View File

@@ -19,12 +19,15 @@
 package org.elasticsearch.search.aggregations.bucket.significant;

+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.PostingsEnum;
 import org.apache.lucene.index.Term;
-import org.apache.lucene.search.Query;
+import org.apache.lucene.search.BooleanClause.Occur;
 import org.apache.lucene.util.BytesRef;
-import org.elasticsearch.ElasticsearchException;
 import org.elasticsearch.common.ParseField;
 import org.elasticsearch.common.ParseFieldMatcher;
 import org.elasticsearch.common.lease.Releasable;
@@ -65,7 +68,8 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
     private MappedFieldType fieldType;
     private FilterableTermsEnum termsEnum;
     private int numberOfAggregatorsCreated;
-    private final QueryBuilder filterBuilder;
+    private final Query filter;
+    private final int supersetNumDocs;
     private final TermsAggregator.BucketCountThresholds bucketCountThresholds;
     private final SignificanceHeuristic significanceHeuristic;
@@ -76,7 +80,15 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
         super(name, type, config, context, parent, subFactoriesBuilder, metaData);
         this.includeExclude = includeExclude;
         this.executionHint = executionHint;
-        this.filterBuilder = filterBuilder;
+        this.filter = filterBuilder == null
+                ? null
+                : filterBuilder.toQuery(context.searchContext().getQueryShardContext());
+        IndexSearcher searcher = context.searchContext().searcher();
+        this.supersetNumDocs = filter == null
+                // Important - need to use the doc count that includes deleted docs
+                // or we have this issue: https://github.com/elastic/elasticsearch/issues/7951
+                ? searcher.getIndexReader().maxDoc()
+                : searcher.count(filter);
         this.bucketCountThresholds = bucketCountThresholds;
         this.significanceHeuristic = significanceHeuristic;
         this.significanceHeuristic.initialize(context.searchContext());
@@ -92,65 +104,56 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
     }

     /**
-     * Creates the TermsEnum (if not already created) and must be called before
-     * any calls to getBackgroundFrequency
-     *
-     * @param context
-     *            The aggregation context
-     * @return The number of documents in the index (after an optional filter
-     *         might have been applied)
+     * Get the number of docs in the superset.
      */
-    public long prepareBackground(AggregationContext context) {
+    public long getSupersetNumDocs() {
+        return supersetNumDocs;
+    }
+
+    private FilterableTermsEnum getTermsEnum(String field) throws IOException {
         if (termsEnum != null) {
-            // already prepared - return
-            return termsEnum.getNumDocs();
+            return termsEnum;
         }
-        SearchContext searchContext = context.searchContext();
-        IndexReader reader = searchContext.searcher().getIndexReader();
-        Query filter = null;
-        try {
-            if (filterBuilder != null) {
-                filter = filterBuilder.toFilter(context.searchContext().getQueryShardContext());
-            }
-        } catch (IOException e) {
-            throw new ElasticsearchException("failed to create filter: " + filterBuilder.toString(), e);
+        IndexReader reader = context.searchContext().searcher().getIndexReader();
+        if (numberOfAggregatorsCreated > 1) {
+            termsEnum = new FreqTermsEnum(reader, field, true, false, filter, context.bigArrays());
+        } else {
+            termsEnum = new FilterableTermsEnum(reader, indexedFieldName, PostingsEnum.NONE, filter);
         }
-        try {
-            if (numberOfAggregatorsCreated == 1) {
-                // Setup a termsEnum for sole use by one aggregator
-                termsEnum = new FilterableTermsEnum(reader, indexedFieldName, PostingsEnum.NONE, filter);
+        return termsEnum;
+    }
+
+    private long getBackgroundFrequency(String value) throws IOException {
+        Query query = fieldType.termQuery(value, context.searchContext().getQueryShardContext());
+        if (query instanceof TermQuery) {
+            // for types that use the inverted index, we prefer using a caching terms
+            // enum that will do a better job at reusing index inputs
+            Term term = ((TermQuery) query).getTerm();
+            FilterableTermsEnum termsEnum = getTermsEnum(term.field());
+            if (termsEnum.seekExact(term.bytes())) {
+                return termsEnum.docFreq();
             } else {
-                // When we have > 1 agg we have possibility of duplicate term
-                // frequency lookups
-                // and so use a TermsEnum that caches results of all term
-                // lookups
-                termsEnum = new FreqTermsEnum(reader, indexedFieldName, true, false, filter, searchContext.bigArrays());
+                return 0;
             }
-        } catch (IOException e) {
-            throw new ElasticsearchException("failed to build terms enumeration", e);
         }
-        return termsEnum.getNumDocs();
+        // otherwise do it the naive way
+        if (filter != null) {
+            query = new BooleanQuery.Builder()
+                    .add(query, Occur.FILTER)
+                    .add(filter, Occur.FILTER)
+                    .build();
+        }
+        return context.searchContext().searcher().count(query);
     }

-    public long getBackgroundFrequency(BytesRef termBytes) {
-        assert termsEnum != null; // having failed to find a field in the index
-                                  // we don't expect any calls for frequencies
-        long result = 0;
-        try {
-            if (termsEnum.seekExact(termBytes)) {
-                result = termsEnum.docFreq();
-            }
-        } catch (IOException e) {
-            throw new ElasticsearchException("IOException loading background document frequency info", e);
-        }
-        return result;
+    public long getBackgroundFrequency(BytesRef termBytes) throws IOException {
+        String value = config.format().format(termBytes);
+        return getBackgroundFrequency(value);
     }

-    public long getBackgroundFrequency(long value) {
-        Query query = fieldType.termQuery(value, null);
-        Term term = MappedFieldType.extractTerm(query);
-        BytesRef indexedVal = term.bytes();
-        return getBackgroundFrequency(indexedVal);
+    public long getBackgroundFrequency(long termNum) throws IOException {
+        String value = config.format().format(termNum);
+        return getBackgroundFrequency(value);
     }

     @Override

View File

@@ -19,8 +19,10 @@
 package org.elasticsearch.search.aggregations.bucket;

 import org.elasticsearch.action.admin.indices.refresh.RefreshRequest;
+import org.elasticsearch.action.search.SearchPhaseExecutionException;
 import org.elasticsearch.action.search.SearchResponse;
 import org.elasticsearch.action.search.SearchType;
+import org.elasticsearch.cluster.metadata.IndexMetaData;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.query.QueryBuilders;
 import org.elasticsearch.index.query.TermQueryBuilder;
@@ -47,6 +49,7 @@ import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcke
 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertSearchResponse;
 import static org.elasticsearch.search.aggregations.AggregationBuilders.significantTerms;
 import static org.elasticsearch.search.aggregations.AggregationBuilders.terms;
+import static org.hamcrest.Matchers.containsString;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.hasSize;
 import static org.hamcrest.Matchers.is;
@@ -69,15 +72,15 @@ public class SignificantTermsIT extends ESIntegTestCase {
                 .build();
     }

-    public static final String MUSIC_CATEGORY="1";
-    public static final String OTHER_CATEGORY="2";
-    public static final String SNOWBOARDING_CATEGORY="3";
+    public static final int MUSIC_CATEGORY=1;
+    public static final int OTHER_CATEGORY=2;
+    public static final int SNOWBOARDING_CATEGORY=3;

     @Override
     public void setupSuiteScopeCluster() throws Exception {
         assertAcked(prepareCreate("test").setSettings(SETTING_NUMBER_OF_SHARDS, 5, SETTING_NUMBER_OF_REPLICAS, 0).addMapping("fact",
                 "_routing", "required=true", "routing_id", "type=keyword", "fact_category",
-                "type=keyword,index=true", "description", "type=text,fielddata=true"));
+                "type=integer", "description", "type=text,fielddata=true"));
         createIndex("idx_unmapped");
         ensureGreen();
@@ -110,6 +113,15 @@ public class SignificantTermsIT extends ESIntegTestCase {
                     .setSource("fact_category", parts[1], "description", parts[2]).get();
         }
         client().admin().indices().refresh(new RefreshRequest("test")).get();
+
+        assertAcked(prepareCreate("test_not_indexed")
+                .setSettings(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
+                .addMapping("type",
+                        "my_keyword", "type=keyword,index=false",
+                        "my_long", "type=long,index=false"));
+        indexRandom(true,
+                client().prepareIndex("test_not_indexed", "type", "1").setSource(
+                        "my_keyword", "foo", "my_long", 42));
     }

     public void testStructuredAnalysis() throws Exception {
@@ -123,12 +135,12 @@ public class SignificantTermsIT extends ESIntegTestCase {
                 .actionGet();
         assertSearchResponse(response);
         SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
-        String topCategory = (String) topTerms.getBuckets().iterator().next().getKey();
-        assertTrue(topCategory.equals(SNOWBOARDING_CATEGORY));
+        Number topCategory = (Number) topTerms.getBuckets().iterator().next().getKey();
+        assertTrue(topCategory.equals(Long.valueOf(SNOWBOARDING_CATEGORY)));
     }

     public void testStructuredAnalysisWithIncludeExclude() throws Exception {
-        String[] excludeTerms = { MUSIC_CATEGORY };
+        long[] excludeTerms = { MUSIC_CATEGORY };
         SearchResponse response = client().prepareSearch("test")
                 .setSearchType(SearchType.QUERY_AND_FETCH)
                 .setQuery(new TermQueryBuilder("_all", "paul"))
@@ -139,8 +151,8 @@ public class SignificantTermsIT extends ESIntegTestCase {
                 .actionGet();
         assertSearchResponse(response);
         SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
-        String topCategory = topTerms.getBuckets().iterator().next().getKeyAsString();
-        assertTrue(topCategory.equals(OTHER_CATEGORY));
+        Number topCategory = (Number) topTerms.getBuckets().iterator().next().getKey();
+        assertTrue(topCategory.equals(Long.valueOf(OTHER_CATEGORY)));
     }

     public void testIncludeExclude() throws Exception {
@@ -422,4 +434,18 @@ public class SignificantTermsIT extends ESIntegTestCase {
         SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
         checkExpectedStringTermsFound(topTerms);
     }
+
+    public void testFailIfFieldNotIndexed() {
+        SearchPhaseExecutionException e = expectThrows(SearchPhaseExecutionException.class,
+                () -> client().prepareSearch("test_not_indexed").addAggregation(
+                        significantTerms("mySignificantTerms").field("my_keyword")).get());
+        assertThat(e.toString(),
+                containsString("Cannot search on field [my_keyword] since it is not indexed."));
+
+        e = expectThrows(SearchPhaseExecutionException.class,
+                () -> client().prepareSearch("test_not_indexed").addAggregation(
+                        significantTerms("mySignificantTerms").field("my_long")).get());
+        assertThat(e.toString(),
+                containsString("Cannot search on field [my_long] since it is not indexed."));
+    }
 }

View File

@@ -97,7 +97,7 @@ public class SignificantTermsSignificanceScoreIT extends ESIntegTestCase {
     }

     public void testPlugin() throws Exception {
-        String type = randomBoolean() ? "text" : "keyword";
+        String type = randomBoolean() ? "text" : "long";
         String settings = "{\"index.number_of_shards\": 1, \"index.number_of_replicas\": 0}";
         SharedSignificantTermsTestMethods.index01Docs(type, settings, this);
         SearchResponse response = client().prepareSearch(INDEX_NAME).setTypes(DOC_TYPE)
@@ -250,7 +250,7 @@ public class SignificantTermsSignificanceScoreIT extends ESIntegTestCase {
     }

     public void testXContentResponse() throws Exception {
-        String type = randomBoolean() ? "text" : "keyword";
+        String type = randomBoolean() ? "text" : "long";
         String settings = "{\"index.number_of_shards\": 1, \"index.number_of_replicas\": 0}";
         SharedSignificantTermsTestMethods.index01Docs(type, settings, this);
         SearchResponse response = client().prepareSearch(INDEX_NAME).setTypes(DOC_TYPE)
@@ -272,7 +272,12 @@ public class SignificantTermsSignificanceScoreIT extends ESIntegTestCase {
         XContentBuilder responseBuilder = XContentFactory.jsonBuilder();
         classes.toXContent(responseBuilder, null);
-        String result = "\"class\"{\"doc_count_error_upper_bound\":0,\"sum_other_doc_count\":0,\"buckets\":[{\"key\":\"0\",\"doc_count\":4,\"sig_terms\":{\"doc_count\":4,\"buckets\":[{\"key\":\"0\",\"doc_count\":4,\"score\":0.39999999999999997,\"bg_count\":5}]}},{\"key\":\"1\",\"doc_count\":3,\"sig_terms\":{\"doc_count\":3,\"buckets\":[{\"key\":\"1\",\"doc_count\":3,\"score\":0.75,\"bg_count\":4}]}}]}";
+        String result = null;
+        if (type.equals("long")) {
+            result = "\"class\"{\"doc_count_error_upper_bound\":0,\"sum_other_doc_count\":0,\"buckets\":[{\"key\":\"0\",\"doc_count\":4,\"sig_terms\":{\"doc_count\":4,\"buckets\":[{\"key\":0,\"doc_count\":4,\"score\":0.39999999999999997,\"bg_count\":5}]}},{\"key\":\"1\",\"doc_count\":3,\"sig_terms\":{\"doc_count\":3,\"buckets\":[{\"key\":1,\"doc_count\":3,\"score\":0.75,\"bg_count\":4}]}}]}";
+        } else {
+            result = "\"class\"{\"doc_count_error_upper_bound\":0,\"sum_other_doc_count\":0,\"buckets\":[{\"key\":\"0\",\"doc_count\":4,\"sig_terms\":{\"doc_count\":4,\"buckets\":[{\"key\":\"0\",\"doc_count\":4,\"score\":0.39999999999999997,\"bg_count\":5}]}},{\"key\":\"1\",\"doc_count\":3,\"sig_terms\":{\"doc_count\":3,\"buckets\":[{\"key\":\"1\",\"doc_count\":3,\"score\":0.75,\"bg_count\":4}]}}]}";
+        }
         assertThat(responseBuilder.string(), equalTo(result));
     }
@@ -321,7 +326,7 @@ public class SignificantTermsSignificanceScoreIT extends ESIntegTestCase {
     }

     public void testBackgroundVsSeparateSet() throws Exception {
-        String type = randomBoolean() ? "text" : "keyword";
+        String type = randomBoolean() ? "text" : "long";
         String settings = "{\"index.number_of_shards\": 1, \"index.number_of_replicas\": 0}";
         SharedSignificantTermsTestMethods.index01Docs(type, settings, this);
         testBackgroundVsSeparateSet(new MutualInformation(true, true), new MutualInformation(true, false));
@@ -448,7 +453,7 @@ public class SignificantTermsSignificanceScoreIT extends ESIntegTestCase {
     }

     public void testScriptScore() throws ExecutionException, InterruptedException, IOException {
-        indexRandomFrequencies01(randomBoolean() ? "text" : "keyword");
+        indexRandomFrequencies01(randomBoolean() ? "text" : "long");
         ScriptHeuristic scriptHeuristic = getScriptSignificanceHeuristic();
         ensureYellow();
         SearchResponse response = client().prepareSearch(INDEX_NAME)

View File

@@ -53,7 +53,7 @@ public class TermsShardMinDocCountIT extends ESIntegTestCase {
     public void testShardMinDocCountSignificantTermsTest() throws Exception {
         String textMappings;
         if (randomBoolean()) {
-            textMappings = "type=keyword";
+            textMappings = "type=long";
         } else {
             textMappings = "type=text,fielddata=true";
         }

View File

@@ -5,9 +5,11 @@

 Numeric fields have been refactored to use a different data structure that
 performs better for range queries. However, since this data structure does
-not record document frequencies, numeric fields can no longer be used for
-significant terms aggregations. It is recommended to use <<keyword,`keyword`>>
-fields instead, either directly or through a <<multi-fields,multi-field>>
-if the numeric representation is still needed for sorting, range queries or
-numeric aggregations like
+not record document frequencies, numeric fields need to fall back to running
+queries in order to estimate the number of matching documents in the
+background set, which may incur a performance degradation.
+
+It is recommended to use <<keyword,`keyword`>> fields instead, either directly
+or through a <<multi-fields,multi-field>> if the numeric representation is
+still needed for sorting, range queries or numeric aggregations like
 <<search-aggregations-metrics-stats-aggregation,`stats` aggregations>>.
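For example, a multi-field mapping along these lines (index, type and field
names are illustrative) keeps the numeric representation while also indexing
the values as a keyword that significant terms can use efficiently:

PUT my_index
{
  "mappings": {
    "my_type": {
      "properties": {
        "product_id": {
          "type": "long",
          "fields": {
            "keyword": {
              "type": "keyword"
            }
          }
        }
      }
    }
  }
}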