Significant_terms agg: added option for a background_filter to define background context for analysis of term frequencies
Closes #5944
This commit is contained in:
parent
3484ca3737
commit
1e560b0d92
|
@ -233,13 +233,19 @@ free-text field and use them in a `terms` query on the same field with a `highli
|
|||
are presented unstemmed, highlighted, with the right case, in the right order and with some context, their significance/meaning is more readily apparent.
|
||||
============
|
||||
|
||||
==== Limitations
|
||||
==== Custom background sets
|
||||
added[1.2.0]
|
||||
|
||||
===== Single _background_ comparison base
|
||||
The above examples show how to select the _foreground_ set for analysis using a query or parent aggregation to filter but currently there is no means of specifying
|
||||
a _background_ set other than the index from which all results are ultimately drawn. Sometimes it may prove useful to use a different
|
||||
background set as the basis for comparisons e.g. to first select the tweets for the TV show "XFactor" and then look
|
||||
for significant terms in a subset of that content which is from this week.
|
||||
|
||||
Ordinarily, the foreground set of documents is "diffed" against a background set of all the documents in your index.
|
||||
However, sometimes it may prove useful to use a narrower background set as the basis for comparisons.
|
||||
For example, a query on documents relating to "Madrid" in an index with content from all over the world might reveal that "Spanish"
|
||||
was a significant term. This may be true, but if you want more focused terms you could use a `background_filter`
|
||||
on the term 'spain' to establish a narrower set of documents as context. With this as a background "Spanish" would now
|
||||
be seen as commonplace and therefore not as significant as words like "capital" that relate more strongly to Madrid.
|
||||
Note that using a background filter will slow things down - each term's background frequency must now be derived on-the-fly from filtering posting lists rather than reading the index's pre-computed count for a term.
|
||||
|
||||
==== Limitations
|
||||
|
||||
===== Significant terms must be indexed values
|
||||
Unlike the terms aggregation it is currently not possible to use script-generated terms for counting purposes.
|
||||
|
@ -337,6 +343,37 @@ WARNING: Setting `min_doc_count` to `1` is generally not advised as it tends to
|
|||
|
||||
|
||||
|
||||
===== Custom background context
|
||||
|
||||
The default source of statistical information for background term frequencies is the entire index and this
|
||||
scope can be narrowed through the use of a `background_filter` to focus in on significant terms within a narrower
|
||||
context:
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
{
|
||||
"query" : {
|
||||
"match" : { "text" : "madrid" }
|
||||
},
|
||||
"aggs" : {
|
||||
"tags" : {
|
||||
"significant_terms" : {
|
||||
"field" : "tag",
|
||||
"background_filter": {
|
||||
"term" : { "text" : "spain"}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
--------------------------------------------------
|
||||
|
||||
The above filter would help focus in on terms that were peculiar to the city of Madrid rather than revealing
|
||||
terms like "Spanish" that are unusual in the full index's worldwide context but commonplace in the subset of documents containing the
|
||||
word "Spain".
|
||||
|
||||
WARNING: Use of background filters will slow the query as each term's postings must be filtered to determine a frequency.
|
||||
|
||||
|
||||
===== Filtering Values
|
||||
|
||||
|
|
|
@ -97,7 +97,6 @@ public class GlobalOrdinalsSignificantTermsAggregator extends GlobalOrdinalsStri
|
|||
spare.subsetSize = subsetSize;
|
||||
spare.supersetDf = termsAggFactory.getBackgroundFrequency(spare.termBytes);
|
||||
spare.supersetSize = supersetSize;
|
||||
assert spare.subsetDf <= spare.supersetDf;
|
||||
// During shard-local down-selection we use subset/superset stats
|
||||
// that are for this shard only
|
||||
// Back at the central reducer these properties will be updated with
|
||||
|
|
|
@ -54,7 +54,6 @@ public abstract class InternalSignificantTerms extends InternalAggregation imple
|
|||
/**
 * Creates a bucket from its subset (foreground) and superset (background) statistics,
 * stores the given sub-aggregations and then calls {@code updateScore()}.
 *
 * @param subsetDf     passed to the superclass constructor as the subset document frequency
 * @param subsetSize   passed to the superclass constructor as the subset size
 * @param supersetDf   passed to the superclass constructor as the superset document frequency
 * @param supersetSize passed to the superclass constructor as the superset size
 * @param aggregations sub-aggregations held by this bucket
 */
protected Bucket(long subsetDf, long subsetSize, long supersetDf, long supersetSize, InternalAggregations aggregations) {
|
||||
super(subsetDf, subsetSize, supersetDf, supersetSize);
|
||||
this.aggregations = aggregations;
|
||||
// NOTE(review): with a background filter the background is not guaranteed to contain
// every foreground term, so this check may not hold - confirm whether it should remain.
assert subsetDf <= supersetDf;
|
||||
updateScore();
|
||||
}
|
||||
|
||||
|
@ -96,7 +95,12 @@ public abstract class InternalSignificantTerms extends InternalAggregation imple
|
|||
// avoid any divide by zero issues
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (supersetFreq == 0) {
|
||||
// If we are using a background context that is not a strict superset, a foreground
|
||||
// term may be missing from the background, so for the purposes of this calculation
|
||||
// we assume a value of 1, which avoids returning an "infinity" result
|
||||
supersetFreq = 1;
|
||||
}
|
||||
double subsetProbability = (double) subsetFreq / (double) subsetSize;
|
||||
double supersetProbability = (double) supersetFreq / (double) supersetSize;
|
||||
|
||||
|
@ -154,7 +158,6 @@ public abstract class InternalSignificantTerms extends InternalAggregation imple
|
|||
}
|
||||
aggregationsList.add(bucket.aggregations);
|
||||
}
|
||||
assert reduced.subsetDf <= reduced.supersetDf;
|
||||
reduced.aggregations = InternalAggregations.reduce(aggregationsList, bigArrays);
|
||||
return reduced;
|
||||
}
|
||||
|
|
|
@ -84,7 +84,6 @@ public class SignificantStringTermsAggregator extends StringTermsAggregator {
|
|||
spare.subsetSize = subsetSize;
|
||||
spare.supersetDf = termsAggFactory.getBackgroundFrequency(spare.termBytes);
|
||||
spare.supersetSize = supersetSize;
|
||||
assert spare.subsetDf <= spare.supersetDf;
|
||||
// During shard-local down-selection we use subset/superset stats
|
||||
// that are for this shard only
|
||||
// Back at the central reducer these properties will be updated with
|
||||
|
|
|
@ -20,6 +20,7 @@
|
|||
package org.elasticsearch.search.aggregations.bucket.significant;
|
||||
|
||||
import org.elasticsearch.common.xcontent.XContentBuilder;
|
||||
import org.elasticsearch.index.query.FilterBuilder;
|
||||
import org.elasticsearch.search.aggregations.AggregationBuilder;
|
||||
|
||||
import java.io.IOException;
|
||||
|
@ -42,6 +43,8 @@ public class SignificantTermsBuilder extends AggregationBuilder<SignificantTerms
|
|||
private int includeFlags;
|
||||
private String excludePattern;
|
||||
private int excludeFlags;
|
||||
private FilterBuilder filterBuilder;
|
||||
|
||||
|
||||
public SignificantTermsBuilder(String name) {
|
||||
super(name, SignificantStringTerms.TYPE.name());
|
||||
|
@ -66,6 +69,12 @@ public class SignificantTermsBuilder extends AggregationBuilder<SignificantTerms
|
|||
this.minDocCount = minDocCount;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
 * Sets the filter that is written out as this aggregation's background filter.
 * The filter is stored in {@code filterBuilder}.
 *
 * @param filter the filter to store; replaces any previously set filter
 * @return this builder, so calls can be chained
 */
public SignificantTermsBuilder backgroundFilter(FilterBuilder filter) {
|
||||
this.filterBuilder = filter;
|
||||
return this;
|
||||
}
|
||||
|
||||
|
||||
public SignificantTermsBuilder shardMinDocCount(int shardMinDocCount) {
|
||||
this.shardMinDocCount = shardMinDocCount;
|
||||
|
@ -162,6 +171,11 @@ public class SignificantTermsBuilder extends AggregationBuilder<SignificantTerms
|
|||
.endObject();
|
||||
}
|
||||
}
|
||||
|
||||
if (filterBuilder != null) {
|
||||
builder.field(SignificantTermsParser.BACKGROUND_FILTER.getPreferredName());
|
||||
filterBuilder.toXContent(builder, params);
|
||||
}
|
||||
|
||||
return builder.endObject();
|
||||
}
|
||||
|
|
|
@ -41,6 +41,8 @@ public class SignificantTermsParser implements Aggregator.Parser {
|
|||
|
||||
//Typically need more than one occurrence of something for it to be statistically significant
|
||||
public static final int DEFAULT_MIN_DOC_COUNT = 3;
|
||||
|
||||
static final ParseField BACKGROUND_FILTER = new ParseField("background_filter");
|
||||
|
||||
static final ParseField SHARD_MIN_DOC_COUNT_FIELD_NAME = new ParseField("shard_min_doc_count");
|
||||
public static final int DEFAULT_SHARD_MIN_DOC_COUNT = 1;
|
||||
|
@ -99,18 +101,11 @@ public class SignificantTermsParser implements Aggregator.Parser {
|
|||
|
||||
}
|
||||
} else if (token == XContentParser.Token.START_OBJECT) {
|
||||
// TODO not sure if code below is the best means to declare a filter for
|
||||
// defining an alternative background stats context.
|
||||
// In trial runs it becomes obvious that the choice of background does have to
|
||||
// be a strict superset of the foreground subset otherwise the significant terms algo
|
||||
// immediately singles out the odd terms that are in the foreground but not represented
|
||||
// in the background. So a better approach may be to use a designated parent agg as the
|
||||
// background because parent aggs are always guaranteed to be a superset whereas arbitrary
|
||||
// filters defined by end users and parsed below are not.
|
||||
// if ("background_context".equals(currentFieldName)) {
|
||||
// filter = context.queryParserService().parseInnerFilter(parser).filter();
|
||||
// }
|
||||
|
||||
if (BACKGROUND_FILTER.match(currentFieldName)) {
|
||||
filter = context.queryParserService().parseInnerFilter(parser).filter();
|
||||
} else {
|
||||
throw new SearchParseException(context, "Unknown key for a " + token + " in [" + aggregationName + "]: [" + currentFieldName + "].");
|
||||
}
|
||||
} else {
|
||||
throw new SearchParseException(context, "Unexpected token " + token + " in [" + aggregationName + "].");
|
||||
}
|
||||
|
|
|
@ -23,6 +23,7 @@ import org.elasticsearch.action.search.SearchResponse;
|
|||
import org.elasticsearch.action.search.SearchType;
|
||||
import org.elasticsearch.common.settings.ImmutableSettings;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.index.query.FilterBuilders;
|
||||
import org.elasticsearch.index.query.TermQueryBuilder;
|
||||
import org.elasticsearch.search.aggregations.bucket.significant.SignificantTerms;
|
||||
import org.elasticsearch.search.aggregations.bucket.significant.SignificantTerms.Bucket;
|
||||
|
@ -183,8 +184,59 @@ public class SignificantTermsTests extends ElasticsearchIntegrationTest {
|
|||
assertSearchResponse(response);
|
||||
SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
|
||||
checkExpectedStringTermsFound(topTerms);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Runs a significant terms aggregation on the "description" field for documents matching
 * "terje", with a background filter on fact_category:1, and asserts that at least one
 * returned bucket reports a superset document frequency of 0.
 */
@Test
|
||||
public void badFilteredAnalysis() throws Exception {
|
||||
// Deliberately using a bad choice of filter here for the background context in order
|
||||
// to test robustness.
|
||||
// We search for the name of a snowboarder but use music-related content (fact_category:1)
|
||||
// as the background source of term statistics.
|
||||
SearchResponse response = client().prepareSearch("test")
|
||||
.setSearchType(SearchType.QUERY_AND_FETCH)
|
||||
.setQuery(new TermQueryBuilder("_all", "terje"))
|
||||
.setFrom(0).setSize(60).setExplain(true)
|
||||
.addAggregation(new SignificantTermsBuilder("mySignificantTerms").field("description")
|
||||
.minDocCount(2).backgroundFilter(FilterBuilders.termFilter("fact_category", 1)))
|
||||
.execute()
|
||||
.actionGet();
|
||||
assertSearchResponse(response);
|
||||
SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
|
||||
// We expect at least one of the significant terms to have been selected on the basis
|
||||
// that it is present in the foreground selection but entirely missing from the filtered
|
||||
// background used as context.
|
||||
boolean hasMissingBackgroundTerms = false;
|
||||
for (Bucket topTerm : topTerms) {
|
||||
if (topTerm.getSupersetDf() == 0) {
|
||||
hasMissingBackgroundTerms = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
assertTrue(hasMissingBackgroundTerms);
|
||||
}
|
||||
|
||||
/**
 * Runs a significant terms aggregation on the "description" field for documents matching
 * "weller", using documents whose description contains "paul" as the background, and
 * asserts that "paul" is not among the returned terms while "jam" is.
 */
@Test
|
||||
public void filteredAnalysis() throws Exception {
|
||||
SearchResponse response = client().prepareSearch("test")
|
||||
.setSearchType(SearchType.QUERY_AND_FETCH)
|
||||
.setQuery(new TermQueryBuilder("_all", "weller"))
|
||||
.setFrom(0).setSize(60).setExplain(true)
|
||||
.addAggregation(new SignificantTermsBuilder("mySignificantTerms").field("description")
|
||||
.minDocCount(1).backgroundFilter(FilterBuilders.termsFilter("description", "paul")))
|
||||
.execute()
|
||||
.actionGet();
|
||||
assertSearchResponse(response);
|
||||
SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
|
||||
// Collect the keys of all returned buckets so membership can be checked below.
HashSet<String> topWords = new HashSet<String>();
|
||||
for (Bucket topTerm : topTerms) {
|
||||
topWords.add(topTerm.getKey());
|
||||
}
|
||||
// The word "paul" should be a constant of all docs in the background set and therefore not seen as significant
|
||||
assertFalse(topWords.contains("paul"));
|
||||
// "Weller" is the only Paul who was in The Jam and therefore this should be identified as a differentiator from the background of all other Pauls.
|
||||
assertTrue(topWords.contains("jam"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void nestedAggs() throws Exception {
|
||||
String[][] expectedKeywordsByCategory={
|
||||
|
|
Loading…
Reference in New Issue