Significant_terms agg: added option for a background_filter to define background context for analysis of term frequencies

Closes #5944
This commit is contained in:
markharwood 2014-04-02 13:58:15 +01:00
parent 3484ca3737
commit 1e560b0d92
7 changed files with 123 additions and 24 deletions

View File

@ -233,13 +233,19 @@ free-text field and use them in a `terms` query on the same field with a `highli
are presented unstemmed, highlighted, with the right case, in the right order and with some context, their significance/meaning is more readily apparent.
============
==== Limitations
==== Custom background sets
added[1.2.0]
===== Single _background_ comparison base
The above examples show how to select the _foreground_ set for analysis using a query or parent aggregation to filter but currently there is no means of specifying
a _background_ set other than the index from which all results are ultimately drawn. Sometimes it may prove useful to use a different
background set as the basis for comparisons e.g. to first select the tweets for the TV show "XFactor" and then look
for significant terms in a subset of that content which is from this week.
Ordinarily, the foreground set of documents is "diffed" against a background set of all the documents in your index.
However, sometimes it may prove useful to use a narrower background set as the basis for comparisons.
For example, a query on documents relating to "Madrid" in an index with content from all over the world might reveal that "Spanish"
was a significant term. This may be true, but if you want some more focused terms you could use a `background_filter`
on the term 'spain' to establish a narrower set of documents as context. With this as a background, "Spanish" would now
be seen as commonplace and therefore not as significant as words like "capital" that relate more strongly to Madrid.
Note that using a background filter will slow things down - each term's background frequency must now be derived on-the-fly from filtering posting lists rather than reading the index's pre-computed count for a term.
==== Limitations
===== Significant terms must be indexed values
Unlike the terms aggregation it is currently not possible to use script-generated terms for counting purposes.
@ -337,6 +343,37 @@ WARNING: Setting `min_doc_count` to `1` is generally not advised as it tends to
===== Custom background context
The default source of statistical information for background term frequencies is the entire index and this
scope can be narrowed through the use of a `background_filter` to focus in on significant terms within a narrower
context:
[source,js]
--------------------------------------------------
{
"query" : {
"match" : { "text" : "madrid" }
},
"aggs" : {
"tags" : {
"significant_terms" : {
"field" : "tag",
"background_filter": {
"term" : { "text" : "spain"}
}
}
}
}
}
--------------------------------------------------
The above filter would help focus in on terms that were peculiar to the city of Madrid rather than revealing
terms like "Spanish" that are unusual in the full index's worldwide context but commonplace in the subset of documents containing the
word "Spain".
WARNING: Use of background filters will slow the query as each term's postings must be filtered to determine a frequency.
===== Filtering Values

View File

@ -97,7 +97,6 @@ public class GlobalOrdinalsSignificantTermsAggregator extends GlobalOrdinalsStri
spare.subsetSize = subsetSize;
spare.supersetDf = termsAggFactory.getBackgroundFrequency(spare.termBytes);
spare.supersetSize = supersetSize;
assert spare.subsetDf <= spare.supersetDf;
// During shard-local down-selection we use subset/superset stats
// that are for this shard only
// Back at the central reducer these properties will be updated with

View File

@ -54,7 +54,6 @@ public abstract class InternalSignificantTerms extends InternalAggregation imple
/**
 * Creates a bucket from foreground (subset) and background (superset) term statistics,
 * stores its sub-aggregations and then calls {@code updateScore()}.
 *
 * @param subsetDf     number of foreground documents containing the term
 * @param subsetSize   total number of documents in the foreground set
 * @param supersetDf   number of background documents containing the term; may be lower than
 *                     {@code subsetDf} (even zero) when the background is not a strict superset
 * @param supersetSize total number of documents in the background set
 * @param aggregations the sub-aggregations attached to this bucket
 */
protected Bucket(long subsetDf, long subsetSize, long supersetDf, long supersetSize, InternalAggregations aggregations) {
    super(subsetDf, subsetSize, supersetDf, supersetSize);
    this.aggregations = aggregations;
    // Deliberately no "subsetDf <= supersetDf" assertion here: a user-supplied background
    // filter is not guaranteed to select a strict superset of the foreground, so a term can
    // legitimately be more frequent in the foreground than in the background, or be missing
    // from the background entirely.
    updateScore();
}
@ -96,7 +95,12 @@ public abstract class InternalSignificantTerms extends InternalAggregation imple
// avoid any divide by zero issues
return 0;
}
if (supersetFreq == 0) {
// If we are using a background context that is not a strict superset, a foreground
// term may be missing from the background, so for the purposes of this calculation
// we assume a background frequency of 1, which avoids returning an "infinity" result
supersetFreq = 1;
}
double subsetProbability = (double) subsetFreq / (double) subsetSize;
double supersetProbability = (double) supersetFreq / (double) supersetSize;
@ -154,7 +158,6 @@ public abstract class InternalSignificantTerms extends InternalAggregation imple
}
aggregationsList.add(bucket.aggregations);
}
assert reduced.subsetDf <= reduced.supersetDf;
reduced.aggregations = InternalAggregations.reduce(aggregationsList, bigArrays);
return reduced;
}

View File

@ -84,7 +84,6 @@ public class SignificantStringTermsAggregator extends StringTermsAggregator {
spare.subsetSize = subsetSize;
spare.supersetDf = termsAggFactory.getBackgroundFrequency(spare.termBytes);
spare.supersetSize = supersetSize;
assert spare.subsetDf <= spare.supersetDf;
// During shard-local down-selection we use subset/superset stats
// that are for this shard only
// Back at the central reducer these properties will be updated with

View File

@ -20,6 +20,7 @@
package org.elasticsearch.search.aggregations.bucket.significant;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.index.query.FilterBuilder;
import org.elasticsearch.search.aggregations.AggregationBuilder;
import java.io.IOException;
@ -42,6 +43,8 @@ public class SignificantTermsBuilder extends AggregationBuilder<SignificantTerms
private int includeFlags;
private String excludePattern;
private int excludeFlags;
private FilterBuilder filterBuilder;
public SignificantTermsBuilder(String name) {
super(name, SignificantStringTerms.TYPE.name());
@ -66,6 +69,12 @@ public class SignificantTermsBuilder extends AggregationBuilder<SignificantTerms
this.minDocCount = minDocCount;
return this;
}
/**
 * Sets the filter used as the background context for this aggregation. When set, it is
 * written out under the <code>background_filter</code> field of the aggregation definition.
 *
 * @param filter the filter to use as background; if <code>null</code>, no
 *               <code>background_filter</code> field is written
 * @return this builder, to allow call chaining
 */
public SignificantTermsBuilder backgroundFilter(FilterBuilder filter) {
this.filterBuilder = filter;
return this;
}
public SignificantTermsBuilder shardMinDocCount(int shardMinDocCount) {
this.shardMinDocCount = shardMinDocCount;
@ -162,6 +171,11 @@ public class SignificantTermsBuilder extends AggregationBuilder<SignificantTerms
.endObject();
}
}
if (filterBuilder != null) {
builder.field(SignificantTermsParser.BACKGROUND_FILTER.getPreferredName());
filterBuilder.toXContent(builder, params);
}
return builder.endObject();
}

View File

@ -41,6 +41,8 @@ public class SignificantTermsParser implements Aggregator.Parser {
//Typically need more than one occurrence of something for it to be statistically significant
public static final int DEFAULT_MIN_DOC_COUNT = 3;
static final ParseField BACKGROUND_FILTER = new ParseField("background_filter");
static final ParseField SHARD_MIN_DOC_COUNT_FIELD_NAME = new ParseField("shard_min_doc_count");
public static final int DEFAULT_SHARD_MIN_DOC_COUNT = 1;
@ -99,18 +101,11 @@ public class SignificantTermsParser implements Aggregator.Parser {
}
} else if (token == XContentParser.Token.START_OBJECT) {
// TODO not sure if code below is the best means to declare a filter for
// defining an alternative background stats context.
// In trial runs it becomes obvious that the choice of background does have to
// be a strict superset of the foreground subset otherwise the significant terms algo
// immediately singles out the odd terms that are in the foreground but not represented
// in the background. So a better approach may be to use a designated parent agg as the
// background because parent aggs are always guaranteed to be a superset whereas arbitrary
// filters defined by end users and parsed below are not.
// if ("background_context".equals(currentFieldName)) {
// filter = context.queryParserService().parseInnerFilter(parser).filter();
// }
if (BACKGROUND_FILTER.match(currentFieldName)) {
filter = context.queryParserService().parseInnerFilter(parser).filter();
} else {
throw new SearchParseException(context, "Unknown key for a " + token + " in [" + aggregationName + "]: [" + currentFieldName + "].");
}
} else {
throw new SearchParseException(context, "Unexpected token " + token + " in [" + aggregationName + "].");
}

View File

@ -23,6 +23,7 @@ import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.action.search.SearchType;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.query.FilterBuilders;
import org.elasticsearch.index.query.TermQueryBuilder;
import org.elasticsearch.search.aggregations.bucket.significant.SignificantTerms;
import org.elasticsearch.search.aggregations.bucket.significant.SignificantTerms.Bucket;
@ -183,8 +184,59 @@ public class SignificantTermsTests extends ElasticsearchIntegrationTest {
assertSearchResponse(response);
SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
checkExpectedStringTermsFound(topTerms);
}
}
@Test
public void badFilteredAnalysis() throws Exception {
    // Robustness check: the background context is intentionally a poor match for the query.
    // The query looks for the name of a snowboarder, while the background term statistics
    // are taken from music-related content (fact_category:1).
    SignificantTermsBuilder significantTermsAgg = new SignificantTermsBuilder("mySignificantTerms")
            .field("description")
            .minDocCount(2)
            .backgroundFilter(FilterBuilders.termFilter("fact_category", 1));

    SearchResponse searchResponse = client().prepareSearch("test")
            .setSearchType(SearchType.QUERY_AND_FETCH)
            .setQuery(new TermQueryBuilder("_all", "terje"))
            .setFrom(0).setSize(60).setExplain(true)
            .addAggregation(significantTermsAgg)
            .execute()
            .actionGet();
    assertSearchResponse(searchResponse);

    SignificantTerms significantTerms = searchResponse.getAggregations().get("mySignificantTerms");

    // At least one significant term should have been picked because it occurs in the
    // foreground selection yet is completely absent from the filtered background.
    boolean foundTermAbsentFromBackground = false;
    for (Bucket bucket : significantTerms) {
        foundTermAbsentFromBackground = bucket.getSupersetDf() == 0;
        if (foundTermAbsentFromBackground) {
            break;
        }
    }
    assertTrue(foundTermAbsentFromBackground);
}
@Test
public void filteredAnalysis() throws Exception {
    // Foreground: documents mentioning "weller". Background: documents whose description
    // contains "paul".
    SignificantTermsBuilder significantTermsAgg = new SignificantTermsBuilder("mySignificantTerms")
            .field("description")
            .minDocCount(1)
            .backgroundFilter(FilterBuilders.termsFilter("description", "paul"));

    SearchResponse searchResponse = client().prepareSearch("test")
            .setSearchType(SearchType.QUERY_AND_FETCH)
            .setQuery(new TermQueryBuilder("_all", "weller"))
            .setFrom(0).setSize(60).setExplain(true)
            .addAggregation(significantTermsAgg)
            .execute()
            .actionGet();
    assertSearchResponse(searchResponse);

    SignificantTerms significantTerms = searchResponse.getAggregations().get("mySignificantTerms");
    HashSet<String> significantWords = new HashSet<String>();
    for (Bucket bucket : significantTerms) {
        significantWords.add(bucket.getKey());
    }

    // "paul" occurs in every background document, so it must not be reported as significant.
    assertFalse(significantWords.contains("paul"));
    // Weller is the only Paul who was in The Jam, so "jam" should stand out against the
    // background of all the other Pauls.
    assertTrue(significantWords.contains("jam"));
}
@Test
public void nestedAggs() throws Exception {
String[][] expectedKeywordsByCategory={