diff --git a/docs/reference/search/aggregations/bucket/significantterms-aggregation.asciidoc b/docs/reference/search/aggregations/bucket/significantterms-aggregation.asciidoc index a16324881b3..3deef808b5c 100644 --- a/docs/reference/search/aggregations/bucket/significantterms-aggregation.asciidoc +++ b/docs/reference/search/aggregations/bucket/significantterms-aggregation.asciidoc @@ -233,13 +233,19 @@ free-text field and use them in a `terms` query on the same field with a `highli are presented unstemmed, highlighted, with the right case, in the right order and with some context, their significance/meaning is more readily apparent. ============ -==== Limitations +==== Custom background sets +added[1.2.0] -===== Single _background_ comparison base -The above examples show how to select the _foreground_ set for analysis using a query or parent aggregation to filter but currently there is no means of specifying -a _background_ set other than the index from which all results are ultimately drawn. Sometimes it may prove useful to use a different -background set as the basis for comparisons e.g. to first select the tweets for the TV show "XFactor" and then look -for significant terms in a subset of that content which is from this week. + +Ordinarily, the foreground set of documents is "diffed" against a background set of all the documents in your index. +However, sometimes it may prove useful to use a narrower background set as the basis for comparisons. +For example, a query on documents relating to "Madrid" in an index with content from all over the world might reveal that "Spanish" +was a significant term. This may be true but if you want some more focused terms you could use a `background_filter` +on the term 'spain' to establish a narrower set of documents as context. With this as a background "Spanish" would now +be seen as commonplace and therefore not as significant as words like "capital" that relate more strongly with Madrid. 
+Note that using a background filter will slow things down - each term's background frequency must now be derived on-the-fly from filtering posting lists rather than reading the index's pre-computed count for a term. + +==== Limitations ===== Significant terms must be indexed values Unlike the terms aggregation it is currently not possible to use script-generated terms for counting purposes. @@ -337,6 +343,37 @@ WARNING: Setting `min_doc_count` to `1` is generally not advised as it tends to +===== Custom background context + +The default source of statistical information for background term frequencies is the entire index and this +scope can be narrowed through the use of a `background_filter` to focus in on significant terms within a narrower +context: + +[source,js] +-------------------------------------------------- +{ + "query" : { + "match" : { "text" : "madrid" } + }, + "aggs" : { + "tags" : { + "significant_terms" : { + "field" : "tag", + "background_filter": { + "term" : { "text" : "spain"} + } + } + } + } +} +-------------------------------------------------- + +The above filter would help focus in on terms that were peculiar to the city of Madrid rather than revealing +terms like "Spanish" that are unusual in the full index's worldwide context but commonplace in the subset of documents containing the +word "Spain". 
+ +WARNING: Use of background filters will slow the query as each term's postings must be filtered to determine a frequency + ===== Filtering Values diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/GlobalOrdinalsSignificantTermsAggregator.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/GlobalOrdinalsSignificantTermsAggregator.java index 46face19a3c..867ac004ba9 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/GlobalOrdinalsSignificantTermsAggregator.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/GlobalOrdinalsSignificantTermsAggregator.java @@ -97,7 +97,6 @@ public class GlobalOrdinalsSignificantTermsAggregator extends GlobalOrdinalsStri spare.subsetSize = subsetSize; spare.supersetDf = termsAggFactory.getBackgroundFrequency(spare.termBytes); spare.supersetSize = supersetSize; - assert spare.subsetDf <= spare.supersetDf; // During shard-local down-selection we use subset/superset stats // that are for this shard only // Back at the central reducer these properties will be updated with diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/InternalSignificantTerms.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/InternalSignificantTerms.java index afa8ab240af..6526f3a5693 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/InternalSignificantTerms.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/InternalSignificantTerms.java @@ -54,7 +54,6 @@ public abstract class InternalSignificantTerms extends InternalAggregation imple protected Bucket(long subsetDf, long subsetSize, long supersetDf, long supersetSize, InternalAggregations aggregations) { super(subsetDf, subsetSize, supersetDf, supersetSize); this.aggregations = aggregations; - assert subsetDf <= supersetDf; updateScore(); } @@ -96,7 +95,12 @@ public abstract 
class InternalSignificantTerms extends InternalAggregation imple // avoid any divide by zero issues return 0; } - + if (supersetFreq == 0) { + // If we are using a background context that is not a strict superset, a foreground + // term may be missing from the background, so for the purposes of this calculation + // we assume a value of 1 for our calculations which avoids returning an "infinity" result + supersetFreq = 1; + } double subsetProbability = (double) subsetFreq / (double) subsetSize; double supersetProbability = (double) supersetFreq / (double) supersetSize; @@ -154,7 +158,6 @@ public abstract class InternalSignificantTerms extends InternalAggregation imple } aggregationsList.add(bucket.aggregations); } - assert reduced.subsetDf <= reduced.supersetDf; reduced.aggregations = InternalAggregations.reduce(aggregationsList, bigArrays); return reduced; } diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsAggregator.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsAggregator.java index 4782a71384e..67d05200586 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsAggregator.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsAggregator.java @@ -84,7 +84,6 @@ public class SignificantStringTermsAggregator extends StringTermsAggregator { spare.subsetSize = subsetSize; spare.supersetDf = termsAggFactory.getBackgroundFrequency(spare.termBytes); spare.supersetSize = supersetSize; - assert spare.subsetDf <= spare.supersetDf; // During shard-local down-selection we use subset/superset stats // that are for this shard only // Back at the central reducer these properties will be updated with diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsBuilder.java 
b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsBuilder.java index 4e7eb104573..d6aa778a451 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsBuilder.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsBuilder.java @@ -20,6 +20,7 @@ package org.elasticsearch.search.aggregations.bucket.significant; import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.index.query.FilterBuilder; import org.elasticsearch.search.aggregations.AggregationBuilder; import java.io.IOException; @@ -42,6 +43,8 @@ public class SignificantTermsBuilder extends AggregationBuilder topWords = new HashSet(); + for (Bucket topTerm : topTerms) { + topWords.add(topTerm.getKey()); + } + //The word "paul" should be a constant of all docs in the background set and therefore not seen as significant + assertFalse(topWords.contains("paul")); + //"Weller" is the only Paul who was in The Jam and therefore this should be identified as a differentiator from the background of all other Pauls. + assertTrue(topWords.contains("jam")); + } + @Test public void nestedAggs() throws Exception { String[][] expectedKeywordsByCategory={