Significant_terms agg: added option for a background_filter to define background context for analysis of term frequencies

Closes #5944
2014-04-02 13:58:15 +01:00 · 2014-04-02 13:58:15 +01:00 · 1e560b0d92
parent 3484ca3737
commit 1e560b0d92
7 changed files with 123 additions and 24 deletions
--- a/docs/reference/search/aggregations/bucket/significantterms-aggregation.asciidoc
+++ b/docs/reference/search/aggregations/bucket/significantterms-aggregation.asciidoc
@ -233,13 +233,19 @@ free-text field and use them in a `terms` query on the same field with a `highli
 are presented unstemmed, highlighted, with the right case, in the right order and with some context, their significance/meaning is more readily apparent.
 ============

-==== Limitations
+==== Custom background sets
+added[1.2.0]

-===== Single _background_ comparison base
-The above examples show how to select the _foreground_ set for analysis using a query or parent aggregation to filter but currently there is no means of specifying
-a _background_ set other than the index from which all results are ultimately drawn. Sometimes it may prove useful to use a different
-background set as the basis for comparisons e.g. to first select the tweets for the TV show "XFactor" and then look
-for significant terms in a subset of that content which is from this week.
+
+Ordinarily, the foreground set of documents is "diffed" against a background set of all the documents in your index.
+However, sometimes it may prove useful to use a narrower background set as the basis for comparisons. 
+For example, a query on documents relating to "Madrid" in an index with content from all over the world might reveal that "Spanish" 
+was a significant term. This may be true but if you want some more focused terms you could use a `background_filter` 
+on the term 'spain' to establish a narrower set of documents as context. With this as a background "Spanish" would now 
+be seen as commonplace and therefore not as significant as words like "capital" that relate more strongly to Madrid.
+Note that using a background filter will slow things down - each term's background frequency must now be derived on-the-fly from filtering posting lists rather than reading the index's pre-computed count for a term.  
+
+==== Limitations

 ===== Significant terms must be indexed values
 Unlike the terms aggregation it is currently not possible to use script-generated terms for counting purposes.
@ -337,6 +343,37 @@ WARNING: Setting `min_doc_count` to `1` is generally not advised as it tends to



+===== Custom background context
+
+By default, the statistics for background term frequencies are drawn from the entire index. This
+scope can be narrowed with a `background_filter` so that significant terms are measured against a
+more specific context:
+
+[source,js]
+--------------------------------------------------
+{
+    "query" : {
+        "match" : { "text" : "madrid" }
+    },
+    "aggs" : {
+        "tags" : {
+            "significant_terms" : { 
+                "field" : "tag",
+                "background_filter": {
+                    "term" : { "text" : "spain" }
+                }
+            }
+        }
+    }
+}
+--------------------------------------------------
+
+The above filter would help focus in on terms that were peculiar to the city of Madrid rather than revealing 
+terms like "Spanish" that are unusual in the full index's worldwide context but commonplace in the subset of documents containing the 
+word "Spain".   
+
+WARNING: Use of background filters will slow the query, as each term's postings must be filtered to determine its background frequency.
+

 ===== Filtering Values

--- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/GlobalOrdinalsSignificantTermsAggregator.java
+++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/GlobalOrdinalsSignificantTermsAggregator.java
@ -97,7 +97,6 @@ public class GlobalOrdinalsSignificantTermsAggregator extends GlobalOrdinalsStri
            spare.subsetSize = subsetSize;
            spare.supersetDf = termsAggFactory.getBackgroundFrequency(spare.termBytes);
            spare.supersetSize = supersetSize;
-            assert spare.subsetDf <= spare.supersetDf;
            // During shard-local down-selection we use subset/superset stats
            // that are for this shard only
            // Back at the central reducer these properties will be updated with
--- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/InternalSignificantTerms.java
+++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/InternalSignificantTerms.java
@ -54,7 +54,6 @@ public abstract class InternalSignificantTerms extends InternalAggregation imple
        protected Bucket(long subsetDf, long subsetSize, long supersetDf, long supersetSize, InternalAggregations aggregations) {
            super(subsetDf, subsetSize, supersetDf, supersetSize);
            this.aggregations = aggregations;
-            assert subsetDf <= supersetDf;
            updateScore();
        }

@ -96,7 +95,12 @@ public abstract class InternalSignificantTerms extends InternalAggregation imple
                // avoid any divide by zero issues
                return 0;
            }
-
+            if (supersetFreq == 0) {
+                // If the background context is not a strict superset of the foreground, a
+                // foreground term may be missing from the background entirely. In that case we
+                // assume a background frequency of 1, which avoids returning an "infinity" result
+                supersetFreq = 1;
+            }
            double subsetProbability = (double) subsetFreq / (double) subsetSize;
            double supersetProbability = (double) supersetFreq / (double) supersetSize;

@ -154,7 +158,6 @@ public abstract class InternalSignificantTerms extends InternalAggregation imple
                }
                aggregationsList.add(bucket.aggregations);
            }
-            assert reduced.subsetDf <= reduced.supersetDf;
            reduced.aggregations = InternalAggregations.reduce(aggregationsList, bigArrays);
            return reduced;
        }
--- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsAggregator.java
+++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsAggregator.java
@ -84,7 +84,6 @@ public class SignificantStringTermsAggregator extends StringTermsAggregator {
            spare.subsetSize = subsetSize;
            spare.supersetDf = termsAggFactory.getBackgroundFrequency(spare.termBytes);
            spare.supersetSize = supersetSize;
-            assert spare.subsetDf <= spare.supersetDf;
            // During shard-local down-selection we use subset/superset stats 
            // that are for this shard only
            // Back at the central reducer these properties will be updated with
--- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsBuilder.java
+++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsBuilder.java
@ -20,6 +20,7 @@
 package org.elasticsearch.search.aggregations.bucket.significant;

 import org.elasticsearch.common.xcontent.XContentBuilder;
+import org.elasticsearch.index.query.FilterBuilder;
 import org.elasticsearch.search.aggregations.AggregationBuilder;

 import java.io.IOException;
@ -42,6 +43,8 @@ public class SignificantTermsBuilder extends AggregationBuilder<SignificantTerms
    private int includeFlags;
    private String excludePattern;
    private int excludeFlags;
+    private FilterBuilder filterBuilder;
+

    public SignificantTermsBuilder(String name) {
        super(name, SignificantStringTerms.TYPE.name());
@ -66,6 +69,12 @@ public class SignificantTermsBuilder extends AggregationBuilder<SignificantTerms
        this.minDocCount = minDocCount;
        return this;
    }
+    
+    public SignificantTermsBuilder backgroundFilter(FilterBuilder filter) {
+        this.filterBuilder = filter; // optional: only a non-null filter is serialized under "background_filter"
+        return this;
+    }
+    

    public SignificantTermsBuilder shardMinDocCount(int shardMinDocCount) {
        this.shardMinDocCount = shardMinDocCount;
@ -162,6 +171,11 @@ public class SignificantTermsBuilder extends AggregationBuilder<SignificantTerms
                        .endObject();
            }
        }
+        
+        if (filterBuilder != null) {
+            builder.field(SignificantTermsParser.BACKGROUND_FILTER.getPreferredName());
+            filterBuilder.toXContent(builder, params); 
+        }

        return builder.endObject();
    }
--- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsParser.java
+++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsParser.java
@ -41,6 +41,8 @@ public class SignificantTermsParser implements Aggregator.Parser {

    //Typically need more than one occurrence of something for it to be statistically significant
    public static final int DEFAULT_MIN_DOC_COUNT = 3;
+    
+    static final ParseField BACKGROUND_FILTER = new ParseField("background_filter");

    static final ParseField SHARD_MIN_DOC_COUNT_FIELD_NAME = new ParseField("shard_min_doc_count");
    public static final int DEFAULT_SHARD_MIN_DOC_COUNT = 1;
@ -99,18 +101,11 @@ public class SignificantTermsParser implements Aggregator.Parser {

                }
            } else if (token == XContentParser.Token.START_OBJECT) {
-                // TODO not sure if code below is the best means to declare a filter for 
-                // defining an alternative background stats context.
-                // In trial runs it becomes obvious that the choice of background does have to  
-                // be a strict superset of the foreground subset otherwise the significant terms algo
-                // immediately singles out the odd terms that are in the foreground but not represented
-                // in the background. So a better approach may be to use a designated parent agg as the  
-                // background because parent aggs are always guaranteed to be a superset whereas arbitrary
-                // filters defined by end users and parsed below are not.
-//                if ("background_context".equals(currentFieldName)) {
-//                    filter = context.queryParserService().parseInnerFilter(parser).filter();
-//                }
-
+                if (BACKGROUND_FILTER.match(currentFieldName)) {
+                    filter = context.queryParserService().parseInnerFilter(parser).filter();
+                } else {
+                    throw new SearchParseException(context, "Unknown key for a " + token + " in [" + aggregationName + "]: [" + currentFieldName + "].");                    
+                }
            } else {
                throw new SearchParseException(context, "Unexpected token " + token + " in [" + aggregationName + "].");
            }
--- a/src/test/java/org/elasticsearch/search/aggregations/bucket/SignificantTermsTests.java
+++ b/src/test/java/org/elasticsearch/search/aggregations/bucket/SignificantTermsTests.java
@ -23,6 +23,7 @@ import org.elasticsearch.action.search.SearchResponse;
 import org.elasticsearch.action.search.SearchType;
 import org.elasticsearch.common.settings.ImmutableSettings;
 import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.index.query.FilterBuilders;
 import org.elasticsearch.index.query.TermQueryBuilder;
 import org.elasticsearch.search.aggregations.bucket.significant.SignificantTerms;
 import org.elasticsearch.search.aggregations.bucket.significant.SignificantTerms.Bucket;
@ -183,8 +184,59 @@ public class SignificantTermsTests extends ElasticsearchIntegrationTest {
        assertSearchResponse(response);
        SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
        checkExpectedStringTermsFound(topTerms);
-    }    
+    }   
    
+    @Test
+    public void badFilteredAnalysis() throws Exception {
+        // Robustness check: the filter chosen for the background context is deliberately a
+        // poor match for the query. 
+        // We search for the name of a snowboarder but use music-related content (fact_category:1)
+        // as the background source of term statistics.
+        SearchResponse response = client().prepareSearch("test")
+                .setSearchType(SearchType.QUERY_AND_FETCH)
+                .setQuery(new TermQueryBuilder("_all", "terje"))
+                .setFrom(0).setSize(60).setExplain(true)                
+                .addAggregation(new SignificantTermsBuilder("mySignificantTerms").field("description")
+                           .minDocCount(2).backgroundFilter(FilterBuilders.termFilter("fact_category", 1)))
+                .execute()
+                .actionGet();
+        assertSearchResponse(response);
+        SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
+        // We expect at least one of the significant terms to be present in the foreground
+        // selection but entirely missing from the filtered background used as context,
+        // i.e. to report a background document frequency (supersetDf) of zero.
+        boolean hasMissingBackgroundTerms = false;
+        for (Bucket topTerm : topTerms) {
+            if (topTerm.getSupersetDf() == 0) {
+                hasMissingBackgroundTerms = true;
+                break;
+            }
+        }
+        assertTrue(hasMissingBackgroundTerms);
+    }       
+    
+    @Test
+    public void filteredAnalysis() throws Exception {
+        SearchResponse response = client().prepareSearch("test")
+                .setSearchType(SearchType.QUERY_AND_FETCH)
+                .setQuery(new TermQueryBuilder("_all", "weller"))
+                .setFrom(0).setSize(60).setExplain(true)                
+                .addAggregation(new SignificantTermsBuilder("mySignificantTerms").field("description")
+                           .minDocCount(1).backgroundFilter(FilterBuilders.termsFilter("description",  "paul")))
+                .execute()
+                .actionGet();
+        assertSearchResponse(response);
+        SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
+        HashSet<String> topWords = new HashSet<String>();
+        for (Bucket topTerm : topTerms) {
+            topWords.add(topTerm.getKey());
+        }
+        // The background filter selects on "paul", so that word should be a constant of all docs in the background set and therefore not seen as significant
+        assertFalse(topWords.contains("paul"));
+        // "Weller" is the only Paul who was in The Jam, so "jam" should be identified as a differentiator from the background of all other Pauls.
+        assertTrue(topWords.contains("jam"));
+    }       
+
    @Test
    public void nestedAggs() throws Exception {
        String[][] expectedKeywordsByCategory={