New aggregations feature - “PercentageScore” heuristic for significant_terms aggregation provides simple “per-capita” type measures.

Closes #9720
2025-03-24 17:09:48 +00:00 · 2015-02-18 15:06:10 +00:00 · 2015-02-18 15:06:10 +00:00 · 29b1902cfb
commit 29b1902cfb
parent 50b9a8d6f2
7 changed files with 161 additions and 4 deletions
--- a/docs/reference/search/aggregations/bucket/significantterms-aggregation.asciidoc
+++ b/docs/reference/search/aggregations/bucket/significantterms-aggregation.asciidoc
@ -320,6 +320,24 @@ Google normalized distance  as described in "The Google Similarity Distance", Ci

 `gnd` also accepts the `background_is_superset` parameter. 

+
+===== Percentage
+A simple calculation of the number of documents in the foreground sample with a term divided by the number of documents in the background with the term.
+By default this produces a score greater than zero and less than or equal to one.
+
+The benefit of this heuristic is that the scoring logic is simple to explain to anyone familiar with a "per capita" statistic. However, for fields with high cardinality there is a tendency for this heuristic to select the rarest terms such as typos that occur only once because they score 1/1 = 100%.
+
+It would be hard for a seasoned boxer to win a championship if the prize was awarded purely on the basis of percentage of fights won - by these rules a newcomer with only one fight under his belt would be impossible to beat.
+Multiple observations are typically required to reinforce a view, so in these cases it is recommended to set both `min_doc_count` and `shard_min_doc_count` to a higher value, such as 10, in order to filter out the low-frequency terms that would otherwise take precedence.
+	
+[source,js]
+--------------------------------------------------
+
+	 "percentage": {
+	 }
+--------------------------------------------------
+
+
 ===== Which one is best?


--- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/heuristics/PercentageScore.java
+++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/heuristics/PercentageScore.java
@ -0,0 +1,101 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+
+package org.elasticsearch.search.aggregations.bucket.significant.heuristics;
+
+
+import org.elasticsearch.ElasticsearchParseException;
+import org.elasticsearch.common.io.stream.StreamInput;
+import org.elasticsearch.common.io.stream.StreamOutput;
+import org.elasticsearch.common.xcontent.XContentBuilder;
+import org.elasticsearch.common.xcontent.XContentParser;
+import org.elasticsearch.index.query.QueryParsingException;
+
+import java.io.IOException;
+
+/**
+ * Significance heuristic that scores a term as the number of documents in the
+ * sample (subset) containing the term divided by the number of documents in the
+ * background (superset) containing the term - a simple "per capita" measure.
+ * <p>
+ * The heuristic carries no configuration, so one shared {@link #INSTANCE} is
+ * handed out by the stream reader and by the request parser alike.
+ */
+public class PercentageScore extends SignificanceHeuristic {
+
+    /** The single shared instance; this heuristic has no state. */
+    public static final PercentageScore INSTANCE = new PercentageScore();
+
+    /** Names under which this heuristic is registered; the first entry is also the stream name. */
+    protected static final String[] NAMES = {"percentage"};
+
+    // Private so that all users share INSTANCE.
+    private PercentageScore() {}
+
+    /** Stream registration entry that resolves the serialized name back to {@link #INSTANCE}. */
+    public static final SignificanceHeuristicStreams.Stream STREAM = new SignificanceHeuristicStreams.Stream() {
+        @Override
+        public SignificanceHeuristic readResult(StreamInput in) throws IOException {
+            return readFrom(in);
+        }
+
+        @Override
+        public String getName() {
+            return NAMES[0];
+        }
+    };
+
+    /**
+     * Returns the shared instance. Nothing is read from {@code in} because
+     * {@link #writeTo(StreamOutput)} writes nothing beyond the heuristic name.
+     */
+    public static SignificanceHeuristic readFrom(StreamInput in) throws IOException {
+        return INSTANCE;
+    }
+
+    /**
+     * Indicates the significance of a term in a sample by determining what fraction
+     * of all documents containing the term are found in the sample.
+     *
+     * @param subsetFreq   number of documents in the sample that contain the term
+     * @param subsetSize   size of the sample; only passed to the validity check
+     * @param supersetFreq number of documents in the background that contain the term
+     * @param supersetSize size of the background; only passed to the validity check
+     * @return {@code subsetFreq / supersetFreq} as a double, or {@code 0} when
+     *         {@code supersetFreq} is zero
+     */
+    @Override
+    public double getScore(long subsetFreq, long subsetSize, long supersetFreq, long supersetSize) {
+        // Rejects inconsistent inputs (e.g. a frequency larger than its set size) before scoring.
+        checkFrequencyValidity(subsetFreq, subsetSize, supersetFreq, supersetSize, "PercentageScore");
+        if (supersetFreq == 0) {
+            // avoid a divide by zero issue
+            return 0;
+        }
+        return (double) subsetFreq / (double) supersetFreq;
+    }
+
+    /** Writes only the registered stream name; there is no further state to serialize. */
+    @Override
+    public void writeTo(StreamOutput out) throws IOException {
+        out.writeString(STREAM.getName());
+    }
+
+    /** Parses the (empty) {@code "percentage": {}} request element. */
+    public static class PercentageScoreParser implements SignificanceHeuristicParser {
+
+        /**
+         * Consumes the closing bracket of the heuristic's empty object.
+         *
+         * @return the shared {@link PercentageScore#INSTANCE}
+         * @throws ElasticsearchParseException if the next token is anything other than the end of the object
+         */
+        @Override
+        public SignificanceHeuristic parse(XContentParser parser) throws IOException, QueryParsingException {
+            // The heuristic takes no parameters, so the next token must be the closing bracket.
+            // Enum identity comparison is used so that a null token (should nextToken() return
+            // one, e.g. on truncated input) yields the parse error below instead of an NPE.
+            if (parser.nextToken() != XContentParser.Token.END_OBJECT) {
+                throw new ElasticsearchParseException("expected }, got " + parser.currentName() + " instead in percentage score");
+            }
+            // Reuse the singleton, consistent with readFrom(StreamInput).
+            return INSTANCE;
+        }
+
+        @Override
+        public String[] getNames() {
+            return NAMES;
+        }
+    }
+
+    /** Renders this heuristic into a request as an empty object keyed by the heuristic name. */
+    public static class PercentageScoreBuilder implements SignificanceHeuristicBuilder {
+
+        @Override
+        public void toXContent(XContentBuilder builder) throws IOException {
+            builder.startObject(STREAM.getName()).endObject();
+        }
+    }
+}
+
--- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/heuristics/SignificanceHeuristic.java
+++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/heuristics/SignificanceHeuristic.java
@ -42,10 +42,10 @@ public abstract class SignificanceHeuristic {
            throw new ElasticsearchIllegalArgumentException("Frequencies of subset and superset must be positive in " + scoreFunctionName + ".getScore()");
        }
        if (subsetFreq > subsetSize) {
-            throw new ElasticsearchIllegalArgumentException("subsetFreq > subsetSize, in JLHScore.score(..)");
+            throw new ElasticsearchIllegalArgumentException("subsetFreq > subsetSize, in " + scoreFunctionName);
        }
        if (supersetFreq > supersetSize) {
-            throw new ElasticsearchIllegalArgumentException("supersetFreq > supersetSize, in JLHScore.score(..)");
+            throw new ElasticsearchIllegalArgumentException("supersetFreq > supersetSize, in " + scoreFunctionName);
        }
    }
 }
--- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/heuristics/SignificantTermsHeuristicModule.java
+++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/heuristics/SignificantTermsHeuristicModule.java
@ -21,6 +21,7 @@
 package org.elasticsearch.search.aggregations.bucket.significant.heuristics;

 import com.google.common.collect.Lists;
+
 import org.elasticsearch.common.inject.AbstractModule;
 import org.elasticsearch.common.inject.multibindings.Multibinder;

@ -33,6 +34,7 @@ public class SignificantTermsHeuristicModule extends AbstractModule {

    public SignificantTermsHeuristicModule() {
        registerParser(JLHScore.JLHScoreParser.class);
+        registerParser(PercentageScore.PercentageScoreParser.class);
        registerParser(MutualInformation.MutualInformationParser.class);
        registerParser(GND.GNDParser.class);
        registerParser(ChiSquare.ChiSquareParser.class);
--- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/heuristics/TransportSignificantTermsHeuristicModule.java
+++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/heuristics/TransportSignificantTermsHeuristicModule.java
@ -21,6 +21,7 @@
 package org.elasticsearch.search.aggregations.bucket.significant.heuristics;

 import com.google.common.collect.Lists;
+
 import org.elasticsearch.common.inject.AbstractModule;

 import java.util.List;
@ -32,6 +33,7 @@ public class TransportSignificantTermsHeuristicModule extends AbstractModule {

    public TransportSignificantTermsHeuristicModule() {
        registerStream(JLHScore.STREAM);
+        registerStream(PercentageScore.STREAM);
        registerStream(MutualInformation.STREAM);
        registerStream(GND.STREAM);
        registerStream(ChiSquare.STREAM);
--- a/src/test/java/org/elasticsearch/search/aggregations/bucket/SignificantTermsTests.java
+++ b/src/test/java/org/elasticsearch/search/aggregations/bucket/SignificantTermsTests.java
@ -33,6 +33,7 @@ import org.elasticsearch.search.aggregations.bucket.significant.heuristics.ChiSq
 import org.elasticsearch.search.aggregations.bucket.significant.heuristics.GND;
 import org.elasticsearch.search.aggregations.bucket.significant.heuristics.JLHScore;
 import org.elasticsearch.search.aggregations.bucket.significant.heuristics.MutualInformation;
+import org.elasticsearch.search.aggregations.bucket.significant.heuristics.PercentageScore;
 import org.elasticsearch.search.aggregations.bucket.terms.Terms;
 import org.elasticsearch.search.aggregations.bucket.terms.TermsBuilder;
 import org.elasticsearch.test.ElasticsearchIntegrationTest;
@ -272,6 +273,23 @@ public class SignificantTermsTests extends ElasticsearchIntegrationTest {
        checkExpectedStringTermsFound(topTerms);
    }

+    /**
+     * Runs a significant_terms aggregation on the "description" field using the
+     * PercentageScore heuristic, over documents matching the term "terje" in "_all",
+     * and verifies the returned terms with checkExpectedStringTermsFound.
+     */
+    @Test
+    public void textAnalysisPercentageScore() throws Exception {
+        SearchResponse response = client()
+                .prepareSearch("test")
+                .setSearchType(SearchType.QUERY_AND_FETCH)
+                .setQuery(new TermQueryBuilder("_all", "terje"))
+                .setFrom(0)
+                .setSize(60)
+                .setExplain(true)
+                .addAggregation(
+                        // minDocCount(2) drops terms seen in fewer than two documents; the
+                        // execution hint is randomized so different execution modes get exercised.
+                        new SignificantTermsBuilder("mySignificantTerms").field("description").executionHint(randomExecutionHint())
+                                .significanceHeuristic(new PercentageScore.PercentageScoreBuilder()).minDocCount(2)).execute().actionGet();
+        assertSearchResponse(response);
+        SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
+        // NOTE(review): checkExpectedStringTermsFound is defined outside this hunk - confirm which terms it expects.
+        checkExpectedStringTermsFound(topTerms);
+    }
+
    @Test
    public void badFilteredAnalysis() throws Exception {
        // Deliberately using a bad choice of filter here for the background context in order
--- a/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificanceHeuristicTests.java
+++ b/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificanceHeuristicTests.java
@ -30,7 +30,16 @@ import org.elasticsearch.common.xcontent.XContentParser;
 import org.elasticsearch.common.xcontent.json.JsonXContent;
 import org.elasticsearch.search.SearchShardTarget;
 import org.elasticsearch.search.aggregations.InternalAggregations;
-import org.elasticsearch.search.aggregations.bucket.significant.heuristics.*;
+import org.elasticsearch.search.aggregations.bucket.significant.heuristics.ChiSquare;
+import org.elasticsearch.search.aggregations.bucket.significant.heuristics.GND;
+import org.elasticsearch.search.aggregations.bucket.significant.heuristics.JLHScore;
+import org.elasticsearch.search.aggregations.bucket.significant.heuristics.MutualInformation;
+import org.elasticsearch.search.aggregations.bucket.significant.heuristics.PercentageScore;
+import org.elasticsearch.search.aggregations.bucket.significant.heuristics.SignificanceHeuristic;
+import org.elasticsearch.search.aggregations.bucket.significant.heuristics.SignificanceHeuristicBuilder;
+import org.elasticsearch.search.aggregations.bucket.significant.heuristics.SignificanceHeuristicParser;
+import org.elasticsearch.search.aggregations.bucket.significant.heuristics.SignificanceHeuristicParserMapper;
+import org.elasticsearch.search.aggregations.bucket.significant.heuristics.SignificanceHeuristicStreams;
 import org.elasticsearch.search.internal.SearchContext;
 import org.elasticsearch.test.ElasticsearchIntegrationTest;
 import org.elasticsearch.test.ElasticsearchTestCase;
@ -45,7 +54,11 @@ import java.util.HashSet;
 import java.util.List;
 import java.util.Set;

-import static org.hamcrest.Matchers.*;
+import static org.hamcrest.Matchers.equalTo;
+import static org.hamcrest.Matchers.greaterThan;
+import static org.hamcrest.Matchers.greaterThanOrEqualTo;
+import static org.hamcrest.Matchers.lessThan;
+import static org.hamcrest.Matchers.lessThanOrEqualTo;

 /**
 *
@ -68,6 +81,7 @@ public class SignificanceHeuristicTests extends ElasticsearchTestCase {
    public void streamResponse() throws Exception {
        SignificanceHeuristicStreams.registerStream(MutualInformation.STREAM, MutualInformation.STREAM.getName());
        SignificanceHeuristicStreams.registerStream(JLHScore.STREAM, JLHScore.STREAM.getName());
+        SignificanceHeuristicStreams.registerStream(PercentageScore.STREAM, PercentageScore.STREAM.getName());
        SignificanceHeuristicStreams.registerStream(GND.STREAM, GND.STREAM.getName());
        SignificanceHeuristicStreams.registerStream(ChiSquare.STREAM, ChiSquare.STREAM.getName());
        Version version = ElasticsearchIntegrationTest.randomVersion();
@ -304,6 +318,7 @@ public class SignificanceHeuristicTests extends ElasticsearchTestCase {
        testBackgroundAssertions(new MutualInformation(true, true), new MutualInformation(true, false));
        testBackgroundAssertions(new ChiSquare(true, true), new ChiSquare(true, false));
        testBackgroundAssertions(new GND(true), new GND(false));
+        testAssertions(PercentageScore.INSTANCE);
        testAssertions(JLHScore.INSTANCE);
    }

@ -311,6 +326,7 @@ public class SignificanceHeuristicTests extends ElasticsearchTestCase {
    public void basicScoreProperties() {
        basicScoreProperties(JLHScore.INSTANCE, true);
        basicScoreProperties(new GND(true), true);
+        basicScoreProperties(PercentageScore.INSTANCE, true);
        basicScoreProperties(new MutualInformation(true, true), false);
        basicScoreProperties(new ChiSquare(true, true), false);
    }