mirror of
https://github.com/honeymoose/OpenSearch.git
synced 2025-03-29 11:28:30 +00:00
New aggregations feature - “PercentageScore” heuristic for significant_terms aggregation provides simple “per-capita” type measures.
Closes #9720
This commit is contained in:
parent
50b9a8d6f2
commit
29b1902cfb
@ -320,6 +320,24 @@ Google normalized distance as described in "The Google Similarity Distance", Ci
|
|||||||
|
|
||||||
`gnd` also accepts the `background_is_superset` parameter.
|
`gnd` also accepts the `background_is_superset` parameter.
|
||||||
|
|
||||||
|
|
||||||
|
===== Percentage
|
||||||
|
A simple calculation of the number of documents in the foreground sample with a term divided by the number of documents in the background with the term.
|
||||||
|
By default this produces a score greater than zero and no greater than one.
|
||||||
|
|
||||||
|
The benefit of this heuristic is that the scoring logic is simple to explain to anyone familiar with a "per capita" statistic. However, for fields with high cardinality, this heuristic tends to select the rarest terms, such as typos that occur only once, because they score 1/1 = 100%.
|
||||||
|
|
||||||
|
It would be hard for a seasoned boxer to win a championship if the prize was awarded purely on the basis of percentage of fights won - by these rules a newcomer with only one fight under his belt would be impossible to beat.
|
||||||
|
Multiple observations are typically required to reinforce a view, so in these cases it is recommended to set both `min_doc_count` and `shard_min_doc_count` to a higher value, such as 10, in order to filter out the low-frequency terms that would otherwise take precedence.
|
||||||
|
|
||||||
|
[source,js]
|
||||||
|
--------------------------------------------------
|
||||||
|
|
||||||
|
"percentage": {
|
||||||
|
}
|
||||||
|
--------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
===== Which one is best?
|
===== Which one is best?
|
||||||
|
|
||||||
|
|
||||||
|
@ -0,0 +1,101 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to Elasticsearch under one or more contributor
|
||||||
|
* license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright
|
||||||
|
* ownership. Elasticsearch licenses this file to you under
|
||||||
|
* the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
* not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing,
|
||||||
|
* software distributed under the License is distributed on an
|
||||||
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
* KIND, either express or implied. See the License for the
|
||||||
|
* specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
package org.elasticsearch.search.aggregations.bucket.significant.heuristics;
|
||||||
|
|
||||||
|
|
||||||
|
import org.elasticsearch.ElasticsearchParseException;
|
||||||
|
import org.elasticsearch.common.io.stream.StreamInput;
|
||||||
|
import org.elasticsearch.common.io.stream.StreamOutput;
|
||||||
|
import org.elasticsearch.common.xcontent.XContentBuilder;
|
||||||
|
import org.elasticsearch.common.xcontent.XContentParser;
|
||||||
|
import org.elasticsearch.index.query.QueryParsingException;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
public class PercentageScore extends SignificanceHeuristic {
|
||||||
|
|
||||||
|
public static final PercentageScore INSTANCE = new PercentageScore();
|
||||||
|
|
||||||
|
protected static final String[] NAMES = {"percentage"};
|
||||||
|
|
||||||
|
private PercentageScore() {};
|
||||||
|
|
||||||
|
public static final SignificanceHeuristicStreams.Stream STREAM = new SignificanceHeuristicStreams.Stream() {
|
||||||
|
@Override
|
||||||
|
public SignificanceHeuristic readResult(StreamInput in) throws IOException {
|
||||||
|
return readFrom(in);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String getName() {
|
||||||
|
return NAMES[0];
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
public static SignificanceHeuristic readFrom(StreamInput in) throws IOException {
|
||||||
|
return INSTANCE;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Indicates the significance of a term in a sample by determining what percentage
|
||||||
|
* of all occurrences of a term are found in the sample.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public double getScore(long subsetFreq, long subsetSize, long supersetFreq, long supersetSize) {
|
||||||
|
checkFrequencyValidity(subsetFreq, subsetSize, supersetFreq, supersetSize, "PercentageScore");
|
||||||
|
if (supersetFreq == 0) {
|
||||||
|
// avoid a divide by zero issue
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
return (double) subsetFreq / (double) supersetFreq;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void writeTo(StreamOutput out) throws IOException {
|
||||||
|
out.writeString(STREAM.getName());
|
||||||
|
}
|
||||||
|
|
||||||
|
public static class PercentageScoreParser implements SignificanceHeuristicParser {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public SignificanceHeuristic parse(XContentParser parser) throws IOException, QueryParsingException {
|
||||||
|
// move to the closing bracket
|
||||||
|
if (!parser.nextToken().equals(XContentParser.Token.END_OBJECT)) {
|
||||||
|
throw new ElasticsearchParseException("expected }, got " + parser.currentName() + " instead in percentage score");
|
||||||
|
}
|
||||||
|
return new PercentageScore();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String[] getNames() {
|
||||||
|
return NAMES;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static class PercentageScoreBuilder implements SignificanceHeuristicBuilder {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void toXContent(XContentBuilder builder) throws IOException {
|
||||||
|
builder.startObject(STREAM.getName()).endObject();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -42,10 +42,10 @@ public abstract class SignificanceHeuristic {
|
|||||||
throw new ElasticsearchIllegalArgumentException("Frequencies of subset and superset must be positive in " + scoreFunctionName + ".getScore()");
|
throw new ElasticsearchIllegalArgumentException("Frequencies of subset and superset must be positive in " + scoreFunctionName + ".getScore()");
|
||||||
}
|
}
|
||||||
if (subsetFreq > subsetSize) {
|
if (subsetFreq > subsetSize) {
|
||||||
throw new ElasticsearchIllegalArgumentException("subsetFreq > subsetSize, in JLHScore.score(..)");
|
throw new ElasticsearchIllegalArgumentException("subsetFreq > subsetSize, in " + scoreFunctionName);
|
||||||
}
|
}
|
||||||
if (supersetFreq > supersetSize) {
|
if (supersetFreq > supersetSize) {
|
||||||
throw new ElasticsearchIllegalArgumentException("supersetFreq > supersetSize, in JLHScore.score(..)");
|
throw new ElasticsearchIllegalArgumentException("supersetFreq > supersetSize, in " + scoreFunctionName);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -21,6 +21,7 @@
|
|||||||
package org.elasticsearch.search.aggregations.bucket.significant.heuristics;
|
package org.elasticsearch.search.aggregations.bucket.significant.heuristics;
|
||||||
|
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
|
|
||||||
import org.elasticsearch.common.inject.AbstractModule;
|
import org.elasticsearch.common.inject.AbstractModule;
|
||||||
import org.elasticsearch.common.inject.multibindings.Multibinder;
|
import org.elasticsearch.common.inject.multibindings.Multibinder;
|
||||||
|
|
||||||
@ -33,6 +34,7 @@ public class SignificantTermsHeuristicModule extends AbstractModule {
|
|||||||
|
|
||||||
public SignificantTermsHeuristicModule() {
|
public SignificantTermsHeuristicModule() {
|
||||||
registerParser(JLHScore.JLHScoreParser.class);
|
registerParser(JLHScore.JLHScoreParser.class);
|
||||||
|
registerParser(PercentageScore.PercentageScoreParser.class);
|
||||||
registerParser(MutualInformation.MutualInformationParser.class);
|
registerParser(MutualInformation.MutualInformationParser.class);
|
||||||
registerParser(GND.GNDParser.class);
|
registerParser(GND.GNDParser.class);
|
||||||
registerParser(ChiSquare.ChiSquareParser.class);
|
registerParser(ChiSquare.ChiSquareParser.class);
|
||||||
|
@ -21,6 +21,7 @@
|
|||||||
package org.elasticsearch.search.aggregations.bucket.significant.heuristics;
|
package org.elasticsearch.search.aggregations.bucket.significant.heuristics;
|
||||||
|
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
|
|
||||||
import org.elasticsearch.common.inject.AbstractModule;
|
import org.elasticsearch.common.inject.AbstractModule;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@ -32,6 +33,7 @@ public class TransportSignificantTermsHeuristicModule extends AbstractModule {
|
|||||||
|
|
||||||
public TransportSignificantTermsHeuristicModule() {
|
public TransportSignificantTermsHeuristicModule() {
|
||||||
registerStream(JLHScore.STREAM);
|
registerStream(JLHScore.STREAM);
|
||||||
|
registerStream(PercentageScore.STREAM);
|
||||||
registerStream(MutualInformation.STREAM);
|
registerStream(MutualInformation.STREAM);
|
||||||
registerStream(GND.STREAM);
|
registerStream(GND.STREAM);
|
||||||
registerStream(ChiSquare.STREAM);
|
registerStream(ChiSquare.STREAM);
|
||||||
|
@ -33,6 +33,7 @@ import org.elasticsearch.search.aggregations.bucket.significant.heuristics.ChiSq
|
|||||||
import org.elasticsearch.search.aggregations.bucket.significant.heuristics.GND;
|
import org.elasticsearch.search.aggregations.bucket.significant.heuristics.GND;
|
||||||
import org.elasticsearch.search.aggregations.bucket.significant.heuristics.JLHScore;
|
import org.elasticsearch.search.aggregations.bucket.significant.heuristics.JLHScore;
|
||||||
import org.elasticsearch.search.aggregations.bucket.significant.heuristics.MutualInformation;
|
import org.elasticsearch.search.aggregations.bucket.significant.heuristics.MutualInformation;
|
||||||
|
import org.elasticsearch.search.aggregations.bucket.significant.heuristics.PercentageScore;
|
||||||
import org.elasticsearch.search.aggregations.bucket.terms.Terms;
|
import org.elasticsearch.search.aggregations.bucket.terms.Terms;
|
||||||
import org.elasticsearch.search.aggregations.bucket.terms.TermsBuilder;
|
import org.elasticsearch.search.aggregations.bucket.terms.TermsBuilder;
|
||||||
import org.elasticsearch.test.ElasticsearchIntegrationTest;
|
import org.elasticsearch.test.ElasticsearchIntegrationTest;
|
||||||
@ -272,6 +273,23 @@ public class SignificantTermsTests extends ElasticsearchIntegrationTest {
|
|||||||
checkExpectedStringTermsFound(topTerms);
|
checkExpectedStringTermsFound(topTerms);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void textAnalysisPercentageScore() throws Exception {
|
||||||
|
SearchResponse response = client()
|
||||||
|
.prepareSearch("test")
|
||||||
|
.setSearchType(SearchType.QUERY_AND_FETCH)
|
||||||
|
.setQuery(new TermQueryBuilder("_all", "terje"))
|
||||||
|
.setFrom(0)
|
||||||
|
.setSize(60)
|
||||||
|
.setExplain(true)
|
||||||
|
.addAggregation(
|
||||||
|
new SignificantTermsBuilder("mySignificantTerms").field("description").executionHint(randomExecutionHint())
|
||||||
|
.significanceHeuristic(new PercentageScore.PercentageScoreBuilder()).minDocCount(2)).execute().actionGet();
|
||||||
|
assertSearchResponse(response);
|
||||||
|
SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
|
||||||
|
checkExpectedStringTermsFound(topTerms);
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void badFilteredAnalysis() throws Exception {
|
public void badFilteredAnalysis() throws Exception {
|
||||||
// Deliberately using a bad choice of filter here for the background context in order
|
// Deliberately using a bad choice of filter here for the background context in order
|
||||||
|
@ -30,7 +30,16 @@ import org.elasticsearch.common.xcontent.XContentParser;
|
|||||||
import org.elasticsearch.common.xcontent.json.JsonXContent;
|
import org.elasticsearch.common.xcontent.json.JsonXContent;
|
||||||
import org.elasticsearch.search.SearchShardTarget;
|
import org.elasticsearch.search.SearchShardTarget;
|
||||||
import org.elasticsearch.search.aggregations.InternalAggregations;
|
import org.elasticsearch.search.aggregations.InternalAggregations;
|
||||||
import org.elasticsearch.search.aggregations.bucket.significant.heuristics.*;
|
import org.elasticsearch.search.aggregations.bucket.significant.heuristics.ChiSquare;
|
||||||
|
import org.elasticsearch.search.aggregations.bucket.significant.heuristics.GND;
|
||||||
|
import org.elasticsearch.search.aggregations.bucket.significant.heuristics.JLHScore;
|
||||||
|
import org.elasticsearch.search.aggregations.bucket.significant.heuristics.MutualInformation;
|
||||||
|
import org.elasticsearch.search.aggregations.bucket.significant.heuristics.PercentageScore;
|
||||||
|
import org.elasticsearch.search.aggregations.bucket.significant.heuristics.SignificanceHeuristic;
|
||||||
|
import org.elasticsearch.search.aggregations.bucket.significant.heuristics.SignificanceHeuristicBuilder;
|
||||||
|
import org.elasticsearch.search.aggregations.bucket.significant.heuristics.SignificanceHeuristicParser;
|
||||||
|
import org.elasticsearch.search.aggregations.bucket.significant.heuristics.SignificanceHeuristicParserMapper;
|
||||||
|
import org.elasticsearch.search.aggregations.bucket.significant.heuristics.SignificanceHeuristicStreams;
|
||||||
import org.elasticsearch.search.internal.SearchContext;
|
import org.elasticsearch.search.internal.SearchContext;
|
||||||
import org.elasticsearch.test.ElasticsearchIntegrationTest;
|
import org.elasticsearch.test.ElasticsearchIntegrationTest;
|
||||||
import org.elasticsearch.test.ElasticsearchTestCase;
|
import org.elasticsearch.test.ElasticsearchTestCase;
|
||||||
@ -45,7 +54,11 @@ import java.util.HashSet;
|
|||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
import static org.hamcrest.Matchers.*;
|
import static org.hamcrest.Matchers.equalTo;
|
||||||
|
import static org.hamcrest.Matchers.greaterThan;
|
||||||
|
import static org.hamcrest.Matchers.greaterThanOrEqualTo;
|
||||||
|
import static org.hamcrest.Matchers.lessThan;
|
||||||
|
import static org.hamcrest.Matchers.lessThanOrEqualTo;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
@ -68,6 +81,7 @@ public class SignificanceHeuristicTests extends ElasticsearchTestCase {
|
|||||||
public void streamResponse() throws Exception {
|
public void streamResponse() throws Exception {
|
||||||
SignificanceHeuristicStreams.registerStream(MutualInformation.STREAM, MutualInformation.STREAM.getName());
|
SignificanceHeuristicStreams.registerStream(MutualInformation.STREAM, MutualInformation.STREAM.getName());
|
||||||
SignificanceHeuristicStreams.registerStream(JLHScore.STREAM, JLHScore.STREAM.getName());
|
SignificanceHeuristicStreams.registerStream(JLHScore.STREAM, JLHScore.STREAM.getName());
|
||||||
|
SignificanceHeuristicStreams.registerStream(PercentageScore.STREAM, PercentageScore.STREAM.getName());
|
||||||
SignificanceHeuristicStreams.registerStream(GND.STREAM, GND.STREAM.getName());
|
SignificanceHeuristicStreams.registerStream(GND.STREAM, GND.STREAM.getName());
|
||||||
SignificanceHeuristicStreams.registerStream(ChiSquare.STREAM, ChiSquare.STREAM.getName());
|
SignificanceHeuristicStreams.registerStream(ChiSquare.STREAM, ChiSquare.STREAM.getName());
|
||||||
Version version = ElasticsearchIntegrationTest.randomVersion();
|
Version version = ElasticsearchIntegrationTest.randomVersion();
|
||||||
@ -304,6 +318,7 @@ public class SignificanceHeuristicTests extends ElasticsearchTestCase {
|
|||||||
testBackgroundAssertions(new MutualInformation(true, true), new MutualInformation(true, false));
|
testBackgroundAssertions(new MutualInformation(true, true), new MutualInformation(true, false));
|
||||||
testBackgroundAssertions(new ChiSquare(true, true), new ChiSquare(true, false));
|
testBackgroundAssertions(new ChiSquare(true, true), new ChiSquare(true, false));
|
||||||
testBackgroundAssertions(new GND(true), new GND(false));
|
testBackgroundAssertions(new GND(true), new GND(false));
|
||||||
|
testAssertions(PercentageScore.INSTANCE);
|
||||||
testAssertions(JLHScore.INSTANCE);
|
testAssertions(JLHScore.INSTANCE);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -311,6 +326,7 @@ public class SignificanceHeuristicTests extends ElasticsearchTestCase {
|
|||||||
public void basicScoreProperties() {
|
public void basicScoreProperties() {
|
||||||
basicScoreProperties(JLHScore.INSTANCE, true);
|
basicScoreProperties(JLHScore.INSTANCE, true);
|
||||||
basicScoreProperties(new GND(true), true);
|
basicScoreProperties(new GND(true), true);
|
||||||
|
basicScoreProperties(PercentageScore.INSTANCE, true);
|
||||||
basicScoreProperties(new MutualInformation(true, true), false);
|
basicScoreProperties(new MutualInformation(true, true), false);
|
||||||
basicScoreProperties(new ChiSquare(true, true), false);
|
basicScoreProperties(new ChiSquare(true, true), false);
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user