Added unit test coverage for SignificantTerms (#24904)
Added unit test coverage for GlobalOrdinalsSignificantTermsAggregator, GlobalOrdinalsSignificantTermsAggregator.WithHash, SignificantLongTermsAggregator, and SignificantStringTermsAggregator. Removed the corresponding integration test. Relates #22278
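The replacement tests exercise the aggregators directly against a Lucene index via AggregatorTestCase rather than a test cluster. In outline, each new test follows the shape below (a minimal sketch; searchAndReduce, newDirectory, and the field-type setup come from the test framework code shown in this diff):

    try (Directory dir = newDirectory(); IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) {
        // ... index a handful of documents with a "text" field ...
        SignificantTermsAggregationBuilder agg = new SignificantTermsAggregationBuilder("sig_text", null).field("text");
        try (IndexReader reader = DirectoryReader.open(w)) {
            // Run the aggregation and reduce the partial results in-process.
            SignificantTerms terms = searchAndReduce(new IndexSearcher(reader), new TermQuery(new Term("text", "odd")), agg, textFieldType);
            assertNotNull(terms.getBucketByKey("odd"));
        }
    }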
This commit is contained in:
parent
9ff1698aa7
commit
973530f953
@@ -1,473 +0,0 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.elasticsearch.search.aggregations.bucket;

import org.elasticsearch.action.admin.indices.refresh.RefreshRequest;
import org.elasticsearch.action.search.SearchPhaseExecutionException;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.action.search.SearchType;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.index.query.TermQueryBuilder;
import org.elasticsearch.search.aggregations.bucket.significant.SignificantTerms;
import org.elasticsearch.search.aggregations.bucket.significant.SignificantTerms.Bucket;
import org.elasticsearch.search.aggregations.bucket.significant.SignificantTermsAggregatorFactory.ExecutionMode;
import org.elasticsearch.search.aggregations.bucket.significant.heuristics.ChiSquare;
import org.elasticsearch.search.aggregations.bucket.significant.heuristics.GND;
import org.elasticsearch.search.aggregations.bucket.significant.heuristics.JLHScore;
import org.elasticsearch.search.aggregations.bucket.significant.heuristics.MutualInformation;
import org.elasticsearch.search.aggregations.bucket.significant.heuristics.PercentageScore;
import org.elasticsearch.search.aggregations.bucket.terms.Terms;
import org.elasticsearch.search.aggregations.bucket.terms.support.IncludeExclude;
import org.elasticsearch.test.ESIntegTestCase;

import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;

import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_REPLICAS;
import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_SHARDS;
import static org.elasticsearch.index.query.QueryBuilders.boolQuery;
import static org.elasticsearch.index.query.QueryBuilders.termQuery;
import static org.elasticsearch.search.aggregations.AggregationBuilders.significantTerms;
import static org.elasticsearch.search.aggregations.AggregationBuilders.terms;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertSearchResponse;
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.hasSize;
import static org.hamcrest.Matchers.is;
import static org.hamcrest.core.IsNull.notNullValue;

@ESIntegTestCase.SuiteScopeTestCase
public class SignificantTermsIT extends ESIntegTestCase {

    public String randomExecutionHint() {
        return randomBoolean() ? null : randomFrom(ExecutionMode.values()).toString();
    }

    @Override
    public Settings indexSettings() {
        return Settings.builder()
                .put("index.number_of_shards", numberOfShards())
                .put("index.number_of_replicas", numberOfReplicas())
                .build();
    }

    public static final int MUSIC_CATEGORY = 1;
    public static final int OTHER_CATEGORY = 2;
    public static final int SNOWBOARDING_CATEGORY = 3;

    @Override
    public void setupSuiteScopeCluster() throws Exception {
        assertAcked(prepareCreate("test").setSettings(SETTING_NUMBER_OF_SHARDS, 5, SETTING_NUMBER_OF_REPLICAS, 0).addMapping("fact",
                "_routing", "required=true", "routing_id", "type=keyword", "fact_category",
                "type=integer", "description", "type=text,fielddata=true"));
        createIndex("idx_unmapped");

        ensureGreen();
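        // Each entry is tab-separated: routing key, fact_category
        // (1 = music, 2 = other, 3 = snowboarding), then the description text.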
        String[] data = {
                "A\t1\tpaul weller was lead singer of the jam before the style council",
                "B\t1\tpaul weller left the jam to form the style council",
                "A\t2\tpaul smith is a designer in the fashion industry",
                "B\t1\tthe stranglers are a group originally from guildford",
                "A\t1\tafter disbanding the style council in 1985 paul weller became a solo artist",
                "B\t1\tjean jaques burnel is a bass player in the stranglers and has a black belt in karate",
                "A\t1\tmalcolm owen was the lead singer of the ruts",
                "B\t1\tpaul weller has denied any possibility of a reunion of the jam",
                "A\t1\tformer frontman of the jam paul weller became the father of twins",
                "B\t2\tex-england football star paul gascoigne has re-emerged following recent disappearance",
                "A\t2\tdavid smith has recently denied connections with the mafia",
                "B\t1\tthe damned's new rose single was considered the first 'punk' single in the UK",
                "A\t1\tthe sex pistols broke up after a few short years together",
                "B\t1\tpaul gascoigne was a midfielder for england football team",
                "A\t3\tcraig kelly became the first world champion snowboarder and has a memorial at baldface lodge",
                "B\t3\tterje haakonsen has credited craig kelly as his snowboard mentor",
                "A\t3\tterje haakonsen and craig kelly were some of the first snowboarders sponsored by burton snowboards",
                "B\t3\tlike craig kelly before him terje won the mt baker banked slalom many times - once riding switch",
                "A\t3\tterje haakonsen has been a team rider for burton snowboards for over 20 years"
        };

        for (int i = 0; i < data.length; i++) {
            String[] parts = data[i].split("\t");
            client().prepareIndex("test", "fact", "" + i)
                    .setRouting(parts[0])
                    .setSource("fact_category", parts[1], "description", parts[2]).get();
        }
        client().admin().indices().refresh(new RefreshRequest("test")).get();

        assertAcked(prepareCreate("test_not_indexed")
                .setSettings(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
                .addMapping("type",
                        "my_keyword", "type=keyword,index=false",
                        "my_long", "type=long,index=false"));
        indexRandom(true,
                client().prepareIndex("test_not_indexed", "type", "1").setSource(
                        "my_keyword", "foo", "my_long", 42));
    }

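    // "Significant" terms are those whose frequency in the foreground set (the
    // docs matching the query) stands out against the background of the whole index.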
    public void testStructuredAnalysis() throws Exception {
        SearchResponse response = client().prepareSearch("test")
                .setSearchType(SearchType.QUERY_THEN_FETCH)
                .setQuery(new TermQueryBuilder("description", "terje"))
                .setFrom(0).setSize(60).setExplain(true)
                .addAggregation(significantTerms("mySignificantTerms").field("fact_category").executionHint(randomExecutionHint())
                        .minDocCount(2))
                .execute()
                .actionGet();
        assertSearchResponse(response);
        SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
        Number topCategory = (Number) topTerms.getBuckets().iterator().next().getKey();
        assertTrue(topCategory.equals(Long.valueOf(SNOWBOARDING_CATEGORY)));
    }

    public void testStructuredAnalysisWithIncludeExclude() throws Exception {
        long[] excludeTerms = { MUSIC_CATEGORY };
        SearchResponse response = client().prepareSearch("test")
                .setSearchType(SearchType.QUERY_THEN_FETCH)
                .setQuery(new TermQueryBuilder("description", "paul"))
                .setFrom(0).setSize(60).setExplain(true)
                .addAggregation(significantTerms("mySignificantTerms").field("fact_category").executionHint(randomExecutionHint())
                        .minDocCount(1).includeExclude(new IncludeExclude(null, excludeTerms)))
                .execute()
                .actionGet();
        assertSearchResponse(response);
        SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
        Number topCategory = (Number) topTerms.getBuckets().iterator().next().getKey();
        assertTrue(topCategory.equals(Long.valueOf(OTHER_CATEGORY)));
    }

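    // IncludeExclude filters candidate terms before they are scored; it accepts
    // a regex, arrays of exact string values, or arrays of longs for numeric fields.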
    public void testIncludeExclude() throws Exception {
        SearchResponse response = client().prepareSearch("test")
                .setQuery(new TermQueryBuilder("description", "weller"))
                .addAggregation(significantTerms("mySignificantTerms").field("description").executionHint(randomExecutionHint())
                        .includeExclude(new IncludeExclude(null, "weller")))
                .get();
        assertSearchResponse(response);
        SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
        Set<String> terms = new HashSet<>();
        for (Bucket topTerm : topTerms) {
            terms.add(topTerm.getKeyAsString());
        }
        assertThat(terms, hasSize(6));
        assertThat(terms.contains("jam"), is(true));
        assertThat(terms.contains("council"), is(true));
        assertThat(terms.contains("style"), is(true));
        assertThat(terms.contains("paul"), is(true));
        assertThat(terms.contains("of"), is(true));
        assertThat(terms.contains("the"), is(true));

        response = client().prepareSearch("test")
                .setQuery(new TermQueryBuilder("description", "weller"))
                .addAggregation(significantTerms("mySignificantTerms").field("description").executionHint(randomExecutionHint())
                        .includeExclude(new IncludeExclude("weller", null)))
                .get();
        assertSearchResponse(response);
        topTerms = response.getAggregations().get("mySignificantTerms");
        terms = new HashSet<>();
        for (Bucket topTerm : topTerms) {
            terms.add(topTerm.getKeyAsString());
        }
        assertThat(terms, hasSize(1));
        assertThat(terms.contains("weller"), is(true));
    }

    public void testIncludeExcludeExactValues() throws Exception {
        String[] incExcTerms = { "weller", "nosuchterm" };
        SearchResponse response = client().prepareSearch("test")
                .setQuery(new TermQueryBuilder("description", "weller"))
                .addAggregation(significantTerms("mySignificantTerms").field("description").executionHint(randomExecutionHint())
                        .includeExclude(new IncludeExclude(null, incExcTerms)))
                .get();
        assertSearchResponse(response);
        SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
        Set<String> terms = new HashSet<>();
        for (Bucket topTerm : topTerms) {
            terms.add(topTerm.getKeyAsString());
        }
        assertEquals(new HashSet<>(Arrays.asList("jam", "council", "style", "paul", "of", "the")), terms);

        response = client().prepareSearch("test")
                .setQuery(new TermQueryBuilder("description", "weller"))
                .addAggregation(significantTerms("mySignificantTerms").field("description").executionHint(randomExecutionHint())
                        .includeExclude(new IncludeExclude(incExcTerms, null)))
                .get();
        assertSearchResponse(response);
        topTerms = response.getAggregations().get("mySignificantTerms");
        terms = new HashSet<>();
        for (Bucket topTerm : topTerms) {
            terms.add(topTerm.getKeyAsString());
        }
        assertThat(terms, hasSize(1));
        assertThat(terms.contains("weller"), is(true));
    }

    public void testUnmapped() throws Exception {
        SearchResponse response = client().prepareSearch("idx_unmapped")
                .setSearchType(SearchType.QUERY_THEN_FETCH)
                .setQuery(new TermQueryBuilder("description", "terje"))
                .setFrom(0).setSize(60).setExplain(true)
                .addAggregation(significantTerms("mySignificantTerms").field("fact_category").executionHint(randomExecutionHint())
                        .minDocCount(2))
                .execute()
                .actionGet();
        assertSearchResponse(response);
        SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
        assertThat(topTerms.getBuckets().size(), equalTo(0));
    }

    public void testTextAnalysis() throws Exception {
        SearchResponse response = client().prepareSearch("test")
                .setSearchType(SearchType.QUERY_THEN_FETCH)
                .setQuery(new TermQueryBuilder("description", "terje"))
                .setFrom(0).setSize(60).setExplain(true)
                .addAggregation(significantTerms("mySignificantTerms").field("description").executionHint(randomExecutionHint())
                        .minDocCount(2))
                .execute()
                .actionGet();
        assertSearchResponse(response);
        SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
        checkExpectedStringTermsFound(topTerms);
    }

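    // The next three tests swap in alternative significance heuristics (GND,
    // chi-square, percentage score); each scores a term by comparing its
    // foreground and background frequencies, so the same keywords should surface.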
    public void testTextAnalysisGND() throws Exception {
        SearchResponse response = client().prepareSearch("test")
                .setSearchType(SearchType.QUERY_THEN_FETCH)
                .setQuery(new TermQueryBuilder("description", "terje"))
                .setFrom(0).setSize(60).setExplain(true)
                .addAggregation(significantTerms("mySignificantTerms").field("description").executionHint(randomExecutionHint())
                        .significanceHeuristic(new GND(true))
                        .minDocCount(2))
                .execute()
                .actionGet();
        assertSearchResponse(response);
        SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
        checkExpectedStringTermsFound(topTerms);
    }

    public void testTextAnalysisChiSquare() throws Exception {
        SearchResponse response = client().prepareSearch("test")
                .setSearchType(SearchType.QUERY_THEN_FETCH)
                .setQuery(new TermQueryBuilder("description", "terje"))
                .setFrom(0).setSize(60).setExplain(true)
                .addAggregation(significantTerms("mySignificantTerms").field("description").executionHint(randomExecutionHint())
                        .significanceHeuristic(new ChiSquare(false, true))
                        .minDocCount(2))
                .execute()
                .actionGet();
        assertSearchResponse(response);
        SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
        checkExpectedStringTermsFound(topTerms);
    }

    public void testTextAnalysisPercentageScore() throws Exception {
        SearchResponse response = client()
                .prepareSearch("test")
                .setSearchType(SearchType.QUERY_THEN_FETCH)
                .setQuery(new TermQueryBuilder("description", "terje"))
                .setFrom(0)
                .setSize(60)
                .setExplain(true)
                .addAggregation(
                        significantTerms("mySignificantTerms").field("description").executionHint(randomExecutionHint())
                                .significanceHeuristic(new PercentageScore()).minDocCount(2)).execute().actionGet();
        assertSearchResponse(response);
        SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
        checkExpectedStringTermsFound(topTerms);
    }

    public void testBadFilteredAnalysis() throws Exception {
        // Deliberately using a bad choice of filter here for the background context in order
        // to test robustness.
        // We search for the name of a snowboarder but use music-related content (fact_category:1)
        // as the background source of term statistics.
        SearchResponse response = client().prepareSearch("test")
                .setSearchType(SearchType.QUERY_THEN_FETCH)
                .setQuery(new TermQueryBuilder("description", "terje"))
                .setFrom(0).setSize(60).setExplain(true)
                .addAggregation(significantTerms("mySignificantTerms").field("description")
                        .minDocCount(2).backgroundFilter(QueryBuilders.termQuery("fact_category", 1)))
                .execute()
                .actionGet();
        assertSearchResponse(response);
        SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
        // We expect at least one of the significant terms to have been selected on the basis
        // that it is present in the foreground selection but entirely missing from the filtered
        // background used as context.
        boolean hasMissingBackgroundTerms = false;
        for (Bucket topTerm : topTerms) {
            if (topTerm.getSupersetDf() == 0) {
                hasMissingBackgroundTerms = true;
                break;
            }
        }
        assertTrue(hasMissingBackgroundTerms);
    }

    public void testFilteredAnalysis() throws Exception {
        SearchResponse response = client().prepareSearch("test")
                .setSearchType(SearchType.QUERY_THEN_FETCH)
                .setQuery(new TermQueryBuilder("description", "weller"))
                .setFrom(0).setSize(60).setExplain(true)
                .addAggregation(significantTerms("mySignificantTerms").field("description")
                        .minDocCount(1).backgroundFilter(QueryBuilders.termsQuery("description", "paul")))
                .execute()
                .actionGet();
        assertSearchResponse(response);
        SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
        HashSet<String> topWords = new HashSet<>();
        for (Bucket topTerm : topTerms) {
            topWords.add(topTerm.getKeyAsString());
        }
        // The word "paul" should be a constant across all docs in the background set and therefore not seen as significant.
        assertFalse(topWords.contains("paul"));
        // "Weller" is the only Paul who was in The Jam, so "jam" should be identified
        // as a differentiator from the background of all other Pauls.
        assertTrue(topWords.contains("jam"));
    }

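    // Nesting significant_terms under a terms aggregation yields the
    // distinguishing keywords for each fact_category bucket in one request.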
    public void testNestedAggs() throws Exception {
        String[][] expectedKeywordsByCategory = {
                { "paul", "weller", "jam", "style", "council" },
                { "paul", "smith" },
                { "craig", "kelly", "terje", "haakonsen", "burton" }};
        SearchResponse response = client().prepareSearch("test")
                .setSearchType(SearchType.QUERY_THEN_FETCH)
                .addAggregation(terms("myCategories").field("fact_category").minDocCount(2)
                        .subAggregation(
                                significantTerms("mySignificantTerms").field("description")
                                        .executionHint(randomExecutionHint())
                                        .minDocCount(2)))
                .execute()
                .actionGet();
        assertSearchResponse(response);
        Terms topCategoryTerms = response.getAggregations().get("myCategories");
        for (org.elasticsearch.search.aggregations.bucket.terms.Terms.Bucket topCategory : topCategoryTerms.getBuckets()) {
            SignificantTerms topTerms = topCategory.getAggregations().get("mySignificantTerms");
            HashSet<String> foundTopWords = new HashSet<>();
            for (Bucket topTerm : topTerms) {
                foundTopWords.add(topTerm.getKeyAsString());
            }
            String[] expectedKeywords = expectedKeywordsByCategory[Integer.parseInt(topCategory.getKeyAsString()) - 1];
            for (String expectedKeyword : expectedKeywords) {
                assertTrue(expectedKeyword + " missing from category keywords", foundTopWords.contains(expectedKeyword));
            }
        }
    }

    public void testPartiallyUnmapped() throws Exception {
        SearchResponse response = client().prepareSearch("idx_unmapped", "test")
                .setSearchType(SearchType.QUERY_THEN_FETCH)
                .setQuery(new TermQueryBuilder("description", "terje"))
                .setFrom(0).setSize(60).setExplain(true)
                .addAggregation(significantTerms("mySignificantTerms").field("description")
                        .executionHint(randomExecutionHint())
                        .minDocCount(2))
                .execute()
                .actionGet();
        assertSearchResponse(response);
        SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
        checkExpectedStringTermsFound(topTerms);
    }

    public void testPartiallyUnmappedWithFormat() throws Exception {
        SearchResponse response = client().prepareSearch("idx_unmapped", "test")
                .setSearchType(SearchType.QUERY_THEN_FETCH)
                .setQuery(boolQuery().should(termQuery("description", "the")).should(termQuery("description", "terje")))
                .setFrom(0).setSize(60).setExplain(true)
                .addAggregation(significantTerms("mySignificantTerms")
                        .field("fact_category")
                        .executionHint(randomExecutionHint())
                        .minDocCount(1)
                        .format("0000"))
                .execute()
                .actionGet();
        assertSearchResponse(response);
        SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
        for (int i = 1; i <= 3; i++) {
            String key = String.format(Locale.ROOT, "%04d", i);
            SignificantTerms.Bucket bucket = topTerms.getBucketByKey(key);
            assertThat(bucket, notNullValue());
            assertThat(bucket.getKeyAsString(), equalTo(key));
        }
    }

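    // getSubsetDf() is the term's document frequency within the foreground
    // (query) set; getSupersetDf() is its frequency in the background index.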
    private void checkExpectedStringTermsFound(SignificantTerms topTerms) {
        HashMap<String, Bucket> topWords = new HashMap<>();
        for (Bucket topTerm : topTerms) {
            topWords.put(topTerm.getKeyAsString(), topTerm);
        }
        assertTrue(topWords.containsKey("haakonsen"));
        assertTrue(topWords.containsKey("craig"));
        assertTrue(topWords.containsKey("kelly"));
        assertTrue(topWords.containsKey("burton"));
        assertTrue(topWords.containsKey("snowboards"));
        Bucket kellyTerm = topWords.get("kelly");
        assertEquals(3, kellyTerm.getSubsetDf());
        assertEquals(4, kellyTerm.getSupersetDf());
    }

    public void testDefaultSignificanceHeuristic() throws Exception {
        SearchResponse response = client().prepareSearch("test")
                .setSearchType(SearchType.QUERY_THEN_FETCH)
                .setQuery(new TermQueryBuilder("description", "terje"))
                .setFrom(0).setSize(60).setExplain(true)
                .addAggregation(significantTerms("mySignificantTerms")
                        .field("description")
                        .executionHint(randomExecutionHint())
                        .significanceHeuristic(new JLHScore())
                        .minDocCount(2))
                .execute()
                .actionGet();
        assertSearchResponse(response);
        SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
        checkExpectedStringTermsFound(topTerms);
    }

    public void testMutualInformation() throws Exception {
        SearchResponse response = client().prepareSearch("test")
                .setSearchType(SearchType.QUERY_THEN_FETCH)
                .setQuery(new TermQueryBuilder("description", "terje"))
                .setFrom(0).setSize(60).setExplain(true)
                .addAggregation(significantTerms("mySignificantTerms")
                        .field("description")
                        .executionHint(randomExecutionHint())
                        .significanceHeuristic(new MutualInformation(false, true))
                        .minDocCount(1))
                .execute()
                .actionGet();
        assertSearchResponse(response);
        SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
        checkExpectedStringTermsFound(topTerms);
    }

    public void testFailIfFieldNotIndexed() {
        SearchPhaseExecutionException e = expectThrows(SearchPhaseExecutionException.class,
                () -> client().prepareSearch("test_not_indexed").addAggregation(
                        significantTerms("mySignificantTerms").field("my_keyword")).get());
        assertThat(e.toString(),
                containsString("Cannot search on field [my_keyword] since it is not indexed."));

        e = expectThrows(SearchPhaseExecutionException.class,
                () -> client().prepareSearch("test_not_indexed").addAggregation(
                        significantTerms("mySignificantTerms").field("my_long")).get());
        assertThat(e.toString(),
                containsString("Cannot search on field [my_long] since it is not indexed."));
    }
}

@@ -135,7 +135,7 @@ public class SignificanceHeuristicTests extends ESTestCase {
        }
    }

-    SignificanceHeuristic getRandomSignificanceheuristic() {
+    public static SignificanceHeuristic getRandomSignificanceheuristic() {
        List<SignificanceHeuristic> heuristics = new ArrayList<>();
        heuristics.add(new JLHScore());
        heuristics.add(new MutualInformation(randomBoolean(), randomBoolean()));

@@ -19,23 +19,43 @@
package org.elasticsearch.search.aggregations.bucket.significant;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.index.analysis.AnalyzerScope;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.index.mapper.KeywordFieldMapper;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.NumberFieldMapper;
import org.elasticsearch.index.mapper.NumberFieldMapper.NumberFieldType;
import org.elasticsearch.index.mapper.NumberFieldMapper.NumberType;
import org.elasticsearch.index.mapper.TextFieldMapper.TextFieldType;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.aggregations.AggregatorFactory;
import org.elasticsearch.search.aggregations.AggregatorTestCase;
import org.elasticsearch.search.aggregations.bucket.significant.SignificantTermsAggregatorFactory.ExecutionMode;
import org.elasticsearch.search.aggregations.bucket.terms.support.IncludeExclude;
import org.elasticsearch.search.aggregations.support.ValueType;
import org.hamcrest.Matchers;
import org.junit.Before;

import java.io.IOException;
import java.util.List;

public class SignificantTermsAggregatorTests extends AggregatorTestCase {

@@ -71,5 +91,199 @@ public class SignificantTermsAggregatorTests extends AggregatorTestCase {
        // be 0
        assertEquals(1, ((BooleanQuery) parsedQuery).getMinimumNumberShouldMatch());
    }

    /**
     * Uses the significant terms aggregation to find the keywords in text fields
     */
    public void testSignificance() throws IOException {
        TextFieldType textFieldType = new TextFieldType();
        textFieldType.setName("text");
        textFieldType.setFielddata(true);
        textFieldType.setIndexAnalyzer(new NamedAnalyzer("my_analyzer", AnalyzerScope.GLOBAL, new StandardAnalyzer()));

        IndexWriterConfig indexWriterConfig = newIndexWriterConfig();
        try (Directory dir = newDirectory(); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) {
            addMixedTextDocs(textFieldType, w);

            SignificantTermsAggregationBuilder sigAgg = new SignificantTermsAggregationBuilder("sig_text", null).field("text");
            sigAgg.executionHint(randomExecutionHint());
            if (randomBoolean()) {
                // Use a background filter which just happens to have the same scope as the whole index.
                sigAgg.backgroundFilter(QueryBuilders.termsQuery("text", "common"));
            }

            SignificantTermsAggregationBuilder sigNumAgg = new SignificantTermsAggregationBuilder("sig_number", null).field("long_field");
            sigNumAgg.executionHint(randomExecutionHint());

            try (IndexReader reader = DirectoryReader.open(w)) {
                IndexSearcher searcher = new IndexSearcher(reader);

                // Search "odd"
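                // searchAndReduce, from AggregatorTestCase, runs the aggregation over the
                // searcher and reduces the partial results, standing in for the shard-search
                // and coordinator-reduce phases of a full request.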
                SignificantTerms terms = searchAndReduce(searcher, new TermQuery(new Term("text", "odd")), sigAgg, textFieldType);

                assertEquals(1, terms.getBuckets().size());
                assertNull(terms.getBucketByKey("even"));
                assertNull(terms.getBucketByKey("common"));
                assertNotNull(terms.getBucketByKey("odd"));

                // Search "even"
                terms = searchAndReduce(searcher, new TermQuery(new Term("text", "even")), sigAgg, textFieldType);

                assertEquals(1, terms.getBuckets().size());
                assertNull(terms.getBucketByKey("odd"));
                assertNull(terms.getBucketByKey("common"));
                assertNotNull(terms.getBucketByKey("even"));

                // Search "odd" with a regex include/exclude
                sigAgg.includeExclude(new IncludeExclude("o.d", null));
                terms = searchAndReduce(searcher, new TermQuery(new Term("text", "odd")), sigAgg, textFieldType);
                assertEquals(1, terms.getBuckets().size());
                assertNotNull(terms.getBucketByKey("odd"));
                assertNull(terms.getBucketByKey("common"));
                assertNull(terms.getBucketByKey("even"));

                // Search with string-based include/excludes
                String[] oddStrings = new String[] {"odd", "weird"};
                String[] evenStrings = new String[] {"even", "regular"};

                sigAgg.includeExclude(new IncludeExclude(oddStrings, evenStrings));
                sigAgg.significanceHeuristic(SignificanceHeuristicTests.getRandomSignificanceheuristic());
                terms = searchAndReduce(searcher, new TermQuery(new Term("text", "odd")), sigAgg, textFieldType);
                assertEquals(1, terms.getBuckets().size());
                assertNotNull(terms.getBucketByKey("odd"));
                assertNull(terms.getBucketByKey("weird"));
                assertNull(terms.getBucketByKey("common"));
                assertNull(terms.getBucketByKey("even"));
                assertNull(terms.getBucketByKey("regular"));

                sigAgg.includeExclude(new IncludeExclude(evenStrings, oddStrings));
                terms = searchAndReduce(searcher, new TermQuery(new Term("text", "odd")), sigAgg, textFieldType);
                assertEquals(0, terms.getBuckets().size());
                assertNull(terms.getBucketByKey("odd"));
                assertNull(terms.getBucketByKey("weird"));
                assertNull(terms.getBucketByKey("common"));
                assertNull(terms.getBucketByKey("even"));
                assertNull(terms.getBucketByKey("regular"));
            }
        }
    }

    /**
     * Uses the significant terms aggregation to find the keywords in numeric
     * fields
     */
    public void testNumericSignificance() throws IOException {
        NumberFieldType longFieldType = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.LONG);
        longFieldType.setName("long_field");

        TextFieldType textFieldType = new TextFieldType();
        textFieldType.setName("text");
        textFieldType.setIndexAnalyzer(new NamedAnalyzer("my_analyzer", AnalyzerScope.GLOBAL, new StandardAnalyzer()));

        IndexWriterConfig indexWriterConfig = newIndexWriterConfig();
        final long ODD_VALUE = 3;
        final long EVEN_VALUE = 6;
        final long COMMON_VALUE = 2;

        try (Directory dir = newDirectory(); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) {

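            // Docs with an even index get ODD_VALUE plus the text "odd"; the rest get
            // EVEN_VALUE plus "even"; every doc also carries COMMON_VALUE. Only the
            // disjointness of the two groups matters to the assertions below.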
            for (int i = 0; i < 10; i++) {
                Document doc = new Document();
                if (i % 2 == 0) {
                    addFields(doc, NumberType.LONG.createFields("long_field", ODD_VALUE, true, true, false));
                    doc.add(new Field("text", "odd", textFieldType));
                } else {
                    addFields(doc, NumberType.LONG.createFields("long_field", EVEN_VALUE, true, true, false));
                    doc.add(new Field("text", "even", textFieldType));
                }
                addFields(doc, NumberType.LONG.createFields("long_field", COMMON_VALUE, true, true, false));
                w.addDocument(doc);
            }

            SignificantTermsAggregationBuilder sigNumAgg = new SignificantTermsAggregationBuilder("sig_number", null).field("long_field");
            sigNumAgg.executionHint(randomExecutionHint());

            try (IndexReader reader = DirectoryReader.open(w)) {
                IndexSearcher searcher = new IndexSearcher(reader);

                // Search "odd"
                SignificantLongTerms terms = searchAndReduce(searcher, new TermQuery(new Term("text", "odd")), sigNumAgg, longFieldType);
                assertEquals(1, terms.getBuckets().size());

                assertNull(terms.getBucketByKey(Long.toString(EVEN_VALUE)));
                assertNull(terms.getBucketByKey(Long.toString(COMMON_VALUE)));
                assertNotNull(terms.getBucketByKey(Long.toString(ODD_VALUE)));

                // Search "even"
                terms = searchAndReduce(searcher, new TermQuery(new Term("text", "even")), sigNumAgg, longFieldType);
                assertEquals(1, terms.getBuckets().size());

                assertNull(terms.getBucketByKey(Long.toString(ODD_VALUE)));
                assertNull(terms.getBucketByKey(Long.toString(COMMON_VALUE)));
                assertNotNull(terms.getBucketByKey(Long.toString(EVEN_VALUE)));
            }
        }
    }

    /**
     * Uses the significant terms aggregation on an index with an unmapped field
     */
    public void testUnmapped() throws IOException {
        TextFieldType textFieldType = new TextFieldType();
        textFieldType.setName("text");
        textFieldType.setFielddata(true);
        textFieldType.setIndexAnalyzer(new NamedAnalyzer("my_analyzer", AnalyzerScope.GLOBAL, new StandardAnalyzer()));

        IndexWriterConfig indexWriterConfig = newIndexWriterConfig();
        try (Directory dir = newDirectory(); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) {
            addMixedTextDocs(textFieldType, w);

            // Attempt aggregation on the unmapped field
            SignificantTermsAggregationBuilder sigAgg = new SignificantTermsAggregationBuilder("sig_text", null).field("unmapped_field");
            sigAgg.executionHint(randomExecutionHint());

            try (IndexReader reader = DirectoryReader.open(w)) {
                IndexSearcher searcher = new IndexSearcher(reader);

                // Search "odd"
                SignificantTerms terms = searchAndReduce(searcher, new TermQuery(new Term("text", "odd")), sigAgg, textFieldType);
                assertEquals(0, terms.getBuckets().size());

                assertNull(terms.getBucketByKey("even"));
                assertNull(terms.getBucketByKey("common"));
                assertNull(terms.getBucketByKey("odd"));
            }
        }
    }

    private void addMixedTextDocs(TextFieldType textFieldType, IndexWriter w) throws IOException {
        for (int i = 0; i < 10; i++) {
            Document doc = new Document();
            StringBuilder text = new StringBuilder("common ");
            if (i % 2 == 0) {
                text.append("odd ");
            } else {
                text.append("even ");
            }

            doc.add(new Field("text", text.toString(), textFieldType));
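            // Store a JSON _source that mirrors the indexed text field.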
            String json = "{ \"text\" : \"" + text.toString() + "\" }";
            doc.add(new StoredField("_source", new BytesRef(json)));

            w.addDocument(doc);
        }
    }

    private void addFields(Document doc, List<Field> createFields) {
        for (Field field : createFields) {
            doc.add(field);
        }
    }

    public String randomExecutionHint() {
        return randomBoolean() ? null : randomFrom(ExecutionMode.values()).toString();
    }

}