From 973530f953b193c797047286be3e07ca7dce17e8 Mon Sep 17 00:00:00 2001 From: markharwood Date: Fri, 23 Jun 2017 15:34:38 +0100 Subject: [PATCH] Added unit test coverage for SignificantTerms (#24904) Added unit test coverage for GlobalOrdinalsSignificantTermsAggregator, GlobalOrdinalsSignificantTermsAggregator.WithHash, SignificantLongTermsAggregator and SignificantStringTermsAggregator. Removed integration test. Relates #22278 --- .../bucket/SignificantTermsIT.java | 473 ------------------ .../SignificanceHeuristicTests.java | 2 +- .../SignificantTermsAggregatorTests.java | 214 ++++++++ 3 files changed, 215 insertions(+), 474 deletions(-) delete mode 100644 core/src/test/java/org/elasticsearch/search/aggregations/bucket/SignificantTermsIT.java diff --git a/core/src/test/java/org/elasticsearch/search/aggregations/bucket/SignificantTermsIT.java b/core/src/test/java/org/elasticsearch/search/aggregations/bucket/SignificantTermsIT.java deleted file mode 100644 index bff7471e863..00000000000 --- a/core/src/test/java/org/elasticsearch/search/aggregations/bucket/SignificantTermsIT.java +++ /dev/null @@ -1,473 +0,0 @@ -/* - * Licensed to Elasticsearch under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. Elasticsearch licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.elasticsearch.search.aggregations.bucket; - -import org.elasticsearch.action.admin.indices.refresh.RefreshRequest; -import org.elasticsearch.action.search.SearchPhaseExecutionException; -import org.elasticsearch.action.search.SearchResponse; -import org.elasticsearch.action.search.SearchType; -import org.elasticsearch.cluster.metadata.IndexMetaData; -import org.elasticsearch.common.settings.Settings; -import org.elasticsearch.index.query.QueryBuilders; -import org.elasticsearch.index.query.TermQueryBuilder; -import org.elasticsearch.search.aggregations.bucket.significant.SignificantTerms; -import org.elasticsearch.search.aggregations.bucket.significant.SignificantTerms.Bucket; -import org.elasticsearch.search.aggregations.bucket.significant.SignificantTermsAggregatorFactory.ExecutionMode; -import org.elasticsearch.search.aggregations.bucket.significant.heuristics.ChiSquare; -import org.elasticsearch.search.aggregations.bucket.significant.heuristics.GND; -import org.elasticsearch.search.aggregations.bucket.significant.heuristics.JLHScore; -import org.elasticsearch.search.aggregations.bucket.significant.heuristics.MutualInformation; -import org.elasticsearch.search.aggregations.bucket.significant.heuristics.PercentageScore; -import org.elasticsearch.search.aggregations.bucket.terms.Terms; -import org.elasticsearch.search.aggregations.bucket.terms.support.IncludeExclude; -import org.elasticsearch.test.ESIntegTestCase; - -import java.util.Arrays; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Locale; -import java.util.Set; - -import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_REPLICAS; -import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_SHARDS; -import static org.elasticsearch.index.query.QueryBuilders.boolQuery; -import static org.elasticsearch.index.query.QueryBuilders.termQuery; -import static 
org.elasticsearch.search.aggregations.AggregationBuilders.significantTerms; -import static org.elasticsearch.search.aggregations.AggregationBuilders.terms; -import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked; -import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertSearchResponse; -import static org.hamcrest.Matchers.containsString; -import static org.hamcrest.Matchers.equalTo; -import static org.hamcrest.Matchers.hasSize; -import static org.hamcrest.Matchers.is; -import static org.hamcrest.core.IsNull.notNullValue; - -@ESIntegTestCase.SuiteScopeTestCase -public class SignificantTermsIT extends ESIntegTestCase { - - public String randomExecutionHint() { - return randomBoolean() ? null : randomFrom(ExecutionMode.values()).toString(); - } - - @Override - public Settings indexSettings() { - return Settings.builder() - .put("index.number_of_shards", numberOfShards()) - .put("index.number_of_replicas", numberOfReplicas()) - .build(); - } - - public static final int MUSIC_CATEGORY=1; - public static final int OTHER_CATEGORY=2; - public static final int SNOWBOARDING_CATEGORY=3; - - @Override - public void setupSuiteScopeCluster() throws Exception { - assertAcked(prepareCreate("test").setSettings(SETTING_NUMBER_OF_SHARDS, 5, SETTING_NUMBER_OF_REPLICAS, 0).addMapping("fact", - "_routing", "required=true", "routing_id", "type=keyword", "fact_category", - "type=integer", "description", "type=text,fielddata=true")); - createIndex("idx_unmapped"); - - ensureGreen(); - String data[] = { - "A\t1\tpaul weller was lead singer of the jam before the style council", - "B\t1\tpaul weller left the jam to form the style council", - "A\t2\tpaul smith is a designer in the fashion industry", - "B\t1\tthe stranglers are a group originally from guildford", - "A\t1\tafter disbanding the style council in 1985 paul weller became a solo artist", - "B\t1\tjean jaques burnel is a bass player in the stranglers and has a black belt in karate", - 
"A\t1\tmalcolm owen was the lead singer of the ruts", - "B\t1\tpaul weller has denied any possibility of a reunion of the jam", - "A\t1\tformer frontman of the jam paul weller became the father of twins", - "B\t2\tex-england football star paul gascoigne has re-emerged following recent disappearance", - "A\t2\tdavid smith has recently denied connections with the mafia", - "B\t1\tthe damned's new rose single was considered the first 'punk' single in the UK", - "A\t1\tthe sex pistols broke up after a few short years together", - "B\t1\tpaul gascoigne was a midfielder for england football team", - "A\t3\tcraig kelly became the first world champion snowboarder and has a memorial at baldface lodge", - "B\t3\tterje haakonsen has credited craig kelly as his snowboard mentor", - "A\t3\tterje haakonsen and craig kelly were some of the first snowboarders sponsored by burton snowboards", - "B\t3\tlike craig kelly before him terje won the mt baker banked slalom many times - once riding switch", - "A\t3\tterje haakonsen has been a team rider for burton snowboards for over 20 years" - }; - - for (int i = 0; i < data.length; i++) { - String[] parts = data[i].split("\t"); - client().prepareIndex("test", "fact", "" + i) - .setRouting(parts[0]) - .setSource("fact_category", parts[1], "description", parts[2]).get(); - } - client().admin().indices().refresh(new RefreshRequest("test")).get(); - - assertAcked(prepareCreate("test_not_indexed") - .setSettings(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1) - .addMapping("type", - "my_keyword", "type=keyword,index=false", - "my_long", "type=long,index=false")); - indexRandom(true, - client().prepareIndex("test_not_indexed", "type", "1").setSource( - "my_keyword", "foo", "my_long", 42)); - } - - public void testStructuredAnalysis() throws Exception { - SearchResponse response = client().prepareSearch("test") - .setSearchType(SearchType.QUERY_THEN_FETCH) - .setQuery(new TermQueryBuilder("description", "terje")) - 
.setFrom(0).setSize(60).setExplain(true) - .addAggregation(significantTerms("mySignificantTerms").field("fact_category").executionHint(randomExecutionHint()) - .minDocCount(2)) - .execute() - .actionGet(); - assertSearchResponse(response); - SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms"); - Number topCategory = (Number) topTerms.getBuckets().iterator().next().getKey(); - assertTrue(topCategory.equals(Long.valueOf(SNOWBOARDING_CATEGORY))); - } - - public void testStructuredAnalysisWithIncludeExclude() throws Exception { - long[] excludeTerms = { MUSIC_CATEGORY }; - SearchResponse response = client().prepareSearch("test") - .setSearchType(SearchType.QUERY_THEN_FETCH) - .setQuery(new TermQueryBuilder("description", "paul")) - .setFrom(0).setSize(60).setExplain(true) - .addAggregation(significantTerms("mySignificantTerms").field("fact_category").executionHint(randomExecutionHint()) - .minDocCount(1).includeExclude(new IncludeExclude(null, excludeTerms))) - .execute() - .actionGet(); - assertSearchResponse(response); - SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms"); - Number topCategory = (Number) topTerms.getBuckets().iterator().next().getKey(); - assertTrue(topCategory.equals(Long.valueOf(OTHER_CATEGORY))); - } - - public void testIncludeExclude() throws Exception { - SearchResponse response = client().prepareSearch("test") - .setQuery(new TermQueryBuilder("description", "weller")) - .addAggregation(significantTerms("mySignificantTerms").field("description").executionHint(randomExecutionHint()) - .includeExclude(new IncludeExclude(null, "weller"))) - .get(); - assertSearchResponse(response); - SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms"); - Set terms = new HashSet<>(); - for (Bucket topTerm : topTerms) { - terms.add(topTerm.getKeyAsString()); - } - assertThat(terms, hasSize(6)); - assertThat(terms.contains("jam"), is(true)); - assertThat(terms.contains("council"), 
is(true)); - assertThat(terms.contains("style"), is(true)); - assertThat(terms.contains("paul"), is(true)); - assertThat(terms.contains("of"), is(true)); - assertThat(terms.contains("the"), is(true)); - - response = client().prepareSearch("test") - .setQuery(new TermQueryBuilder("description", "weller")) - .addAggregation(significantTerms("mySignificantTerms").field("description").executionHint(randomExecutionHint()) - .includeExclude(new IncludeExclude("weller", null))) - .get(); - assertSearchResponse(response); - topTerms = response.getAggregations().get("mySignificantTerms"); - terms = new HashSet<>(); - for (Bucket topTerm : topTerms) { - terms.add(topTerm.getKeyAsString()); - } - assertThat(terms, hasSize(1)); - assertThat(terms.contains("weller"), is(true)); - } - - public void testIncludeExcludeExactValues() throws Exception { - String []incExcTerms={"weller","nosuchterm"}; - SearchResponse response = client().prepareSearch("test") - .setQuery(new TermQueryBuilder("description", "weller")) - .addAggregation(significantTerms("mySignificantTerms").field("description").executionHint(randomExecutionHint()) - .includeExclude(new IncludeExclude(null, incExcTerms))) - .get(); - assertSearchResponse(response); - SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms"); - Set terms = new HashSet<>(); - for (Bucket topTerm : topTerms) { - terms.add(topTerm.getKeyAsString()); - } - assertEquals(new HashSet(Arrays.asList("jam", "council", "style", "paul", "of", "the")), terms); - - response = client().prepareSearch("test") - .setQuery(new TermQueryBuilder("description", "weller")) - .addAggregation(significantTerms("mySignificantTerms").field("description").executionHint(randomExecutionHint()) - .includeExclude(new IncludeExclude(incExcTerms, null))) - .get(); - assertSearchResponse(response); - topTerms = response.getAggregations().get("mySignificantTerms"); - terms = new HashSet<>(); - for (Bucket topTerm : topTerms) { - 
terms.add(topTerm.getKeyAsString()); - } - assertThat(terms, hasSize(1)); - assertThat(terms.contains("weller"), is(true)); - } - - public void testUnmapped() throws Exception { - SearchResponse response = client().prepareSearch("idx_unmapped") - .setSearchType(SearchType.QUERY_THEN_FETCH) - .setQuery(new TermQueryBuilder("description", "terje")) - .setFrom(0).setSize(60).setExplain(true) - .addAggregation(significantTerms("mySignificantTerms").field("fact_category").executionHint(randomExecutionHint()) - .minDocCount(2)) - .execute() - .actionGet(); - assertSearchResponse(response); - SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms"); - assertThat(topTerms.getBuckets().size(), equalTo(0)); - } - - public void testTextAnalysis() throws Exception { - SearchResponse response = client().prepareSearch("test") - .setSearchType(SearchType.QUERY_THEN_FETCH) - .setQuery(new TermQueryBuilder("description", "terje")) - .setFrom(0).setSize(60).setExplain(true) - .addAggregation(significantTerms("mySignificantTerms").field("description").executionHint(randomExecutionHint()) - .minDocCount(2)) - .execute() - .actionGet(); - assertSearchResponse(response); - SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms"); - checkExpectedStringTermsFound(topTerms); - } - - public void testTextAnalysisGND() throws Exception { - SearchResponse response = client().prepareSearch("test") - .setSearchType(SearchType.QUERY_THEN_FETCH) - .setQuery(new TermQueryBuilder("description", "terje")) - .setFrom(0).setSize(60).setExplain(true) - .addAggregation(significantTerms("mySignificantTerms").field("description").executionHint(randomExecutionHint()).significanceHeuristic(new GND(true)) - .minDocCount(2)) - .execute() - .actionGet(); - assertSearchResponse(response); - SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms"); - checkExpectedStringTermsFound(topTerms); - } - - public void testTextAnalysisChiSquare() 
throws Exception { - SearchResponse response = client().prepareSearch("test") - .setSearchType(SearchType.QUERY_THEN_FETCH) - .setQuery(new TermQueryBuilder("description", "terje")) - .setFrom(0).setSize(60).setExplain(true) - .addAggregation(significantTerms("mySignificantTerms").field("description").executionHint(randomExecutionHint()).significanceHeuristic(new ChiSquare(false,true)) - .minDocCount(2)) - .execute() - .actionGet(); - assertSearchResponse(response); - SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms"); - checkExpectedStringTermsFound(topTerms); - } - - public void testTextAnalysisPercentageScore() throws Exception { - SearchResponse response = client() - .prepareSearch("test") - .setSearchType(SearchType.QUERY_THEN_FETCH) - .setQuery(new TermQueryBuilder("description", "terje")) - .setFrom(0) - .setSize(60) - .setExplain(true) - .addAggregation( - significantTerms("mySignificantTerms").field("description").executionHint(randomExecutionHint()) - .significanceHeuristic(new PercentageScore()).minDocCount(2)).execute().actionGet(); - assertSearchResponse(response); - SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms"); - checkExpectedStringTermsFound(topTerms); - } - - public void testBadFilteredAnalysis() throws Exception { - // Deliberately using a bad choice of filter here for the background context in order - // to test robustness. - // We search for the name of a snowboarder but use music-related content (fact_category:1) - // as the background source of term statistics. 
- SearchResponse response = client().prepareSearch("test") - .setSearchType(SearchType.QUERY_THEN_FETCH) - .setQuery(new TermQueryBuilder("description", "terje")) - .setFrom(0).setSize(60).setExplain(true) - .addAggregation(significantTerms("mySignificantTerms").field("description") - .minDocCount(2).backgroundFilter(QueryBuilders.termQuery("fact_category", 1))) - .execute() - .actionGet(); - assertSearchResponse(response); - SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms"); - // We expect at least one of the significant terms to have been selected on the basis - // that it is present in the foreground selection but entirely missing from the filtered - // background used as context. - boolean hasMissingBackgroundTerms = false; - for (Bucket topTerm : topTerms) { - if (topTerm.getSupersetDf() == 0) { - hasMissingBackgroundTerms = true; - break; - } - } - assertTrue(hasMissingBackgroundTerms); - } - - public void testFilteredAnalysis() throws Exception { - SearchResponse response = client().prepareSearch("test") - .setSearchType(SearchType.QUERY_THEN_FETCH) - .setQuery(new TermQueryBuilder("description", "weller")) - .setFrom(0).setSize(60).setExplain(true) - .addAggregation(significantTerms("mySignificantTerms").field("description") - .minDocCount(1).backgroundFilter(QueryBuilders.termsQuery("description", "paul"))) - .execute() - .actionGet(); - assertSearchResponse(response); - SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms"); - HashSet topWords = new HashSet(); - for (Bucket topTerm : topTerms) { - topWords.add(topTerm.getKeyAsString()); - } - //The word "paul" should be a constant of all docs in the background set and therefore not seen as significant - assertFalse(topWords.contains("paul")); - //"Weller" is the only Paul who was in The Jam and therefore this should be identified as a differentiator from the background of all other Pauls. 
- assertTrue(topWords.contains("jam")); - } - - public void testNestedAggs() throws Exception { - String[][] expectedKeywordsByCategory={ - { "paul", "weller", "jam", "style", "council" }, - { "paul", "smith" }, - { "craig", "kelly", "terje", "haakonsen", "burton" }}; - SearchResponse response = client().prepareSearch("test") - .setSearchType(SearchType.QUERY_THEN_FETCH) - .addAggregation(terms("myCategories").field("fact_category").minDocCount(2) - .subAggregation( - significantTerms("mySignificantTerms").field("description") - .executionHint(randomExecutionHint()) - .minDocCount(2))) - .execute() - .actionGet(); - assertSearchResponse(response); - Terms topCategoryTerms = response.getAggregations().get("myCategories"); - for (org.elasticsearch.search.aggregations.bucket.terms.Terms.Bucket topCategory : topCategoryTerms.getBuckets()) { - SignificantTerms topTerms = topCategory.getAggregations().get("mySignificantTerms"); - HashSet foundTopWords = new HashSet(); - for (Bucket topTerm : topTerms) { - foundTopWords.add(topTerm.getKeyAsString()); - } - String[] expectedKeywords = expectedKeywordsByCategory[Integer.parseInt(topCategory.getKeyAsString()) - 1]; - for (String expectedKeyword : expectedKeywords) { - assertTrue(expectedKeyword + " missing from category keywords", foundTopWords.contains(expectedKeyword)); - } - } - } - - public void testPartiallyUnmapped() throws Exception { - SearchResponse response = client().prepareSearch("idx_unmapped", "test") - .setSearchType(SearchType.QUERY_THEN_FETCH) - .setQuery(new TermQueryBuilder("description", "terje")) - .setFrom(0).setSize(60).setExplain(true) - .addAggregation(significantTerms("mySignificantTerms").field("description") - .executionHint(randomExecutionHint()) - .minDocCount(2)) - .execute() - .actionGet(); - assertSearchResponse(response); - SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms"); - checkExpectedStringTermsFound(topTerms); - } - - public void 
testPartiallyUnmappedWithFormat() throws Exception { - SearchResponse response = client().prepareSearch("idx_unmapped", "test") - .setSearchType(SearchType.QUERY_THEN_FETCH) - .setQuery(boolQuery().should(termQuery("description", "the")).should(termQuery("description", "terje"))) - .setFrom(0).setSize(60).setExplain(true) - .addAggregation(significantTerms("mySignificantTerms") - .field("fact_category") - .executionHint(randomExecutionHint()) - .minDocCount(1) - .format("0000")) - .execute() - .actionGet(); - assertSearchResponse(response); - SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms"); - for (int i = 1; i <= 3; i++) { - String key = String.format(Locale.ROOT, "%04d", i); - SignificantTerms.Bucket bucket = topTerms.getBucketByKey(key); - assertThat(bucket, notNullValue()); - assertThat(bucket.getKeyAsString(), equalTo(key)); - } - } - - private void checkExpectedStringTermsFound(SignificantTerms topTerms) { - HashMap<String, Bucket>topWords=new HashMap<>(); - for (Bucket topTerm : topTerms ){ - topWords.put(topTerm.getKeyAsString(), topTerm); - } - assertTrue( topWords.containsKey("haakonsen")); - assertTrue( topWords.containsKey("craig")); - assertTrue( topWords.containsKey("kelly")); - assertTrue( topWords.containsKey("burton")); - assertTrue( topWords.containsKey("snowboards")); - Bucket kellyTerm=topWords.get("kelly"); - assertEquals(3, kellyTerm.getSubsetDf()); - assertEquals(4, kellyTerm.getSupersetDf()); - } - - public void testDefaultSignificanceHeuristic() throws Exception { - SearchResponse response = client().prepareSearch("test") - .setSearchType(SearchType.QUERY_THEN_FETCH) - .setQuery(new TermQueryBuilder("description", "terje")) - .setFrom(0).setSize(60).setExplain(true) - .addAggregation(significantTerms("mySignificantTerms") - .field("description") - .executionHint(randomExecutionHint()) - .significanceHeuristic(new JLHScore()) - .minDocCount(2)) - .execute() - .actionGet(); - assertSearchResponse(response); - SignificantTerms 
topTerms = response.getAggregations().get("mySignificantTerms"); - checkExpectedStringTermsFound(topTerms); - } - - public void testMutualInformation() throws Exception { - SearchResponse response = client().prepareSearch("test") - .setSearchType(SearchType.QUERY_THEN_FETCH) - .setQuery(new TermQueryBuilder("description", "terje")) - .setFrom(0).setSize(60).setExplain(true) - .addAggregation(significantTerms("mySignificantTerms") - .field("description") - .executionHint(randomExecutionHint()) - .significanceHeuristic(new MutualInformation(false, true)) - .minDocCount(1)) - .execute() - .actionGet(); - assertSearchResponse(response); - SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms"); - checkExpectedStringTermsFound(topTerms); - } - - public void testFailIfFieldNotIndexed() { - SearchPhaseExecutionException e = expectThrows(SearchPhaseExecutionException.class, - () -> client().prepareSearch("test_not_indexed").addAggregation( - significantTerms("mySignificantTerms").field("my_keyword")).get()); - assertThat(e.toString(), - containsString("Cannot search on field [my_keyword] since it is not indexed.")); - - e = expectThrows(SearchPhaseExecutionException.class, - () -> client().prepareSearch("test_not_indexed").addAggregation( - significantTerms("mySignificantTerms").field("my_long")).get()); - assertThat(e.toString(), - containsString("Cannot search on field [my_long] since it is not indexed.")); - } -} diff --git a/core/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificanceHeuristicTests.java b/core/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificanceHeuristicTests.java index 2dc208d89fb..9c6615f8ff9 100644 --- a/core/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificanceHeuristicTests.java +++ b/core/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificanceHeuristicTests.java @@ -135,7 +135,7 @@ public class 
SignificanceHeuristicTests extends ESTestCase { } } - SignificanceHeuristic getRandomSignificanceheuristic() { + public static SignificanceHeuristic getRandomSignificanceheuristic() { List heuristics = new ArrayList<>(); heuristics.add(new JLHScore()); heuristics.add(new MutualInformation(randomBoolean(), randomBoolean())); diff --git a/core/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorTests.java b/core/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorTests.java index e2625039df5..537af74bda1 100644 --- a/core/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorTests.java +++ b/core/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorTests.java @@ -19,23 +19,43 @@ package org.elasticsearch.search.aggregations.bucket.significant; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.StoredField; +import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.MultiReader; +import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.index.analysis.AnalyzerScope; +import org.elasticsearch.index.analysis.NamedAnalyzer; import org.elasticsearch.index.mapper.KeywordFieldMapper; import org.elasticsearch.index.mapper.MappedFieldType; +import org.elasticsearch.index.mapper.NumberFieldMapper; +import 
org.elasticsearch.index.mapper.NumberFieldMapper.NumberFieldType; +import org.elasticsearch.index.mapper.NumberFieldMapper.NumberType; +import org.elasticsearch.index.mapper.TextFieldMapper.TextFieldType; import org.elasticsearch.index.query.QueryBuilder; import org.elasticsearch.index.query.QueryBuilders; import org.elasticsearch.search.aggregations.AggregatorFactory; import org.elasticsearch.search.aggregations.AggregatorTestCase; +import org.elasticsearch.search.aggregations.bucket.significant.SignificantTermsAggregatorFactory.ExecutionMode; +import org.elasticsearch.search.aggregations.bucket.terms.support.IncludeExclude; import org.elasticsearch.search.aggregations.support.ValueType; import org.hamcrest.Matchers; import org.junit.Before; import java.io.IOException; +import java.util.List; public class SignificantTermsAggregatorTests extends AggregatorTestCase { @@ -71,5 +91,199 @@ public class SignificantTermsAggregatorTests extends AggregatorTestCase { // be 0 assertEquals(1, ((BooleanQuery) parsedQuery).getMinimumNumberShouldMatch()); } + + /** + * Uses the significant terms aggregation to find the keywords in text fields + */ + public void testSignificance() throws IOException { + TextFieldType textFieldType = new TextFieldType(); + textFieldType.setName("text"); + textFieldType.setFielddata(true); + textFieldType.setIndexAnalyzer(new NamedAnalyzer("my_analyzer", AnalyzerScope.GLOBAL, new StandardAnalyzer())); + + IndexWriterConfig indexWriterConfig = newIndexWriterConfig(); + try (Directory dir = newDirectory(); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { + addMixedTextDocs(textFieldType, w); + + SignificantTermsAggregationBuilder sigAgg = new SignificantTermsAggregationBuilder("sig_text", null).field("text"); + sigAgg.executionHint(randomExecutionHint()); + if (randomBoolean()) { + // Use a background filter which just happens to be same scope as whole-index. 
+ sigAgg.backgroundFilter(QueryBuilders.termsQuery("text", "common")); + } + + SignificantTermsAggregationBuilder sigNumAgg = new SignificantTermsAggregationBuilder("sig_number", null).field("long_field"); + sigNumAgg.executionHint(randomExecutionHint()); + + try (IndexReader reader = DirectoryReader.open(w)) { + IndexSearcher searcher = new IndexSearcher(reader); + + // Search "odd" + SignificantTerms terms = searchAndReduce(searcher, new TermQuery(new Term("text", "odd")), sigAgg, textFieldType); + + assertEquals(1, terms.getBuckets().size()); + assertNull(terms.getBucketByKey("even")); + assertNull(terms.getBucketByKey("common")); + assertNotNull(terms.getBucketByKey("odd")); + + // Search even + terms = searchAndReduce(searcher, new TermQuery(new Term("text", "even")), sigAgg, textFieldType); + + assertEquals(1, terms.getBuckets().size()); + assertNull(terms.getBucketByKey("odd")); + assertNull(terms.getBucketByKey("common")); + assertNotNull(terms.getBucketByKey("even")); + + // Search odd with regex includeexcludes + sigAgg.includeExclude(new IncludeExclude("o.d", null)); + terms = searchAndReduce(searcher, new TermQuery(new Term("text", "odd")), sigAgg, textFieldType); + assertEquals(1, terms.getBuckets().size()); + assertNotNull(terms.getBucketByKey("odd")); + assertNull(terms.getBucketByKey("common")); + assertNull(terms.getBucketByKey("even")); + + // Search with string-based includeexcludes + String oddStrings[] = new String[] {"odd", "weird"}; + String evenStrings[] = new String[] {"even", "regular"}; + + sigAgg.includeExclude(new IncludeExclude(oddStrings, evenStrings)); + sigAgg.significanceHeuristic(SignificanceHeuristicTests.getRandomSignificanceheuristic()); + terms = searchAndReduce(searcher, new TermQuery(new Term("text", "odd")), sigAgg, textFieldType); + assertEquals(1, terms.getBuckets().size()); + assertNotNull(terms.getBucketByKey("odd")); + assertNull(terms.getBucketByKey("weird")); + assertNull(terms.getBucketByKey("common")); + 
assertNull(terms.getBucketByKey("even")); + assertNull(terms.getBucketByKey("regular")); + + sigAgg.includeExclude(new IncludeExclude(evenStrings, oddStrings)); + terms = searchAndReduce(searcher, new TermQuery(new Term("text", "odd")), sigAgg, textFieldType); + assertEquals(0, terms.getBuckets().size()); + assertNull(terms.getBucketByKey("odd")); + assertNull(terms.getBucketByKey("weird")); + assertNull(terms.getBucketByKey("common")); + assertNull(terms.getBucketByKey("even")); + assertNull(terms.getBucketByKey("regular")); + + } + } + } + + /** + * Uses the significant terms aggregation to find the keywords in numeric + * fields + */ + public void testNumericSignificance() throws IOException { + NumberFieldType longFieldType = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.LONG); + longFieldType.setName("long_field"); + + TextFieldType textFieldType = new TextFieldType(); + textFieldType.setName("text"); + textFieldType.setIndexAnalyzer(new NamedAnalyzer("my_analyzer", AnalyzerScope.GLOBAL, new StandardAnalyzer())); + + IndexWriterConfig indexWriterConfig = newIndexWriterConfig(); + final long ODD_VALUE = 3; + final long EVEN_VALUE = 6; + final long COMMON_VALUE = 2; + + try (Directory dir = newDirectory(); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { + + for (int i = 0; i < 10; i++) { + Document doc = new Document(); + if (i % 2 == 0) { + addFields(doc, NumberType.LONG.createFields("long_field", ODD_VALUE, true, true, false)); + doc.add(new Field("text", "odd", textFieldType)); + } else { + addFields(doc, NumberType.LONG.createFields("long_field", EVEN_VALUE, true, true, false)); + doc.add(new Field("text", "even", textFieldType)); + } + addFields(doc, NumberType.LONG.createFields("long_field", COMMON_VALUE, true, true, false)); + w.addDocument(doc); + } + + SignificantTermsAggregationBuilder sigNumAgg = new SignificantTermsAggregationBuilder("sig_number", null).field("long_field"); + 
sigNumAgg.executionHint(randomExecutionHint()); + + try (IndexReader reader = DirectoryReader.open(w)) { + IndexSearcher searcher = new IndexSearcher(reader); + + // Search "odd" + SignificantLongTerms terms = searchAndReduce(searcher, new TermQuery(new Term("text", "odd")), sigNumAgg, longFieldType); + assertEquals(1, terms.getBuckets().size()); + + assertNull(terms.getBucketByKey(Long.toString(EVEN_VALUE))); + assertNull(terms.getBucketByKey(Long.toString(COMMON_VALUE))); + assertNotNull(terms.getBucketByKey(Long.toString(ODD_VALUE))); + + terms = searchAndReduce(searcher, new TermQuery(new Term("text", "even")), sigNumAgg, longFieldType); + assertEquals(1, terms.getBuckets().size()); + + assertNull(terms.getBucketByKey(Long.toString(ODD_VALUE))); + assertNull(terms.getBucketByKey(Long.toString(COMMON_VALUE))); + assertNotNull(terms.getBucketByKey(Long.toString(EVEN_VALUE))); + + } + } + } + + /** + * Uses the significant terms aggregation on an index with unmapped field + */ + public void testUnmapped() throws IOException { + TextFieldType textFieldType = new TextFieldType(); + textFieldType.setName("text"); + textFieldType.setFielddata(true); + textFieldType.setIndexAnalyzer(new NamedAnalyzer("my_analyzer", AnalyzerScope.GLOBAL, new StandardAnalyzer())); + + IndexWriterConfig indexWriterConfig = newIndexWriterConfig(); + try (Directory dir = newDirectory(); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { + addMixedTextDocs(textFieldType, w); + + // Attempt aggregation on unmapped field + SignificantTermsAggregationBuilder sigAgg = new SignificantTermsAggregationBuilder("sig_text", null).field("unmapped_field"); + sigAgg.executionHint(randomExecutionHint()); + + try (IndexReader reader = DirectoryReader.open(w)) { + IndexSearcher searcher = new IndexSearcher(reader); + + // Search "odd" + SignificantTerms terms = searchAndReduce(searcher, new TermQuery(new Term("text", "odd")), sigAgg, textFieldType); + assertEquals(0, terms.getBuckets().size()); + + 
assertNull(terms.getBucketByKey("even"));
+                assertNull(terms.getBucketByKey("common"));
+                assertNull(terms.getBucketByKey("odd"));
+
+            }
+        }
+    }
+
+    private void addMixedTextDocs(TextFieldType textFieldType, IndexWriter w) throws IOException {
+        for (int i = 0; i < 10; i++) { // ten docs; every one contains the term "common"
+            Document doc = new Document();
+            StringBuilder text = new StringBuilder("common ");
+            if (i % 2 == 0) { // NOTE: the label is arbitrary (even i gets "odd"); result is a 5/5 split
+                text.append("odd ");
+            } else {
+                text.append("even ");
+            }
+
+            doc.add(new Field("text", text.toString(), textFieldType));
+            String json = "{ \"text\" : \"" + text.toString() + "\" }";
+            doc.add(new StoredField("_source", new BytesRef(json))); // stored JSON copy of the text under "_source"
+
+            w.addDocument(doc);
+        }
+    }
+
+    private void addFields(Document doc, List<Field> createFields) {
+        for (Field field : createFields) {
+            doc.add(field);
+        }
+    }
+
+    public String randomExecutionHint() { // null (no hint) or the name of a randomly chosen ExecutionMode
+        return randomBoolean() ? null : randomFrom(ExecutionMode.values()).toString();
+    }
 }