From 428e70758ac6895ac995f4315412f4d3729aea9b Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Wed, 14 Jun 2017 01:26:36 +0200 Subject: [PATCH] Moved more token filters to analysis-common module. The following token filters were moved: `edge_ngram`, `ngram`, `uppercase`, `lowercase`, `length`, `flatten_graph` and `unique`. Relates to #23658 --- .../resources/checkstyle_suppressions.xml | 1 - .../indices/analysis/AnalysisModule.java | 16 -- .../highlight/HighlighterSearchIT.java | 49 ----- .../search/query/QueryStringIT.java | 30 ++- .../search/query/SimpleQueryStringIT.java | 10 +- .../query/all-query-index-with-all.json | 19 +- .../search/query/all-query-index.json | 20 +- .../analysis/common/CommonAnalysisPlugin.java | 14 +- .../common}/EdgeNGramTokenFilterFactory.java | 17 +- .../FlattenGraphTokenFilterFactory.java | 5 +- .../common}/LengthTokenFilterFactory.java | 7 +- .../common}/LowerCaseTokenFilterFactory.java | 6 +- .../common}/NGramTokenFilterFactory.java | 7 +- .../analysis/common}/UniqueTokenFilter.java | 8 +- .../common}/UniqueTokenFilterFactory.java | 6 +- .../common}/UpperCaseTokenFilterFactory.java | 4 +- .../common/CommonAnalysisFactoryTests.java | 23 ++- .../FlattenGraphTokenFilterFactoryTests.java | 6 +- .../common}/NGramTokenizerFactoryTests.java | 38 ++-- .../common}/UniqueTokenFilterTests.java | 2 +- .../test/analysis-common/40_token_filters.yml | 182 ++++++++++++++++++ .../test/search.query/20_ngram_search.yml | 41 ++++ .../search.query/30_ngram_highligthing.yml | 129 +++++++++++++ .../analysis/AnalysisFactoryTestCase.java | 26 +-- 24 files changed, 470 insertions(+), 196 deletions(-) rename {core/src/main/java/org/elasticsearch/index/analysis => modules/analysis-common/src/main/java/org/elasticsearch/analysis/common}/EdgeNGramTokenFilterFactory.java (92%) rename {core/src/main/java/org/elasticsearch/index/analysis => modules/analysis-common/src/main/java/org/elasticsearch/analysis/common}/FlattenGraphTokenFilterFactory.java (84%) 
rename {core/src/main/java/org/elasticsearch/index/analysis => modules/analysis-common/src/main/java/org/elasticsearch/analysis/common}/LengthTokenFilterFactory.java (88%) rename {core/src/main/java/org/elasticsearch/index/analysis => modules/analysis-common/src/main/java/org/elasticsearch/analysis/common}/LowerCaseTokenFilterFactory.java (89%) rename {core/src/main/java/org/elasticsearch/index/analysis => modules/analysis-common/src/main/java/org/elasticsearch/analysis/common}/NGramTokenFilterFactory.java (87%) rename {core/src/main/java/org/apache/lucene/analysis/miscellaneous => modules/analysis-common/src/main/java/org/elasticsearch/analysis/common}/UniqueTokenFilter.java (92%) rename {core/src/main/java/org/elasticsearch/index/analysis => modules/analysis-common/src/main/java/org/elasticsearch/analysis/common}/UniqueTokenFilterFactory.java (86%) rename {core/src/main/java/org/elasticsearch/index/analysis => modules/analysis-common/src/main/java/org/elasticsearch/analysis/common}/UpperCaseTokenFilterFactory.java (89%) rename {core/src/test/java/org/elasticsearch/index/analysis => modules/analysis-common/src/test/java/org/elasticsearch/analysis/common}/FlattenGraphTokenFilterFactoryTests.java (98%) rename {core/src/test/java/org/elasticsearch/index/analysis => modules/analysis-common/src/test/java/org/elasticsearch/analysis/common}/NGramTokenizerFactoryTests.java (85%) rename {core/src/test/java/org/apache/lucene/analysis/miscellaneous => modules/analysis-common/src/test/java/org/elasticsearch/analysis/common}/UniqueTokenFilterTests.java (97%) create mode 100644 modules/analysis-common/src/test/resources/rest-api-spec/test/search.query/20_ngram_search.yml create mode 100644 modules/analysis-common/src/test/resources/rest-api-spec/test/search.query/30_ngram_highligthing.yml diff --git a/buildSrc/src/main/resources/checkstyle_suppressions.xml b/buildSrc/src/main/resources/checkstyle_suppressions.xml index 678155c6561..caa4d6dec38 100644 --- 
a/buildSrc/src/main/resources/checkstyle_suppressions.xml +++ b/buildSrc/src/main/resources/checkstyle_suppressions.xml @@ -571,7 +571,6 @@ - diff --git a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java index 3f26b722f41..9220c063715 100644 --- a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java +++ b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java @@ -54,14 +54,12 @@ import org.elasticsearch.index.analysis.DecimalDigitFilterFactory; import org.elasticsearch.index.analysis.DelimitedPayloadTokenFilterFactory; import org.elasticsearch.index.analysis.DutchAnalyzerProvider; import org.elasticsearch.index.analysis.DutchStemTokenFilterFactory; -import org.elasticsearch.index.analysis.EdgeNGramTokenFilterFactory; import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory; import org.elasticsearch.index.analysis.ElisionTokenFilterFactory; import org.elasticsearch.index.analysis.EnglishAnalyzerProvider; import org.elasticsearch.index.analysis.FingerprintAnalyzerProvider; import org.elasticsearch.index.analysis.FingerprintTokenFilterFactory; import org.elasticsearch.index.analysis.FinnishAnalyzerProvider; -import org.elasticsearch.index.analysis.FlattenGraphTokenFilterFactory; import org.elasticsearch.index.analysis.FrenchAnalyzerProvider; import org.elasticsearch.index.analysis.FrenchStemTokenFilterFactory; import org.elasticsearch.index.analysis.GalicianAnalyzerProvider; @@ -83,14 +81,11 @@ import org.elasticsearch.index.analysis.KeepWordFilterFactory; import org.elasticsearch.index.analysis.KeywordAnalyzerProvider; import org.elasticsearch.index.analysis.KeywordTokenizerFactory; import org.elasticsearch.index.analysis.LatvianAnalyzerProvider; -import org.elasticsearch.index.analysis.LengthTokenFilterFactory; import org.elasticsearch.index.analysis.LetterTokenizerFactory; import 
org.elasticsearch.index.analysis.LimitTokenCountFilterFactory; import org.elasticsearch.index.analysis.LithuanianAnalyzerProvider; -import org.elasticsearch.index.analysis.LowerCaseTokenFilterFactory; import org.elasticsearch.index.analysis.LowerCaseTokenizerFactory; import org.elasticsearch.index.analysis.MinHashTokenFilterFactory; -import org.elasticsearch.index.analysis.NGramTokenFilterFactory; import org.elasticsearch.index.analysis.NGramTokenizerFactory; import org.elasticsearch.index.analysis.NorwegianAnalyzerProvider; import org.elasticsearch.index.analysis.PathHierarchyTokenizerFactory; @@ -133,8 +128,6 @@ import org.elasticsearch.index.analysis.TokenizerFactory; import org.elasticsearch.index.analysis.TruncateTokenFilterFactory; import org.elasticsearch.index.analysis.TurkishAnalyzerProvider; import org.elasticsearch.index.analysis.UAX29URLEmailTokenizerFactory; -import org.elasticsearch.index.analysis.UniqueTokenFilterFactory; -import org.elasticsearch.index.analysis.UpperCaseTokenFilterFactory; import org.elasticsearch.index.analysis.WhitespaceAnalyzerProvider; import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory; import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory; @@ -209,25 +202,16 @@ public final class AnalysisModule { NamedRegistry> tokenFilters = new NamedRegistry<>("token_filter"); tokenFilters.register("stop", StopTokenFilterFactory::new); tokenFilters.register("reverse", ReverseTokenFilterFactory::new); - tokenFilters.register("length", LengthTokenFilterFactory::new); - tokenFilters.register("lowercase", LowerCaseTokenFilterFactory::new); - tokenFilters.register("uppercase", UpperCaseTokenFilterFactory::new); tokenFilters.register("kstem", KStemTokenFilterFactory::new); tokenFilters.register("standard", StandardTokenFilterFactory::new); - tokenFilters.register("nGram", NGramTokenFilterFactory::new); - tokenFilters.register("ngram", NGramTokenFilterFactory::new); - 
tokenFilters.register("edgeNGram", EdgeNGramTokenFilterFactory::new); - tokenFilters.register("edge_ngram", EdgeNGramTokenFilterFactory::new); tokenFilters.register("shingle", ShingleTokenFilterFactory::new); tokenFilters.register("min_hash", MinHashTokenFilterFactory::new); - tokenFilters.register("unique", UniqueTokenFilterFactory::new); tokenFilters.register("truncate", requriesAnalysisSettings(TruncateTokenFilterFactory::new)); tokenFilters.register("limit", LimitTokenCountFilterFactory::new); tokenFilters.register("common_grams", requriesAnalysisSettings(CommonGramsTokenFilterFactory::new)); tokenFilters.register("stemmer", StemmerTokenFilterFactory::new); tokenFilters.register("delimited_payload_filter", DelimitedPayloadTokenFilterFactory::new); tokenFilters.register("elision", ElisionTokenFilterFactory::new); - tokenFilters.register("flatten_graph", FlattenGraphTokenFilterFactory::new); tokenFilters.register("keep", requriesAnalysisSettings(KeepWordFilterFactory::new)); tokenFilters.register("keep_types", requriesAnalysisSettings(KeepTypesFilterFactory::new)); tokenFilters.register("pattern_capture", requriesAnalysisSettings(PatternCaptureGroupTokenFilterFactory::new)); diff --git a/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java b/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java index 9cbd9fc5d75..2bc98b39dc2 100644 --- a/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java +++ b/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java @@ -19,7 +19,6 @@ package org.elasticsearch.search.fetch.subphase.highlight; import com.carrotsearch.randomizedtesting.generators.RandomPicks; - import org.apache.lucene.search.join.ScoreMode; import org.elasticsearch.action.index.IndexRequestBuilder; import org.elasticsearch.action.search.SearchRequestBuilder; @@ -214,54 +213,6 @@ public class 
HighlighterSearchIT extends ESIntegTestCase { assertHighlight(search, 0, "name", 0, startsWith("abc abc abc abc")); } - public void testNgramHighlighting() throws IOException { - assertAcked(prepareCreate("test") - .addMapping("test", - "name", "type=text,analyzer=name_index_analyzer,search_analyzer=name_search_analyzer," - + "term_vector=with_positions_offsets", - "name2", "type=text,analyzer=name2_index_analyzer,search_analyzer=name_search_analyzer," - + "term_vector=with_positions_offsets") - .setSettings(Settings.builder() - .put(indexSettings()) - .put("analysis.filter.my_ngram.max_gram", 20) - .put("analysis.filter.my_ngram.min_gram", 1) - .put("analysis.filter.my_ngram.type", "ngram") - .put("analysis.tokenizer.my_ngramt.max_gram", 20) - .put("analysis.tokenizer.my_ngramt.min_gram", 1) - .put("analysis.tokenizer.my_ngramt.token_chars", "letter,digit") - .put("analysis.tokenizer.my_ngramt.type", "ngram") - .put("analysis.analyzer.name_index_analyzer.tokenizer", "my_ngramt") - .put("analysis.analyzer.name2_index_analyzer.tokenizer", "whitespace") - .put("analysis.analyzer.name2_index_analyzer.filter", "my_ngram") - .put("analysis.analyzer.name_search_analyzer.tokenizer", "whitespace"))); - client().prepareIndex("test", "test", "1") - .setSource("name", "logicacmg ehemals avinci - the know how company", - "name2", "logicacmg ehemals avinci - the know how company").get(); - refresh(); - ensureGreen(); - SearchResponse search = client().prepareSearch().setQuery(matchQuery("name", "logica m")) - .highlighter(new HighlightBuilder().field("name")).get(); - assertHighlight(search, 0, "name", 0, - equalTo("logicacmg ehemals avinci - the know how company")); - - search = client().prepareSearch().setQuery(matchQuery("name", "logica ma")).highlighter(new HighlightBuilder().field("name")).get(); - assertHighlight(search, 0, "name", 0, equalTo("logicacmg ehemals avinci - the know how company")); - - search = client().prepareSearch().setQuery(matchQuery("name", 
"logica")).highlighter(new HighlightBuilder().field("name")).get(); - assertHighlight(search, 0, "name", 0, equalTo("logicacmg ehemals avinci - the know how company")); - - search = client().prepareSearch().setQuery(matchQuery("name2", "logica m")).highlighter(new HighlightBuilder().field("name2")) - .get(); - assertHighlight(search, 0, "name2", 0, equalTo("logicacmg ehemals avinci - the know how company")); - - search = client().prepareSearch().setQuery(matchQuery("name2", "logica ma")).highlighter(new HighlightBuilder().field("name2")) - .get(); - assertHighlight(search, 0, "name2", 0, equalTo("logicacmg ehemals avinci - the know how company")); - - search = client().prepareSearch().setQuery(matchQuery("name2", "logica")).highlighter(new HighlightBuilder().field("name2")).get(); - assertHighlight(search, 0, "name2", 0, equalTo("logicacmg ehemals avinci - the know how company")); - } - public void testEnsureNoNegativeOffsets() throws Exception { assertAcked(prepareCreate("test") .addMapping("type1", diff --git a/core/src/test/java/org/elasticsearch/search/query/QueryStringIT.java b/core/src/test/java/org/elasticsearch/search/query/QueryStringIT.java index 05a72276362..bd8cfbcaa5a 100644 --- a/core/src/test/java/org/elasticsearch/search/query/QueryStringIT.java +++ b/core/src/test/java/org/elasticsearch/search/query/QueryStringIT.java @@ -19,16 +19,6 @@ package org.elasticsearch.search.query; -import static org.elasticsearch.index.query.QueryBuilders.queryStringQuery; -import static org.elasticsearch.test.StreamsUtils.copyToStringFromClasspath; -import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked; -import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHitCount; -import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertNoSearchHits; -import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertSearchHits; -import static org.hamcrest.Matchers.containsInAnyOrder; -import static 
org.hamcrest.Matchers.containsString; -import static org.hamcrest.Matchers.equalTo; - import org.apache.lucene.util.LuceneTestCase; import org.elasticsearch.ExceptionsHelper; import org.elasticsearch.action.admin.indices.create.CreateIndexRequestBuilder; @@ -56,6 +46,16 @@ import java.util.HashSet; import java.util.List; import java.util.Set; +import static org.elasticsearch.index.query.QueryBuilders.queryStringQuery; +import static org.elasticsearch.test.StreamsUtils.copyToStringFromClasspath; +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked; +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHitCount; +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertNoSearchHits; +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertSearchHits; +import static org.hamcrest.Matchers.containsInAnyOrder; +import static org.hamcrest.Matchers.containsString; +import static org.hamcrest.Matchers.equalTo; + public class QueryStringIT extends ESIntegTestCase { @Override protected Collection> nodePlugins() { @@ -91,10 +91,6 @@ public class QueryStringIT extends ESIntegTestCase { resp = client().prepareSearch("test").setQuery(queryStringQuery("Bar")).get(); assertHitCount(resp, 3L); assertHits(resp.getHits(), "1", "2", "3"); - - resp = client().prepareSearch("test").setQuery(queryStringQuery("foa")).get(); - assertHitCount(resp, 1L); - assertHits(resp.getHits(), "3"); } public void testWithDate() throws Exception { @@ -161,8 +157,6 @@ public class QueryStringIT extends ESIntegTestCase { assertHits(resp.getHits(), "1"); resp = client().prepareSearch("test").setQuery(queryStringQuery("Baz")).get(); assertHits(resp.getHits(), "1"); - resp = client().prepareSearch("test").setQuery(queryStringQuery("sbaz")).get(); - assertHits(resp.getHits(), "1"); resp = client().prepareSearch("test").setQuery(queryStringQuery("19")).get(); assertHits(resp.getHits(), "1"); // nested doesn't match 
because it's hidden @@ -223,11 +217,11 @@ public class QueryStringIT extends ESIntegTestCase { indexRandom(true, false, reqs); SearchResponse resp = client().prepareSearch("test2").setQuery( - queryStringQuery("foo eggplent").defaultOperator(Operator.AND)).get(); + queryStringQuery("foo eggplant").defaultOperator(Operator.AND)).get(); assertHitCount(resp, 0L); resp = client().prepareSearch("test2").setQuery( - queryStringQuery("foo eggplent").defaultOperator(Operator.AND).useAllFields(true)).get(); + queryStringQuery("foo eggplant").defaultOperator(Operator.AND).useAllFields(true)).get(); assertHits(resp.getHits(), "1"); assertHitCount(resp, 1L); diff --git a/core/src/test/java/org/elasticsearch/search/query/SimpleQueryStringIT.java b/core/src/test/java/org/elasticsearch/search/query/SimpleQueryStringIT.java index f22ec392b99..a32a8060379 100644 --- a/core/src/test/java/org/elasticsearch/search/query/SimpleQueryStringIT.java +++ b/core/src/test/java/org/elasticsearch/search/query/SimpleQueryStringIT.java @@ -398,10 +398,6 @@ public class SimpleQueryStringIT extends ESIntegTestCase { resp = client().prepareSearch("test").setQuery(simpleQueryStringQuery("Bar")).get(); assertHitCount(resp, 3L); assertHits(resp.getHits(), "1", "2", "3"); - - resp = client().prepareSearch("test").setQuery(simpleQueryStringQuery("foa")).get(); - assertHitCount(resp, 1L); - assertHits(resp.getHits(), "3"); } public void testWithDate() throws Exception { @@ -480,8 +476,6 @@ public class SimpleQueryStringIT extends ESIntegTestCase { assertHits(resp.getHits(), "1"); resp = client().prepareSearch("test").setQuery(simpleQueryStringQuery("Baz")).get(); assertHits(resp.getHits(), "1"); - resp = client().prepareSearch("test").setQuery(simpleQueryStringQuery("sbaz")).get(); - assertHits(resp.getHits(), "1"); resp = client().prepareSearch("test").setQuery(simpleQueryStringQuery("19")).get(); assertHits(resp.getHits(), "1"); // nested doesn't match because it's hidden @@ -547,11 +541,11 @@ public 
class SimpleQueryStringIT extends ESIntegTestCase { indexRandom(true, false, reqs); SearchResponse resp = client().prepareSearch("test").setQuery( - simpleQueryStringQuery("foo eggplent").defaultOperator(Operator.AND)).get(); + simpleQueryStringQuery("foo eggplant").defaultOperator(Operator.AND)).get(); assertHitCount(resp, 0L); resp = client().prepareSearch("test").setQuery( - simpleQueryStringQuery("foo eggplent").defaultOperator(Operator.AND).useAllFields(true)).get(); + simpleQueryStringQuery("foo eggplant").defaultOperator(Operator.AND).useAllFields(true)).get(); assertHits(resp.getHits(), "1"); assertHitCount(resp, 1L); diff --git a/core/src/test/resources/org/elasticsearch/search/query/all-query-index-with-all.json b/core/src/test/resources/org/elasticsearch/search/query/all-query-index-with-all.json index 1a96fd71333..d9cbb485d13 100644 --- a/core/src/test/resources/org/elasticsearch/search/query/all-query-index-with-all.json +++ b/core/src/test/resources/org/elasticsearch/search/query/all-query-index-with-all.json @@ -6,22 +6,7 @@ "version": { "created": "5000099" }, - "analysis": { - "analyzer": { - "my_ngrams": { - "type": "custom", - "tokenizer": "standard", - "filter": ["my_ngrams"] - } - }, - "filter": { - "my_ngrams": { - "type": "ngram", - "min_gram": 2, - "max_gram": 2 - } - } - } + "query.default_field": "f1" } }, "mappings": { @@ -31,7 +16,7 @@ }, "properties": { "f1": {"type": "text"}, - "f2": {"type": "text", "analyzer": "my_ngrams"} + "f2": {"type": "text"} } } } diff --git a/core/src/test/resources/org/elasticsearch/search/query/all-query-index.json b/core/src/test/resources/org/elasticsearch/search/query/all-query-index.json index 86dde5aaf88..89c41217125 100644 --- a/core/src/test/resources/org/elasticsearch/search/query/all-query-index.json +++ b/core/src/test/resources/org/elasticsearch/search/query/all-query-index.json @@ -2,23 +2,7 @@ "settings": { "index": { "number_of_shards": 1, - "number_of_replicas": 0, - "analysis": { - 
"analyzer": { - "my_ngrams": { - "type": "custom", - "tokenizer": "standard", - "filter": ["my_ngrams"] - } - }, - "filter": { - "my_ngrams": { - "type": "ngram", - "min_gram": 2, - "max_gram": 2 - } - } - } + "number_of_replicas": 0 } }, "mappings": { @@ -26,7 +10,7 @@ "properties": { "f1": {"type": "text"}, "f2": {"type": "keyword"}, - "f3": {"type": "text", "analyzer": "my_ngrams"}, + "f3": {"type": "text"}, "f4": { "type": "text", "index_options": "docs" diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java index 2f8f1d7405a..6cf78044569 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java @@ -52,7 +52,6 @@ import org.apache.lucene.analysis.miscellaneous.ScandinavianFoldingFilter; import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilter; import org.apache.lucene.analysis.miscellaneous.TrimFilter; import org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter; -import org.apache.lucene.analysis.miscellaneous.UniqueTokenFilter; import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter; import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter; import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter; @@ -98,6 +97,15 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin { filters.put("trim", TrimTokenFilterFactory::new); filters.put("word_delimiter", WordDelimiterTokenFilterFactory::new); filters.put("word_delimiter_graph", WordDelimiterGraphTokenFilterFactory::new); + filters.put("unique", UniqueTokenFilterFactory::new); + filters.put("flatten_graph", FlattenGraphTokenFilterFactory::new); + filters.put("length", LengthTokenFilterFactory::new); + filters.put("lowercase", 
LowerCaseTokenFilterFactory::new); + filters.put("uppercase", UpperCaseTokenFilterFactory::new); + filters.put("nGram", NGramTokenFilterFactory::new); + filters.put("ngram", NGramTokenFilterFactory::new); + filters.put("edgeNGram", EdgeNGramTokenFilterFactory::new); + filters.put("edge_ngram", EdgeNGramTokenFilterFactory::new); return filters; } @@ -172,7 +180,7 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin { filters.add(PreConfiguredTokenFilter.singleton("nGram", false, NGramTokenFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("persian_normalization", true, PersianNormalizationFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("porter_stem", false, PorterStemFilter::new)); - filters.add(PreConfiguredTokenFilter.singleton("reverse", false, input -> new ReverseStringFilter(input))); + filters.add(PreConfiguredTokenFilter.singleton("reverse", false, ReverseStringFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("russian_stem", false, input -> new SnowballFilter(input, "Russian"))); filters.add(PreConfiguredTokenFilter.singleton("scandinavian_folding", true, ScandinavianFoldingFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("scandinavian_normalization", true, ScandinavianNormalizationFilter::new)); @@ -185,7 +193,7 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin { filters.add(PreConfiguredTokenFilter.singleton("trim", false, TrimFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("truncate", false, input -> new TruncateTokenFilter(input, 10))); filters.add(PreConfiguredTokenFilter.singleton("type_as_payload", false, TypeAsPayloadTokenFilter::new)); - filters.add(PreConfiguredTokenFilter.singleton("unique", false, input -> new UniqueTokenFilter(input))); + filters.add(PreConfiguredTokenFilter.singleton("unique", false, UniqueTokenFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("uppercase", true, UpperCaseFilter::new)); 
filters.add(PreConfiguredTokenFilter.singleton("word_delimiter", false, input -> new WordDelimiterFilter(input, diff --git a/core/src/main/java/org/elasticsearch/index/analysis/EdgeNGramTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EdgeNGramTokenFilterFactory.java similarity index 92% rename from core/src/main/java/org/elasticsearch/index/analysis/EdgeNGramTokenFilterFactory.java rename to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EdgeNGramTokenFilterFactory.java index 1d3b8e296ec..af6d30a0354 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/EdgeNGramTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EdgeNGramTokenFilterFactory.java @@ -17,7 +17,7 @@ * under the License. */ -package org.elasticsearch.index.analysis; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter; @@ -26,6 +26,7 @@ import org.apache.lucene.analysis.reverse.ReverseStringFilter; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; public class EdgeNGramTokenFilterFactory extends AbstractTokenFilterFactory { @@ -38,13 +39,13 @@ public class EdgeNGramTokenFilterFactory extends AbstractTokenFilterFactory { public static final int SIDE_BACK = 2; private final int side; - public EdgeNGramTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { + EdgeNGramTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { super(indexSettings, name, settings); this.minGram = settings.getAsInt("min_gram", NGramTokenFilter.DEFAULT_MIN_NGRAM_SIZE); this.maxGram = settings.getAsInt("max_gram", 
NGramTokenFilter.DEFAULT_MAX_NGRAM_SIZE); this.side = parseSide(settings.get("side", "front")); } - + static int parseSide(String side) { switch(side) { case "front": return SIDE_FRONT; @@ -56,19 +57,19 @@ public class EdgeNGramTokenFilterFactory extends AbstractTokenFilterFactory { @Override public TokenStream create(TokenStream tokenStream) { TokenStream result = tokenStream; - + // side=BACK is not supported anymore but applying ReverseStringFilter up-front and after the token filter has the same effect if (side == SIDE_BACK) { result = new ReverseStringFilter(result); } - + result = new EdgeNGramTokenFilter(result, minGram, maxGram); - + // side=BACK is not supported anymore but applying ReverseStringFilter up-front and after the token filter has the same effect if (side == SIDE_BACK) { result = new ReverseStringFilter(result); } - + return result; } @@ -76,4 +77,4 @@ public class EdgeNGramTokenFilterFactory extends AbstractTokenFilterFactory { public boolean breaksFastVectorHighlighter() { return true; } -} \ No newline at end of file +} diff --git a/core/src/main/java/org/elasticsearch/index/analysis/FlattenGraphTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/FlattenGraphTokenFilterFactory.java similarity index 84% rename from core/src/main/java/org/elasticsearch/index/analysis/FlattenGraphTokenFilterFactory.java rename to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/FlattenGraphTokenFilterFactory.java index 6c9487a2cb3..e59c23e4a6c 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/FlattenGraphTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/FlattenGraphTokenFilterFactory.java @@ -17,17 +17,18 @@ * under the License. 
*/ -package org.elasticsearch.index.analysis; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.core.FlattenGraphFilter; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; public class FlattenGraphTokenFilterFactory extends AbstractTokenFilterFactory { - public FlattenGraphTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { + FlattenGraphTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { super(indexSettings, name, settings); } diff --git a/core/src/main/java/org/elasticsearch/index/analysis/LengthTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/LengthTokenFilterFactory.java similarity index 88% rename from core/src/main/java/org/elasticsearch/index/analysis/LengthTokenFilterFactory.java rename to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/LengthTokenFilterFactory.java index 8a03802a7dd..477886d702b 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/LengthTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/LengthTokenFilterFactory.java @@ -17,23 +17,24 @@ * under the License. 
*/ -package org.elasticsearch.index.analysis; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.miscellaneous.LengthFilter; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; public class LengthTokenFilterFactory extends AbstractTokenFilterFactory { private final int min; private final int max; - + // ancient unsupported option private static final String ENABLE_POS_INC_KEY = "enable_position_increments"; - public LengthTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { + LengthTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { super(indexSettings, name, settings); min = settings.getAsInt("min", 0); max = settings.getAsInt("max", Integer.MAX_VALUE); diff --git a/core/src/main/java/org/elasticsearch/index/analysis/LowerCaseTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/LowerCaseTokenFilterFactory.java similarity index 89% rename from core/src/main/java/org/elasticsearch/index/analysis/LowerCaseTokenFilterFactory.java rename to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/LowerCaseTokenFilterFactory.java index 1d9ca2272b8..f85db0dae68 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/LowerCaseTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/LowerCaseTokenFilterFactory.java @@ -17,7 +17,7 @@ * under the License. 
*/ -package org.elasticsearch.index.analysis; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.LowerCaseFilter; import org.apache.lucene.analysis.TokenStream; @@ -27,6 +27,8 @@ import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; +import org.elasticsearch.index.analysis.MultiTermAwareComponent; /** * Factory for {@link LowerCaseFilter} and some language-specific variants @@ -41,7 +43,7 @@ public class LowerCaseTokenFilterFactory extends AbstractTokenFilterFactory impl private final String lang; - public LowerCaseTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { + LowerCaseTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { super(indexSettings, name, settings); this.lang = settings.get("language", null); } diff --git a/core/src/main/java/org/elasticsearch/index/analysis/NGramTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/NGramTokenFilterFactory.java similarity index 87% rename from core/src/main/java/org/elasticsearch/index/analysis/NGramTokenFilterFactory.java rename to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/NGramTokenFilterFactory.java index 7926f585bc3..2d7a8c52fd6 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/NGramTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/NGramTokenFilterFactory.java @@ -17,13 +17,14 @@ * under the License. 
*/ -package org.elasticsearch.index.analysis; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.ngram.NGramTokenFilter; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; public class NGramTokenFilterFactory extends AbstractTokenFilterFactory { @@ -33,7 +34,7 @@ public class NGramTokenFilterFactory extends AbstractTokenFilterFactory { private final int maxGram; - public NGramTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { + NGramTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { super(indexSettings, name, settings); this.minGram = settings.getAsInt("min_gram", NGramTokenFilter.DEFAULT_MIN_NGRAM_SIZE); this.maxGram = settings.getAsInt("max_gram", NGramTokenFilter.DEFAULT_MAX_NGRAM_SIZE); @@ -43,4 +44,4 @@ public class NGramTokenFilterFactory extends AbstractTokenFilterFactory { public TokenStream create(TokenStream tokenStream) { return new NGramTokenFilter(tokenStream, minGram, maxGram); } -} \ No newline at end of file +} diff --git a/core/src/main/java/org/apache/lucene/analysis/miscellaneous/UniqueTokenFilter.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/UniqueTokenFilter.java similarity index 92% rename from core/src/main/java/org/apache/lucene/analysis/miscellaneous/UniqueTokenFilter.java rename to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/UniqueTokenFilter.java index cc853932efc..ae2b03f5329 100644 --- a/core/src/main/java/org/apache/lucene/analysis/miscellaneous/UniqueTokenFilter.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/UniqueTokenFilter.java @@ -17,7 +17,7 @@ * under the License. 
*/ -package org.apache.lucene.analysis.miscellaneous; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.TokenFilter; @@ -31,7 +31,7 @@ import java.io.IOException; * A token filter that generates unique tokens. Can remove unique tokens only on the same * position increments as well. */ -public class UniqueTokenFilter extends TokenFilter { +class UniqueTokenFilter extends TokenFilter { private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class); private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class); @@ -39,11 +39,11 @@ public class UniqueTokenFilter extends TokenFilter { private final CharArraySet previous = new CharArraySet(8, false); private final boolean onlyOnSamePosition; - public UniqueTokenFilter(TokenStream in) { + UniqueTokenFilter(TokenStream in) { this(in, false); } - public UniqueTokenFilter(TokenStream in, boolean onlyOnSamePosition) { + UniqueTokenFilter(TokenStream in, boolean onlyOnSamePosition) { super(in); this.onlyOnSamePosition = onlyOnSamePosition; } diff --git a/core/src/main/java/org/elasticsearch/index/analysis/UniqueTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/UniqueTokenFilterFactory.java similarity index 86% rename from core/src/main/java/org/elasticsearch/index/analysis/UniqueTokenFilterFactory.java rename to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/UniqueTokenFilterFactory.java index 8606a60292c..256e3dad5c0 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/UniqueTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/UniqueTokenFilterFactory.java @@ -17,19 +17,19 @@ * under the License. 
*/ -package org.elasticsearch.index.analysis; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.miscellaneous.UniqueTokenFilter; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; public class UniqueTokenFilterFactory extends AbstractTokenFilterFactory { private final boolean onlyOnSamePosition; - public UniqueTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { + UniqueTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { super(indexSettings, name, settings); this.onlyOnSamePosition = settings.getAsBooleanLenientForPreEs6Indices( indexSettings.getIndexVersionCreated(), "only_on_same_position", false, deprecationLogger); diff --git a/core/src/main/java/org/elasticsearch/index/analysis/UpperCaseTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/UpperCaseTokenFilterFactory.java similarity index 89% rename from core/src/main/java/org/elasticsearch/index/analysis/UpperCaseTokenFilterFactory.java rename to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/UpperCaseTokenFilterFactory.java index 551345fc2e1..7923026d3da 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/UpperCaseTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/UpperCaseTokenFilterFactory.java @@ -17,13 +17,15 @@ * under the License. 
*/ -package org.elasticsearch.index.analysis; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.core.UpperCaseFilter; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; +import org.elasticsearch.index.analysis.MultiTermAwareComponent; public class UpperCaseTokenFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent { diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java index f7313572e13..f7c2a411fe1 100644 --- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java @@ -51,13 +51,22 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase { @Override protected Map> getTokenFilters() { Map> filters = new TreeMap<>(super.getTokenFilters()); - filters.put("asciifolding", ASCIIFoldingTokenFilterFactory.class); - filters.put("keywordmarker", KeywordMarkerTokenFilterFactory.class); - filters.put("porterstem", PorterStemTokenFilterFactory.class); - filters.put("snowballporter", SnowballTokenFilterFactory.class); - filters.put("trim", TrimTokenFilterFactory.class); - filters.put("worddelimiter", WordDelimiterTokenFilterFactory.class); - filters.put("worddelimitergraph", WordDelimiterGraphTokenFilterFactory.class); + filters.put("asciifolding", ASCIIFoldingTokenFilterFactory.class); + filters.put("keywordmarker", KeywordMarkerTokenFilterFactory.class); + filters.put("porterstem", PorterStemTokenFilterFactory.class); + filters.put("snowballporter", 
SnowballTokenFilterFactory.class); + filters.put("trim", TrimTokenFilterFactory.class); + filters.put("worddelimiter", WordDelimiterTokenFilterFactory.class); + filters.put("worddelimitergraph", WordDelimiterGraphTokenFilterFactory.class); + filters.put("flattengraph", FlattenGraphTokenFilterFactory.class); + filters.put("length", LengthTokenFilterFactory.class); + filters.put("greeklowercase", LowerCaseTokenFilterFactory.class); + filters.put("irishlowercase", LowerCaseTokenFilterFactory.class); + filters.put("lowercase", LowerCaseTokenFilterFactory.class); + filters.put("turkishlowercase", LowerCaseTokenFilterFactory.class); + filters.put("uppercase", UpperCaseTokenFilterFactory.class); + filters.put("ngram", NGramTokenFilterFactory.class); + filters.put("edgengram", EdgeNGramTokenFilterFactory.class); return filters; } diff --git a/core/src/test/java/org/elasticsearch/index/analysis/FlattenGraphTokenFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/FlattenGraphTokenFilterFactoryTests.java similarity index 98% rename from core/src/test/java/org/elasticsearch/index/analysis/FlattenGraphTokenFilterFactoryTests.java rename to modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/FlattenGraphTokenFilterFactoryTests.java index 259da010daa..fec7f73a697 100644 --- a/core/src/test/java/org/elasticsearch/index/analysis/FlattenGraphTokenFilterFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/FlattenGraphTokenFilterFactoryTests.java @@ -17,9 +17,7 @@ * under the License. 
*/ -package org.elasticsearch.index.analysis; - -import java.io.IOException; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.CannedTokenStream; import org.apache.lucene.analysis.Token; @@ -30,6 +28,8 @@ import org.elasticsearch.index.IndexSettings; import org.elasticsearch.test.ESTokenStreamTestCase; import org.elasticsearch.test.IndexSettingsModule; +import java.io.IOException; + public class FlattenGraphTokenFilterFactoryTests extends ESTokenStreamTestCase { public void testBasic() throws IOException { diff --git a/core/src/test/java/org/elasticsearch/index/analysis/NGramTokenizerFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/NGramTokenizerFactoryTests.java similarity index 85% rename from core/src/test/java/org/elasticsearch/index/analysis/NGramTokenizerFactoryTests.java rename to modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/NGramTokenizerFactoryTests.java index 5e1cf2e8179..24efd89b7e0 100644 --- a/core/src/test/java/org/elasticsearch/index/analysis/NGramTokenizerFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/NGramTokenizerFactoryTests.java @@ -17,7 +17,7 @@ * under the License. 
*/ -package org.elasticsearch.index.analysis; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.TokenStream; @@ -30,6 +30,8 @@ import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.settings.Settings.Builder; import org.elasticsearch.index.Index; import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory; +import org.elasticsearch.index.analysis.NGramTokenizerFactory; import org.elasticsearch.test.ESTokenStreamTestCase; import org.elasticsearch.test.IndexSettingsModule; @@ -52,7 +54,8 @@ public class NGramTokenizerFactoryTests extends ESTokenStreamTestCase { final Settings indexSettings = newAnalysisSettingsBuilder().build(); IndexSettings indexProperties = IndexSettingsModule.newIndexSettings(index, indexSettings); for (String tokenChars : Arrays.asList("letters", "number", "DIRECTIONALITY_UNDEFINED")) { - final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3).put("token_chars", tokenChars).build(); + final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3) + .put("token_chars", tokenChars).build(); try { new NGramTokenizerFactory(indexProperties, null, name, settings).create(); fail(); @@ -61,7 +64,8 @@ public class NGramTokenizerFactoryTests extends ESTokenStreamTestCase { } } for (String tokenChars : Arrays.asList("letter", " digit ", "punctuation", "DIGIT", "CoNtRoL", "dash_punctuation")) { - final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3).put("token_chars", tokenChars).build(); + final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3) + .put("token_chars", tokenChars).build(); indexProperties = IndexSettingsModule.newIndexSettings(index, indexSettings); new NGramTokenizerFactory(indexProperties, null, name, settings).create(); @@ -73,8 +77,10 @@ 
public class NGramTokenizerFactoryTests extends ESTokenStreamTestCase { final Index index = new Index("test", "_na_"); final String name = "ngr"; final Settings indexSettings = newAnalysisSettingsBuilder().build(); - final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 4).putArray("token_chars", new String[0]).build(); - Tokenizer tokenizer = new NGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create(); + final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 4) + .putArray("token_chars", new String[0]).build(); + Tokenizer tokenizer = new NGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings) + .create(); tokenizer.setReader(new StringReader("1.34")); assertTokenStreamContents(tokenizer, new String[] {"1.", "1.3", "1.34", ".3", ".34", "34"}); } @@ -84,12 +90,15 @@ public class NGramTokenizerFactoryTests extends ESTokenStreamTestCase { final Index index = new Index("test", "_na_"); final String name = "ngr"; final Settings indexSettings = newAnalysisSettingsBuilder().build(); - Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "letter,digit").build(); - Tokenizer tokenizer = new NGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create(); + Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3) + .put("token_chars", "letter,digit").build(); + Tokenizer tokenizer = new NGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings) + .create(); tokenizer.setReader(new StringReader("Åbc déf g\uD801\uDC00f ")); assertTokenStreamContents(tokenizer, new String[] {"Åb", "Åbc", "bc", "dé", "déf", "éf", "g\uD801\uDC00", "g\uD801\uDC00f", "\uD801\uDC00f"}); - settings = newAnalysisSettingsBuilder().put("min_gram", 
2).put("max_gram", 3).put("token_chars", "letter,digit,punctuation,whitespace,symbol").build(); + settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3) + .put("token_chars", "letter,digit,punctuation,whitespace,symbol").build(); tokenizer = new NGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create(); tokenizer.setReader(new StringReader(" a!$ 9")); assertTokenStreamContents(tokenizer, @@ -102,12 +111,15 @@ public class NGramTokenizerFactoryTests extends ESTokenStreamTestCase { final String name = "ngr"; final Settings indexSettings = newAnalysisSettingsBuilder().build(); Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "letter,digit").build(); - Tokenizer tokenizer = new EdgeNGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create(); + Tokenizer tokenizer = + new EdgeNGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create(); tokenizer.setReader(new StringReader("Åbc déf g\uD801\uDC00f ")); assertTokenStreamContents(tokenizer, new String[] {"Åb", "Åbc", "dé", "déf", "g\uD801\uDC00", "g\uD801\uDC00f"}); - settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "letter,digit,punctuation,whitespace,symbol").build(); - tokenizer = new EdgeNGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create(); + settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3) + .put("token_chars", "letter,digit,punctuation,whitespace,symbol").build(); + tokenizer = new EdgeNGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings) + .create(); tokenizer.setReader(new StringReader(" a!$ 9")); assertTokenStreamContents(tokenizer, new String[] {" a", " a!"}); @@ -128,7 +140,9 @@ public class 
NGramTokenizerFactoryTests extends ESTokenStreamTestCase { Settings indexSettings = newAnalysisSettingsBuilder().put(IndexMetaData.SETTING_VERSION_CREATED, v.id).build(); Tokenizer tokenizer = new MockTokenizer(); tokenizer.setReader(new StringReader("foo bar")); - TokenStream edgeNGramTokenFilter = new EdgeNGramTokenFilterFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create(tokenizer); + TokenStream edgeNGramTokenFilter = + new EdgeNGramTokenFilterFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings) + .create(tokenizer); if (reverse) { assertThat(edgeNGramTokenFilter, instanceOf(ReverseStringFilter.class)); } else { diff --git a/core/src/test/java/org/apache/lucene/analysis/miscellaneous/UniqueTokenFilterTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/UniqueTokenFilterTests.java similarity index 97% rename from core/src/test/java/org/apache/lucene/analysis/miscellaneous/UniqueTokenFilterTests.java rename to modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/UniqueTokenFilterTests.java index 324e422531b..f75822a13c4 100644 --- a/core/src/test/java/org/apache/lucene/analysis/miscellaneous/UniqueTokenFilterTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/UniqueTokenFilterTests.java @@ -17,7 +17,7 @@ * under the License. 
*/ -package org.apache.lucene.analysis.miscellaneous; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockTokenizer; diff --git a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/40_token_filters.yml b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/40_token_filters.yml index eb9dec65542..1d3075e28f8 100644 --- a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/40_token_filters.yml +++ b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/40_token_filters.yml @@ -210,3 +210,185 @@ - match: { detail.tokenfilters.0.tokens.5.start_offset: 16 } - match: { detail.tokenfilters.0.tokens.5.end_offset: 19 } - match: { detail.tokenfilters.0.tokens.5.position: 5 } + +--- +"unique": + - do: + indices.analyze: + body: + text: Foo Foo Bar! + tokenizer: whitespace + filter: [unique] + - length: { tokens: 2 } + - match: { tokens.0.token: Foo } + - match: { tokens.1.token: Bar! 
} + +--- +"synonym_graph and flatten_graph": + - do: + indices.create: + index: test + body: + settings: + analysis: + filter: + my_synonym_graph: + type: synonym_graph + synonyms: ["automatic teller machine,atm,cash point"] + + - do: + indices.analyze: + index: test + body: + text: this automatic teller machine is down + tokenizer: whitespace + filter: [my_synonym_graph] + - length: { tokens: 9 } + - match: { tokens.0.token: this } + - match: { tokens.0.position: 0 } + - is_false: tokens.0.positionLength + - match: { tokens.1.token: atm } + - match: { tokens.1.position: 1 } + - match: { tokens.1.positionLength: 4 } + - match: { tokens.2.token: cash } + - match: { tokens.2.position: 1 } + - is_false: tokens.2.positionLength + - match: { tokens.3.token: automatic } + - match: { tokens.3.position: 1 } + - match: { tokens.3.positionLength: 2 } + - match: { tokens.4.token: point } + - match: { tokens.4.position: 2 } + - match: { tokens.4.positionLength: 3 } + - match: { tokens.5.token: teller } + - match: { tokens.5.position: 3 } + - is_false: tokens.5.positionLength + - match: { tokens.6.token: machine } + - match: { tokens.6.position: 4 } + - is_false: tokens.6.positionLength + - match: { tokens.7.token: is } + - match: { tokens.7.position: 5 } + - is_false: tokens.7.positionLength + - match: { tokens.8.token: down } + - match: { tokens.8.position: 6 } + - is_false: tokens.8.positionLength + + - do: + indices.analyze: + index: test + body: + text: this automatic teller machine is down + tokenizer: whitespace + filter: [my_synonym_graph,flatten_graph] + - length: { tokens: 9 } + - match: { tokens.0.token: this } + - match: { tokens.0.position: 0 } + - is_false: tokens.0.positionLength + - match: { tokens.1.token: atm } + - match: { tokens.1.position: 1 } + - match: { tokens.1.positionLength: 3 } + - match: { tokens.2.token: cash } + - match: { tokens.2.position: 1 } + - is_false: tokens.2.positionLength + - match: { tokens.3.token: automatic } + - match: { 
tokens.3.position: 1 } + - is_false: tokens.3.positionLength + - match: { tokens.4.token: point } + - match: { tokens.4.position: 2 } + - match: { tokens.4.positionLength: 2 } + - match: { tokens.5.token: teller } + - match: { tokens.5.position: 2 } + - is_false: tokens.5.positionLength + - match: { tokens.6.token: machine } + - match: { tokens.6.position: 3 } + - is_false: tokens.6.positionLength + - match: { tokens.7.token: is } + - match: { tokens.7.position: 4 } + - is_false: tokens.7.positionLength + - match: { tokens.8.token: down } + - match: { tokens.8.position: 5 } + - is_false: tokens.8.positionLength + +--- +"length": + - do: + indices.create: + index: test + body: + settings: + analysis: + filter: + my_length: + type: length + min: 6 + - do: + indices.analyze: + index: test + body: + text: foo bar foobar + tokenizer: whitespace + filter: [my_length] + - length: { tokens: 1 } + - match: { tokens.0.token: foobar } + +--- +"uppercase": + - do: + indices.analyze: + body: + text: foobar + tokenizer: keyword + filter: [uppercase] + - length: { tokens: 1 } + - match: { tokens.0.token: FOOBAR } + +--- +"ngram": + - do: + indices.create: + index: test + body: + settings: + analysis: + filter: + my_ngram: + type: ngram + min_gram: 3 + max_gram: 3 + - do: + indices.analyze: + index: test + body: + text: foobar + tokenizer: keyword + filter: [my_ngram] + - length: { tokens: 4 } + - match: { tokens.0.token: foo } + - match: { tokens.1.token: oob } + - match: { tokens.2.token: oba } + - match: { tokens.3.token: bar } + +--- +"edge_ngram": + - do: + indices.create: + index: test + body: + settings: + analysis: + filter: + my_edge_ngram: + type: edge_ngram + min_gram: 3 + max_gram: 6 + - do: + indices.analyze: + index: test + body: + text: foobar + tokenizer: keyword + filter: [my_edge_ngram] + - length: { tokens: 4 } + - match: { tokens.0.token: foo } + - match: { tokens.1.token: foob } + - match: { tokens.2.token: fooba } + - match: { tokens.3.token: foobar } diff 
--git a/modules/analysis-common/src/test/resources/rest-api-spec/test/search.query/20_ngram_search.yml b/modules/analysis-common/src/test/resources/rest-api-spec/test/search.query/20_ngram_search.yml new file mode 100644 index 00000000000..eb8c9789a63 --- /dev/null +++ b/modules/analysis-common/src/test/resources/rest-api-spec/test/search.query/20_ngram_search.yml @@ -0,0 +1,41 @@ +"ngram search": + - do: + indices.create: + index: test + body: + settings: + number_of_shards: 1 + number_of_replicas: 0 + analysis: + analyzer: + my_analyzer: + tokenizer: standard + filter: [my_ngram] + filter: + my_ngram: + type: ngram + min_gram: 2 + max_gram: 2 + mappings: + doc: + properties: + text: + type: text + analyzer: my_analyzer + + - do: + index: + index: test + type: doc + id: 1 + body: { "text": "foo bar baz" } + refresh: true + + - do: + search: + body: + query: + match: + text: + query: foa + - match: {hits.total: 1} diff --git a/modules/analysis-common/src/test/resources/rest-api-spec/test/search.query/30_ngram_highligthing.yml b/modules/analysis-common/src/test/resources/rest-api-spec/test/search.query/30_ngram_highligthing.yml new file mode 100644 index 00000000000..b04496965eb --- /dev/null +++ b/modules/analysis-common/src/test/resources/rest-api-spec/test/search.query/30_ngram_highligthing.yml @@ -0,0 +1,129 @@ +"ngram highlighting": + - do: + indices.create: + index: test + body: + settings: + number_of_shards: 1 + number_of_replicas: 0 + analysis: + tokenizer: + my_ngramt: + type: ngram + min_gram: 1 + max_gram: 20 + token_chars: letter,digit + filter: + my_ngram: + type: ngram + min_gram: 1 + max_gram: 20 + analyzer: + name2_index_analyzer: + tokenizer: whitespace + filter: [my_ngram] + name_index_analyzer: + tokenizer: my_ngramt + name_search_analyzer: + tokenizer: whitespace + mappings: + doc: + properties: + name: + type: text + term_vector: with_positions_offsets + analyzer: name_index_analyzer + search_analyzer: name_search_analyzer + name2: + type: text +
term_vector: with_positions_offsets + analyzer: name2_index_analyzer + search_analyzer: name_search_analyzer + + - do: + index: + index: test + type: doc + id: 1 + refresh: true + body: + name: logicacmg ehemals avinci - the know how company + name2: logicacmg ehemals avinci - the know how company + + - do: + search: + body: + query: + match: + name: + query: logica m + highlight: + fields: + - name: {} + - match: {hits.total: 1} + - match: {hits.hits.0.highlight.name.0: "logicacmg ehemals avinci - the know how company"} + + - do: + search: + body: + query: + match: + name: + query: logica ma + highlight: + fields: + - name: {} + - match: {hits.total: 1} + - match: {hits.hits.0.highlight.name.0: "logicacmg ehemals avinci - the know how company"} + + - do: + search: + body: + query: + match: + name: + query: logica + highlight: + fields: + - name: {} + - match: {hits.total: 1} + - match: {hits.hits.0.highlight.name.0: "logicacmg ehemals avinci - the know how company"} + + - do: + search: + body: + query: + match: + name2: + query: logica m + highlight: + fields: + - name2: {} + - match: {hits.total: 1} + - match: {hits.hits.0.highlight.name2.0: "logicacmg ehemals avinci - the know how company"} + + - do: + search: + body: + query: + match: + name2: + query: logica ma + highlight: + fields: + - name2: {} + - match: {hits.total: 1} + - match: {hits.hits.0.highlight.name2.0: "logicacmg ehemals avinci - the know how company"} + + - do: + search: + body: + query: + match: + name2: + query: logica + highlight: + fields: + - name2: {} + - match: {hits.total: 1} + - match: {hits.hits.0.highlight.name2.0: "logicacmg ehemals avinci - the know how company"} diff --git a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java index 0c2a29224f8..76d170f7c2c 100644 --- 
a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java +++ b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java @@ -22,7 +22,6 @@ package org.elasticsearch.indices.analysis; import org.apache.lucene.analysis.util.CharFilterFactory; import org.apache.lucene.analysis.util.TokenFilterFactory; import org.apache.lucene.analysis.util.TokenizerFactory; -import org.elasticsearch.Version; import org.elasticsearch.common.collect.MapBuilder; import org.elasticsearch.index.analysis.ApostropheFilterFactory; import org.elasticsearch.index.analysis.ArabicNormalizationFilterFactory; @@ -36,10 +35,8 @@ import org.elasticsearch.index.analysis.CommonGramsTokenFilterFactory; import org.elasticsearch.index.analysis.CzechStemTokenFilterFactory; import org.elasticsearch.index.analysis.DecimalDigitFilterFactory; import org.elasticsearch.index.analysis.DelimitedPayloadTokenFilterFactory; -import org.elasticsearch.index.analysis.EdgeNGramTokenFilterFactory; import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory; import org.elasticsearch.index.analysis.ElisionTokenFilterFactory; -import org.elasticsearch.index.analysis.FlattenGraphTokenFilterFactory; import org.elasticsearch.index.analysis.GermanNormalizationFilterFactory; import org.elasticsearch.index.analysis.GermanStemTokenFilterFactory; import org.elasticsearch.index.analysis.HindiNormalizationFilterFactory; @@ -49,14 +46,11 @@ import org.elasticsearch.index.analysis.KStemTokenFilterFactory; import org.elasticsearch.index.analysis.KeepTypesFilterFactory; import org.elasticsearch.index.analysis.KeepWordFilterFactory; import org.elasticsearch.index.analysis.KeywordTokenizerFactory; -import org.elasticsearch.index.analysis.LengthTokenFilterFactory; import org.elasticsearch.index.analysis.LetterTokenizerFactory; import org.elasticsearch.index.analysis.LimitTokenCountFilterFactory; -import org.elasticsearch.index.analysis.LowerCaseTokenFilterFactory; 
import org.elasticsearch.index.analysis.LowerCaseTokenizerFactory; import org.elasticsearch.index.analysis.MinHashTokenFilterFactory; import org.elasticsearch.index.analysis.MultiTermAwareComponent; -import org.elasticsearch.index.analysis.NGramTokenFilterFactory; import org.elasticsearch.index.analysis.NGramTokenizerFactory; import org.elasticsearch.index.analysis.PathHierarchyTokenizerFactory; import org.elasticsearch.index.analysis.PatternCaptureGroupTokenFilterFactory; @@ -82,7 +76,6 @@ import org.elasticsearch.index.analysis.SynonymTokenFilterFactory; import org.elasticsearch.index.analysis.ThaiTokenizerFactory; import org.elasticsearch.index.analysis.TruncateTokenFilterFactory; import org.elasticsearch.index.analysis.UAX29URLEmailTokenizerFactory; -import org.elasticsearch.index.analysis.UpperCaseTokenFilterFactory; import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory; import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory; import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory; @@ -90,7 +83,6 @@ import org.elasticsearch.plugins.AnalysisPlugin; import org.elasticsearch.test.ESTestCase; import java.util.Collection; -import java.util.EnumMap; import java.util.HashMap; import java.util.HashSet; import java.util.Locale; @@ -165,7 +157,7 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase { .put("decimaldigit", DecimalDigitFilterFactory.class) .put("delimitedpayload", DelimitedPayloadTokenFilterFactory.class) .put("dictionarycompoundword", DictionaryCompoundWordTokenFilterFactory.class) - .put("edgengram", EdgeNGramTokenFilterFactory.class) + .put("edgengram", MovedToAnalysisCommon.class) .put("elision", ElisionTokenFilterFactory.class) .put("englishminimalstem", StemmerTokenFilterFactory.class) .put("englishpossessive", StemmerTokenFilterFactory.class) @@ -178,7 +170,7 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase { .put("germanlightstem", 
StemmerTokenFilterFactory.class) .put("germanminimalstem", StemmerTokenFilterFactory.class) .put("germannormalization", GermanNormalizationFilterFactory.class) - .put("greeklowercase", LowerCaseTokenFilterFactory.class) + .put("greeklowercase", MovedToAnalysisCommon.class) .put("greekstem", StemmerTokenFilterFactory.class) .put("hindinormalization", HindiNormalizationFilterFactory.class) .put("hindistem", StemmerTokenFilterFactory.class) @@ -186,17 +178,17 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase { .put("hunspellstem", HunspellTokenFilterFactory.class) .put("hyphenationcompoundword", HyphenationCompoundWordTokenFilterFactory.class) .put("indicnormalization", IndicNormalizationFilterFactory.class) - .put("irishlowercase", LowerCaseTokenFilterFactory.class) + .put("irishlowercase", MovedToAnalysisCommon.class) .put("indonesianstem", StemmerTokenFilterFactory.class) .put("italianlightstem", StemmerTokenFilterFactory.class) .put("keepword", KeepWordFilterFactory.class) .put("keywordmarker", MovedToAnalysisCommon.class) .put("kstem", KStemTokenFilterFactory.class) .put("latvianstem", StemmerTokenFilterFactory.class) - .put("length", LengthTokenFilterFactory.class) + .put("length", MovedToAnalysisCommon.class) .put("limittokencount", LimitTokenCountFilterFactory.class) - .put("lowercase", LowerCaseTokenFilterFactory.class) - .put("ngram", NGramTokenFilterFactory.class) + .put("lowercase", MovedToAnalysisCommon.class) + .put("ngram", MovedToAnalysisCommon.class) .put("norwegianlightstem", StemmerTokenFilterFactory.class) .put("norwegianminimalstem", StemmerTokenFilterFactory.class) .put("patterncapturegroup", PatternCaptureGroupTokenFilterFactory.class) @@ -225,12 +217,12 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase { .put("synonymgraph", SynonymGraphTokenFilterFactory.class) .put("trim", MovedToAnalysisCommon.class) .put("truncate", TruncateTokenFilterFactory.class) - .put("turkishlowercase", 
LowerCaseTokenFilterFactory.class) + .put("turkishlowercase", MovedToAnalysisCommon.class) .put("type", KeepTypesFilterFactory.class) - .put("uppercase", UpperCaseTokenFilterFactory.class) + .put("uppercase", MovedToAnalysisCommon.class) .put("worddelimiter", MovedToAnalysisCommon.class) .put("worddelimitergraph", MovedToAnalysisCommon.class) - .put("flattengraph", FlattenGraphTokenFilterFactory.class) + .put("flattengraph", MovedToAnalysisCommon.class) // TODO: these tokenfilters are not yet exposed: useful?