From a34f5fa8127595534d919646d73dd7a88c21fa65 Mon Sep 17 00:00:00 2001
From: Martijn van Groningen
Date: Fri, 23 Jun 2017 21:22:14 +0200
Subject: [PATCH] Move more token filters to analysis-common module

The following token filters were moved: stemmer, stemmer_override, kstem,
dictionary_decompounder, hyphenation_decompounder, reverse, elision and
truncate.

Relates to #23658
---
 .../resources/checkstyle_suppressions.xml     |   1 -
 ...bstractCompoundWordTokenFilterFactory.java |   4 +-
 .../indices/analysis/AnalysisModule.java      |  16 --
 .../indices/analysis/AnalysisModuleTests.java |  14 +-
 .../indices/analyze/AnalyzeActionIT.java      |   8 +-
 .../search/suggest/SuggestSearchIT.java       |  29 +--
 .../elasticsearch/index/analysis/test1.json   |   8 -
 .../elasticsearch/index/analysis/test1.yml    |   6 -
 .../analysis/common/CommonAnalysisPlugin.java |   8 +
 ...tionaryCompoundWordTokenFilterFactory.java |   4 +-
 .../common}/ElisionTokenFilterFactory.java    |   7 +-
 ...enationCompoundWordTokenFilterFactory.java |   8 +-
 .../common}/KStemTokenFilterFactory.java      |   5 +-
 .../common}/ReverseTokenFilterFactory.java    |   5 +-
 .../StemmerOverrideTokenFilterFactory.java    |   6 +-
 .../common}/StemmerTokenFilterFactory.java    |   5 +-
 .../common}/TruncateTokenFilterFactory.java   |   5 +-
 .../common/CommonAnalysisFactoryTests.java    |  34 ++++
 .../common}/CompoundAnalysisTests.java        |  37 ++--
 .../StemmerTokenFilterFactoryTests.java       |  13 +-
 .../test/analysis-common/40_token_filters.yml | 176 ++++++++++++++++++
 .../test/search.suggest/20_phrase.yml         |  64 +++++++
 .../analysis}/MyFilterTokenFilterFactory.java |   2 +-
 .../analysis/AnalysisFactoryTestCase.java     |  72 ++++---
 .../elasticsearch/analysis/common/test1.json  |  54 ++++++
 .../elasticsearch/analysis/common/test1.yml   |  39 ++++
 26 files changed, 475 insertions(+), 155 deletions(-)
 rename core/src/main/java/org/elasticsearch/{index/analysis/compound => analysis/common}/AbstractCompoundWordTokenFilterFactory.java (93%)
 rename {core/src/main/java/org/elasticsearch/index/analysis/compound => modules/analysis-common/src/main/java/org/elasticsearch/analysis/common}/DictionaryCompoundWordTokenFilterFactory.java (90%)
 rename {core/src/main/java/org/elasticsearch/index/analysis => modules/analysis-common/src/main/java/org/elasticsearch/analysis/common}/ElisionTokenFilterFactory.java (82%)
 rename {core/src/main/java/org/elasticsearch/index/analysis/compound => modules/analysis-common/src/main/java/org/elasticsearch/analysis/common}/HyphenationCompoundWordTokenFilterFactory.java (88%)
 rename {core/src/main/java/org/elasticsearch/index/analysis => modules/analysis-common/src/main/java/org/elasticsearch/analysis/common}/KStemTokenFilterFactory.java (84%)
 rename {core/src/main/java/org/elasticsearch/index/analysis => modules/analysis-common/src/main/java/org/elasticsearch/analysis/common}/ReverseTokenFilterFactory.java (85%)
 rename {core/src/main/java/org/elasticsearch/index/analysis => modules/analysis-common/src/main/java/org/elasticsearch/analysis/common}/StemmerOverrideTokenFilterFactory.java (90%)
 rename {core/src/main/java/org/elasticsearch/index/analysis => modules/analysis-common/src/main/java/org/elasticsearch/analysis/common}/StemmerTokenFilterFactory.java (98%)
 rename {core/src/main/java/org/elasticsearch/index/analysis => modules/analysis-common/src/main/java/org/elasticsearch/analysis/common}/TruncateTokenFilterFactory.java (86%)
 rename {core/src/test/java/org/elasticsearch/index/analysis => modules/analysis-common/src/test/java/org/elasticsearch/analysis/common}/CompoundAnalysisTests.java (84%)
 rename
{core/src/test/java/org/elasticsearch/index/analysis => modules/analysis-common/src/test/java/org/elasticsearch/analysis/common}/StemmerTokenFilterFactoryTests.java (90%) rename {core/src/test/java/org/elasticsearch/index/analysis/filter1 => test/framework/src/main/java/org/elasticsearch/index/analysis}/MyFilterTokenFilterFactory.java (96%) create mode 100644 test/framework/src/main/resources/org/elasticsearch/analysis/common/test1.json create mode 100644 test/framework/src/main/resources/org/elasticsearch/analysis/common/test1.yml diff --git a/buildSrc/src/main/resources/checkstyle_suppressions.xml b/buildSrc/src/main/resources/checkstyle_suppressions.xml index 79e4e744445..4c62693a34a 100644 --- a/buildSrc/src/main/resources/checkstyle_suppressions.xml +++ b/buildSrc/src/main/resources/checkstyle_suppressions.xml @@ -267,7 +267,6 @@ - diff --git a/core/src/main/java/org/elasticsearch/index/analysis/compound/AbstractCompoundWordTokenFilterFactory.java b/core/src/main/java/org/elasticsearch/analysis/common/AbstractCompoundWordTokenFilterFactory.java similarity index 93% rename from core/src/main/java/org/elasticsearch/index/analysis/compound/AbstractCompoundWordTokenFilterFactory.java rename to core/src/main/java/org/elasticsearch/analysis/common/AbstractCompoundWordTokenFilterFactory.java index 91c984c7a6b..b59cc166f09 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/compound/AbstractCompoundWordTokenFilterFactory.java +++ b/core/src/main/java/org/elasticsearch/analysis/common/AbstractCompoundWordTokenFilterFactory.java @@ -17,7 +17,7 @@ * under the License. */ -package org.elasticsearch.index.analysis.compound; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.compound.CompoundWordTokenFilterBase; @@ -38,7 +38,7 @@ public abstract class AbstractCompoundWordTokenFilterFactory extends AbstractTok protected final boolean onlyLongestMatch; protected final CharArraySet wordList; - public AbstractCompoundWordTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { + protected AbstractCompoundWordTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { super(indexSettings, name, settings); minWordSize = settings.getAsInt("min_word_size", CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE); diff --git a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java index 9220c063715..2657c9f7981 100644 --- a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java +++ b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java @@ -55,7 +55,6 @@ import org.elasticsearch.index.analysis.DelimitedPayloadTokenFilterFactory; import org.elasticsearch.index.analysis.DutchAnalyzerProvider; import org.elasticsearch.index.analysis.DutchStemTokenFilterFactory; import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory; -import org.elasticsearch.index.analysis.ElisionTokenFilterFactory; import org.elasticsearch.index.analysis.EnglishAnalyzerProvider; import org.elasticsearch.index.analysis.FingerprintAnalyzerProvider; import org.elasticsearch.index.analysis.FingerprintTokenFilterFactory; @@ -75,7 +74,6 @@ import org.elasticsearch.index.analysis.IndicNormalizationFilterFactory; import org.elasticsearch.index.analysis.IndonesianAnalyzerProvider; import org.elasticsearch.index.analysis.IrishAnalyzerProvider; import 
org.elasticsearch.index.analysis.ItalianAnalyzerProvider; -import org.elasticsearch.index.analysis.KStemTokenFilterFactory; import org.elasticsearch.index.analysis.KeepTypesFilterFactory; import org.elasticsearch.index.analysis.KeepWordFilterFactory; import org.elasticsearch.index.analysis.KeywordAnalyzerProvider; @@ -99,7 +97,6 @@ import org.elasticsearch.index.analysis.PortugueseAnalyzerProvider; import org.elasticsearch.index.analysis.PreConfiguredCharFilter; import org.elasticsearch.index.analysis.PreConfiguredTokenFilter; import org.elasticsearch.index.analysis.PreConfiguredTokenizer; -import org.elasticsearch.index.analysis.ReverseTokenFilterFactory; import org.elasticsearch.index.analysis.RomanianAnalyzerProvider; import org.elasticsearch.index.analysis.RussianAnalyzerProvider; import org.elasticsearch.index.analysis.RussianStemTokenFilterFactory; @@ -116,8 +113,6 @@ import org.elasticsearch.index.analysis.StandardAnalyzerProvider; import org.elasticsearch.index.analysis.StandardHtmlStripAnalyzerProvider; import org.elasticsearch.index.analysis.StandardTokenFilterFactory; import org.elasticsearch.index.analysis.StandardTokenizerFactory; -import org.elasticsearch.index.analysis.StemmerOverrideTokenFilterFactory; -import org.elasticsearch.index.analysis.StemmerTokenFilterFactory; import org.elasticsearch.index.analysis.StopAnalyzerProvider; import org.elasticsearch.index.analysis.StopTokenFilterFactory; import org.elasticsearch.index.analysis.SwedishAnalyzerProvider; @@ -125,13 +120,10 @@ import org.elasticsearch.index.analysis.ThaiAnalyzerProvider; import org.elasticsearch.index.analysis.ThaiTokenizerFactory; import org.elasticsearch.index.analysis.TokenFilterFactory; import org.elasticsearch.index.analysis.TokenizerFactory; -import org.elasticsearch.index.analysis.TruncateTokenFilterFactory; import org.elasticsearch.index.analysis.TurkishAnalyzerProvider; import org.elasticsearch.index.analysis.UAX29URLEmailTokenizerFactory; import org.elasticsearch.index.analysis.WhitespaceAnalyzerProvider; import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory; -import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory; -import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory; import org.elasticsearch.plugins.AnalysisPlugin; import java.io.IOException; @@ -201,23 +193,16 @@ public final class AnalysisModule { hunspellService) { NamedRegistry> tokenFilters = new NamedRegistry<>("token_filter"); tokenFilters.register("stop", StopTokenFilterFactory::new); - tokenFilters.register("reverse", ReverseTokenFilterFactory::new); - tokenFilters.register("kstem", KStemTokenFilterFactory::new); tokenFilters.register("standard", StandardTokenFilterFactory::new); tokenFilters.register("shingle", ShingleTokenFilterFactory::new); tokenFilters.register("min_hash", MinHashTokenFilterFactory::new); - tokenFilters.register("truncate", requriesAnalysisSettings(TruncateTokenFilterFactory::new)); tokenFilters.register("limit", LimitTokenCountFilterFactory::new); tokenFilters.register("common_grams", requriesAnalysisSettings(CommonGramsTokenFilterFactory::new)); - tokenFilters.register("stemmer", StemmerTokenFilterFactory::new); tokenFilters.register("delimited_payload_filter", DelimitedPayloadTokenFilterFactory::new); - tokenFilters.register("elision", ElisionTokenFilterFactory::new); tokenFilters.register("keep", requriesAnalysisSettings(KeepWordFilterFactory::new)); tokenFilters.register("keep_types", 
requriesAnalysisSettings(KeepTypesFilterFactory::new)); tokenFilters.register("pattern_capture", requriesAnalysisSettings(PatternCaptureGroupTokenFilterFactory::new)); tokenFilters.register("pattern_replace", requriesAnalysisSettings(PatternReplaceTokenFilterFactory::new)); - tokenFilters.register("dictionary_decompounder", requriesAnalysisSettings(DictionaryCompoundWordTokenFilterFactory::new)); - tokenFilters.register("hyphenation_decompounder", requriesAnalysisSettings(HyphenationCompoundWordTokenFilterFactory::new)); tokenFilters.register("arabic_stem", ArabicStemTokenFilterFactory::new); tokenFilters.register("brazilian_stem", BrazilianStemTokenFilterFactory::new); tokenFilters.register("czech_stem", CzechStemTokenFilterFactory::new); @@ -225,7 +210,6 @@ public final class AnalysisModule { tokenFilters.register("french_stem", FrenchStemTokenFilterFactory::new); tokenFilters.register("german_stem", GermanStemTokenFilterFactory::new); tokenFilters.register("russian_stem", RussianStemTokenFilterFactory::new); - tokenFilters.register("stemmer_override", requriesAnalysisSettings(StemmerOverrideTokenFilterFactory::new)); tokenFilters.register("arabic_normalization", ArabicNormalizationFilterFactory::new); tokenFilters.register("german_normalization", GermanNormalizationFilterFactory::new); tokenFilters.register("hindi_normalization", HindiNormalizationFilterFactory::new); diff --git a/core/src/test/java/org/elasticsearch/indices/analysis/AnalysisModuleTests.java b/core/src/test/java/org/elasticsearch/indices/analysis/AnalysisModuleTests.java index b3394d4f4fa..a740f96cdd8 100644 --- a/core/src/test/java/org/elasticsearch/indices/analysis/AnalysisModuleTests.java +++ b/core/src/test/java/org/elasticsearch/indices/analysis/AnalysisModuleTests.java @@ -47,7 +47,7 @@ import org.elasticsearch.index.analysis.PreConfiguredTokenizer; import org.elasticsearch.index.analysis.StandardTokenizerFactory; import org.elasticsearch.index.analysis.StopTokenFilterFactory; import org.elasticsearch.index.analysis.TokenFilterFactory; -import org.elasticsearch.index.analysis.filter1.MyFilterTokenFilterFactory; +import org.elasticsearch.index.analysis.MyFilterTokenFilterFactory; import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider; import org.elasticsearch.plugins.AnalysisPlugin; import org.elasticsearch.test.ESTestCase; @@ -196,18 +196,6 @@ public class AnalysisModuleTests extends ESTestCase { // assertThat(czechstemmeranalyzer.tokenizerFactory(), instanceOf(StandardTokenizerFactory.class)); // assertThat(czechstemmeranalyzer.tokenFilters().length, equalTo(4)); // assertThat(czechstemmeranalyzer.tokenFilters()[3], instanceOf(CzechStemTokenFilterFactory.class)); -// -// // check dictionary decompounder -// analyzer = analysisService.analyzer("decompoundingAnalyzer").analyzer(); -// assertThat(analyzer, instanceOf(CustomAnalyzer.class)); -// CustomAnalyzer dictionaryDecompounderAnalyze = (CustomAnalyzer) analyzer; -// assertThat(dictionaryDecompounderAnalyze.tokenizerFactory(), instanceOf(StandardTokenizerFactory.class)); -// assertThat(dictionaryDecompounderAnalyze.tokenFilters().length, equalTo(1)); -// assertThat(dictionaryDecompounderAnalyze.tokenFilters()[0], instanceOf(DictionaryCompoundWordTokenFilterFactory.class)); - - Set wordList = Analysis.getWordSet(null, Version.CURRENT, settings, "index.analysis.filter.dict_dec.word_list"); - MatcherAssert.assertThat(wordList.size(), equalTo(6)); -// MatcherAssert.assertThat(wordList, hasItems("donau", "dampf", "schiff", "spargel", "creme", 
"suppe")); } public void testWordListPath() throws Exception { diff --git a/core/src/test/java/org/elasticsearch/indices/analyze/AnalyzeActionIT.java b/core/src/test/java/org/elasticsearch/indices/analyze/AnalyzeActionIT.java index dd556c56e30..6e0c61c1544 100644 --- a/core/src/test/java/org/elasticsearch/indices/analyze/AnalyzeActionIT.java +++ b/core/src/test/java/org/elasticsearch/indices/analyze/AnalyzeActionIT.java @@ -93,16 +93,16 @@ public class AnalyzeActionIT extends ESIntegTestCase { assertThat(analyzeResponse.getTokens().size(), equalTo(1)); assertThat(analyzeResponse.getTokens().get(0).getTerm(), equalTo("this is a test")); - analyzeResponse = client().admin().indices().prepareAnalyze("THIS IS A TEST").setTokenizer("standard").addTokenFilter("lowercase").addTokenFilter("reverse").get(); + analyzeResponse = client().admin().indices().prepareAnalyze("THIS IS A TEST").setTokenizer("standard").addTokenFilter("lowercase").get(); assertThat(analyzeResponse.getTokens().size(), equalTo(4)); AnalyzeResponse.AnalyzeToken token = analyzeResponse.getTokens().get(0); - assertThat(token.getTerm(), equalTo("siht")); + assertThat(token.getTerm(), equalTo("this")); token = analyzeResponse.getTokens().get(1); - assertThat(token.getTerm(), equalTo("si")); + assertThat(token.getTerm(), equalTo("is")); token = analyzeResponse.getTokens().get(2); assertThat(token.getTerm(), equalTo("a")); token = analyzeResponse.getTokens().get(3); - assertThat(token.getTerm(), equalTo("tset")); + assertThat(token.getTerm(), equalTo("test")); analyzeResponse = client().admin().indices().prepareAnalyze("of course").setTokenizer("standard").addTokenFilter("stop").get(); assertThat(analyzeResponse.getTokens().size(), equalTo(1)); diff --git a/core/src/test/java/org/elasticsearch/search/suggest/SuggestSearchIT.java b/core/src/test/java/org/elasticsearch/search/suggest/SuggestSearchIT.java index 035fd847ad2..5142c25229d 100644 --- a/core/src/test/java/org/elasticsearch/search/suggest/SuggestSearchIT.java +++ b/core/src/test/java/org/elasticsearch/search/suggest/SuggestSearchIT.java @@ -445,8 +445,6 @@ public class SuggestSearchIT extends ESIntegTestCase { public void testPrefixLength() throws IOException { CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder() .put(SETTING_NUMBER_OF_SHARDS, 1) - .put("index.analysis.analyzer.reverse.tokenizer", "standard") - .putArray("index.analysis.analyzer.reverse.filter", "lowercase", "reverse") .put("index.analysis.analyzer.body.tokenizer", "standard") .putArray("index.analysis.analyzer.body.filter", "lowercase") .put("index.analysis.analyzer.bigram.tokenizer", "standard") @@ -458,7 +456,6 @@ public class SuggestSearchIT extends ESIntegTestCase { XContentBuilder mapping = XContentFactory.jsonBuilder().startObject().startObject("type1") .startObject("properties") .startObject("body").field("type", "text").field("analyzer", "body").endObject() - .startObject("body_reverse").field("type", "text").field("analyzer", "reverse").endObject() .startObject("bigram").field("type", "text").field("analyzer", "bigram").endObject() .endObject() .endObject().endObject(); @@ -486,8 +483,6 @@ public class SuggestSearchIT extends ESIntegTestCase { public void testBasicPhraseSuggest() throws IOException, URISyntaxException { CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder() .put(indexSettings()) - .put("index.analysis.analyzer.reverse.tokenizer", "standard") - .putArray("index.analysis.analyzer.reverse.filter", "lowercase", 
"reverse") .put("index.analysis.analyzer.body.tokenizer", "standard") .putArray("index.analysis.analyzer.body.filter", "lowercase") .put("index.analysis.analyzer.bigram.tokenizer", "standard") @@ -503,10 +498,6 @@ public class SuggestSearchIT extends ESIntegTestCase { field("type", "text"). field("analyzer", "body") .endObject() - .startObject("body_reverse"). - field("type", "text"). - field("analyzer", "reverse") - .endObject() .startObject("bigram"). field("type", "text"). field("analyzer", "bigram") @@ -536,7 +527,7 @@ public class SuggestSearchIT extends ESIntegTestCase { "Police sergeant who stops the film", }; for (String line : strings) { - index("test", "type1", line, "body", line, "body_reverse", line, "bigram", line); + index("test", "type1", line, "body", line, "bigram", line); } refresh(); @@ -576,14 +567,6 @@ public class SuggestSearchIT extends ESIntegTestCase { searchSuggest = searchSuggest( "Arthur, King of the Britons", "simple_phrase", phraseSuggest); assertSuggestion(searchSuggest, 0, "simple_phrase", "arthur king of the britons"); - //test reverse suggestions with pre & post filter - phraseSuggest - .addCandidateGenerator(candidateGenerator("body").minWordLength(1).suggestMode("always")) - .addCandidateGenerator(candidateGenerator("body_reverse").minWordLength(1).suggestMode("always").preFilter("reverse") - .postFilter("reverse")); - searchSuggest = searchSuggest( "Artur, Ging of the Britons", "simple_phrase", phraseSuggest); - assertSuggestion(searchSuggest, 0, "simple_phrase", "arthur king of the britons"); - // set all mass to trigrams (not indexed) phraseSuggest.clearCandidateGenerators() .addCandidateGenerator(candidateGenerator("body").minWordLength(1).suggestMode("always")) @@ -633,8 +616,6 @@ public class SuggestSearchIT extends ESIntegTestCase { public void testSizeParam() throws IOException { CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder() .put(SETTING_NUMBER_OF_SHARDS, 1) - .put("index.analysis.analyzer.reverse.tokenizer", "standard") - .putArray("index.analysis.analyzer.reverse.filter", "lowercase", "reverse") .put("index.analysis.analyzer.body.tokenizer", "standard") .putArray("index.analysis.analyzer.body.filter", "lowercase") .put("index.analysis.analyzer.bigram.tokenizer", "standard") @@ -652,10 +633,6 @@ public class SuggestSearchIT extends ESIntegTestCase { .field("type", "text") .field("analyzer", "body") .endObject() - .startObject("body_reverse") - .field("type", "text") - .field("analyzer", "reverse") - .endObject() .startObject("bigram") .field("type", "text") .field("analyzer", "bigram") @@ -667,9 +644,9 @@ public class SuggestSearchIT extends ESIntegTestCase { ensureGreen(); String line = "xorr the god jewel"; - index("test", "type1", "1", "body", line, "body_reverse", line, "bigram", line); + index("test", "type1", "1", "body", line, "bigram", line); line = "I got it this time"; - index("test", "type1", "2", "body", line, "body_reverse", line, "bigram", line); + index("test", "type1", "2", "body", line, "bigram", line); refresh(); PhraseSuggestionBuilder phraseSuggestion = phraseSuggestion("bigram") diff --git a/core/src/test/resources/org/elasticsearch/index/analysis/test1.json b/core/src/test/resources/org/elasticsearch/index/analysis/test1.json index 38937a9b5af..f2b60017721 100644 --- a/core/src/test/resources/org/elasticsearch/index/analysis/test1.json +++ b/core/src/test/resources/org/elasticsearch/index/analysis/test1.json @@ -17,10 +17,6 @@ }, "my":{ "type":"myfilter" - }, - "dict_dec":{ - 
"type":"dictionary_decompounder", - "word_list":["donau", "dampf", "schiff", "spargel", "creme", "suppe"] } }, "analyzer":{ @@ -43,10 +39,6 @@ "czechAnalyzerWithStemmer":{ "tokenizer":"standard", "filter":["standard", "lowercase", "stop", "czech_stem"] - }, - "decompoundingAnalyzer":{ - "tokenizer":"standard", - "filter":["dict_dec"] } } } diff --git a/core/src/test/resources/org/elasticsearch/index/analysis/test1.yml b/core/src/test/resources/org/elasticsearch/index/analysis/test1.yml index f7a57d14dbe..e9965467251 100644 --- a/core/src/test/resources/org/elasticsearch/index/analysis/test1.yml +++ b/core/src/test/resources/org/elasticsearch/index/analysis/test1.yml @@ -12,9 +12,6 @@ index : stopwords : [stop2-1, stop2-2] my : type : myfilter - dict_dec : - type : dictionary_decompounder - word_list : [donau, dampf, schiff, spargel, creme, suppe] analyzer : standard : type : standard @@ -34,6 +31,3 @@ index : czechAnalyzerWithStemmer : tokenizer : standard filter : [standard, lowercase, stop, czech_stem] - decompoundingAnalyzer : - tokenizer : standard - filter : [dict_dec] diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java index 0299e37affc..18e34d381a1 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java @@ -107,6 +107,14 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin { filters.put("ngram", NGramTokenFilterFactory::new); filters.put("edgeNGram", EdgeNGramTokenFilterFactory::new); filters.put("edge_ngram", EdgeNGramTokenFilterFactory::new); + filters.put("stemmer", StemmerTokenFilterFactory::new); + filters.put("stemmer_override", requriesAnalysisSettings(StemmerOverrideTokenFilterFactory::new)); + filters.put("kstem", KStemTokenFilterFactory::new); + filters.put("dictionary_decompounder", requriesAnalysisSettings(DictionaryCompoundWordTokenFilterFactory::new)); + filters.put("hyphenation_decompounder", requriesAnalysisSettings(HyphenationCompoundWordTokenFilterFactory::new)); + filters.put("reverse", ReverseTokenFilterFactory::new); + filters.put("elision", ElisionTokenFilterFactory::new); + filters.put("truncate", requriesAnalysisSettings(TruncateTokenFilterFactory::new)); return filters; } diff --git a/core/src/main/java/org/elasticsearch/index/analysis/compound/DictionaryCompoundWordTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/DictionaryCompoundWordTokenFilterFactory.java similarity index 90% rename from core/src/main/java/org/elasticsearch/index/analysis/compound/DictionaryCompoundWordTokenFilterFactory.java rename to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/DictionaryCompoundWordTokenFilterFactory.java index fc9719d36b1..e9e690e0b01 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/compound/DictionaryCompoundWordTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/DictionaryCompoundWordTokenFilterFactory.java @@ -17,7 +17,7 @@ * under the License. 
*/ -package org.elasticsearch.index.analysis.compound; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter; @@ -33,7 +33,7 @@ import org.elasticsearch.index.IndexSettings; */ public class DictionaryCompoundWordTokenFilterFactory extends AbstractCompoundWordTokenFilterFactory { - public DictionaryCompoundWordTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { + DictionaryCompoundWordTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { super(indexSettings, env, name, settings); } diff --git a/core/src/main/java/org/elasticsearch/index/analysis/ElisionTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ElisionTokenFilterFactory.java similarity index 82% rename from core/src/main/java/org/elasticsearch/index/analysis/ElisionTokenFilterFactory.java rename to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ElisionTokenFilterFactory.java index 401f2caf03f..94fc52165dd 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/ElisionTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ElisionTokenFilterFactory.java @@ -17,7 +17,7 @@ * under the License. */ -package org.elasticsearch.index.analysis; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.TokenStream; @@ -25,12 +25,15 @@ import org.apache.lucene.analysis.util.ElisionFilter; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; +import org.elasticsearch.index.analysis.Analysis; +import org.elasticsearch.index.analysis.MultiTermAwareComponent; public class ElisionTokenFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent { private final CharArraySet articles; - public ElisionTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { + ElisionTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { super(indexSettings, name, settings); this.articles = Analysis.parseArticles(env, indexSettings.getIndexVersionCreated(), settings); } diff --git a/core/src/main/java/org/elasticsearch/index/analysis/compound/HyphenationCompoundWordTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/HyphenationCompoundWordTokenFilterFactory.java similarity index 88% rename from core/src/main/java/org/elasticsearch/index/analysis/compound/HyphenationCompoundWordTokenFilterFactory.java rename to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/HyphenationCompoundWordTokenFilterFactory.java index 152d4395ef3..b24eb2c4fbc 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/compound/HyphenationCompoundWordTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/HyphenationCompoundWordTokenFilterFactory.java @@ -17,7 +17,7 @@ * under the License. 
*/ -package org.elasticsearch.index.analysis.compound; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter; @@ -27,6 +27,7 @@ import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; import org.xml.sax.InputSource; +import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Path; @@ -39,7 +40,7 @@ public class HyphenationCompoundWordTokenFilterFactory extends AbstractCompoundW private final HyphenationTree hyphenationTree; - public HyphenationCompoundWordTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { + HyphenationCompoundWordTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { super(indexSettings, env, name, settings); String hyphenationPatternsPath = settings.get("hyphenation_patterns_path", null); @@ -50,7 +51,8 @@ public class HyphenationCompoundWordTokenFilterFactory extends AbstractCompoundW Path hyphenationPatternsFile = env.configFile().resolve(hyphenationPatternsPath); try { - hyphenationTree = HyphenationCompoundWordTokenFilter.getHyphenationTree(new InputSource(Files.newInputStream(hyphenationPatternsFile))); + InputStream in = Files.newInputStream(hyphenationPatternsFile); + hyphenationTree = HyphenationCompoundWordTokenFilter.getHyphenationTree(new InputSource(in)); } catch (Exception e) { throw new IllegalArgumentException("Exception while reading hyphenation_patterns_path.", e); } diff --git a/core/src/main/java/org/elasticsearch/index/analysis/KStemTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/KStemTokenFilterFactory.java similarity index 84% rename from core/src/main/java/org/elasticsearch/index/analysis/KStemTokenFilterFactory.java rename to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/KStemTokenFilterFactory.java index 24f92ece101..2100e02fb61 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/KStemTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/KStemTokenFilterFactory.java @@ -17,17 +17,18 @@ * under the License. 
*/ -package org.elasticsearch.index.analysis; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.en.KStemFilter; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; public class KStemTokenFilterFactory extends AbstractTokenFilterFactory { - public KStemTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { + KStemTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { super(indexSettings, name, settings); } diff --git a/core/src/main/java/org/elasticsearch/index/analysis/ReverseTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ReverseTokenFilterFactory.java similarity index 85% rename from core/src/main/java/org/elasticsearch/index/analysis/ReverseTokenFilterFactory.java rename to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ReverseTokenFilterFactory.java index 1719841098d..125e1e496b9 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/ReverseTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ReverseTokenFilterFactory.java @@ -17,17 +17,18 @@ * under the License. */ -package org.elasticsearch.index.analysis; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.reverse.ReverseStringFilter; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; public class ReverseTokenFilterFactory extends AbstractTokenFilterFactory { - public ReverseTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { + ReverseTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { super(indexSettings, name, settings); } diff --git a/core/src/main/java/org/elasticsearch/index/analysis/StemmerOverrideTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/StemmerOverrideTokenFilterFactory.java similarity index 90% rename from core/src/main/java/org/elasticsearch/index/analysis/StemmerOverrideTokenFilterFactory.java rename to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/StemmerOverrideTokenFilterFactory.java index 66643cc2396..f95b4ed76e7 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/StemmerOverrideTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/StemmerOverrideTokenFilterFactory.java @@ -17,7 +17,7 @@ * under the License. 
*/ -package org.elasticsearch.index.analysis; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter; @@ -26,6 +26,8 @@ import org.elasticsearch.common.Strings; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; +import org.elasticsearch.index.analysis.Analysis; import java.io.IOException; import java.util.List; @@ -34,7 +36,7 @@ public class StemmerOverrideTokenFilterFactory extends AbstractTokenFilterFactor private final StemmerOverrideMap overrideMap; - public StemmerOverrideTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) throws IOException { + StemmerOverrideTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) throws IOException { super(indexSettings, name, settings); List rules = Analysis.getWordList(env, settings, "rules"); diff --git a/core/src/main/java/org/elasticsearch/index/analysis/StemmerTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactory.java similarity index 98% rename from core/src/main/java/org/elasticsearch/index/analysis/StemmerTokenFilterFactory.java rename to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactory.java index bf83876259b..c94a449afd2 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/StemmerTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactory.java @@ -17,7 +17,7 @@ * under the License. 
*/ -package org.elasticsearch.index.analysis; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.ar.ArabicStemFilter; @@ -57,6 +57,7 @@ import org.elasticsearch.common.Strings; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; import org.tartarus.snowball.ext.ArmenianStemmer; import org.tartarus.snowball.ext.BasqueStemmer; import org.tartarus.snowball.ext.CatalanStemmer; @@ -86,7 +87,7 @@ public class StemmerTokenFilterFactory extends AbstractTokenFilterFactory { private String language; - public StemmerTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { + StemmerTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { super(indexSettings, name, settings); this.language = Strings.capitalize(settings.get("language", settings.get("name", "porter"))); } diff --git a/core/src/main/java/org/elasticsearch/index/analysis/TruncateTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/TruncateTokenFilterFactory.java similarity index 86% rename from core/src/main/java/org/elasticsearch/index/analysis/TruncateTokenFilterFactory.java rename to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/TruncateTokenFilterFactory.java index 49ea7d6940d..82311964664 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/TruncateTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/TruncateTokenFilterFactory.java @@ -17,19 +17,20 @@ * under the License. 
*/ -package org.elasticsearch.index.analysis; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; public class TruncateTokenFilterFactory extends AbstractTokenFilterFactory { private final int length; - public TruncateTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { + TruncateTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { super(indexSettings, name, settings); this.length = settings.getAsInt("length", -1); if (length <= 0) { diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java index a7dd2614452..37bf407df03 100644 --- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java @@ -26,6 +26,7 @@ import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory; import org.apache.lucene.analysis.reverse.ReverseStringFilterFactory; import org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory; import org.elasticsearch.index.analysis.HtmlStripCharFilterFactory; +import org.elasticsearch.index.analysis.SynonymTokenFilterFactory; import org.elasticsearch.indices.analysis.AnalysisFactoryTestCase; import java.util.List; @@ -67,6 +68,39 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase { filters.put("uppercase", UpperCaseTokenFilterFactory.class); filters.put("ngram", NGramTokenFilterFactory.class); filters.put("edgengram", EdgeNGramTokenFilterFactory.class); + filters.put("bulgarianstem", StemmerTokenFilterFactory.class); + filters.put("englishminimalstem", StemmerTokenFilterFactory.class); + filters.put("englishpossessive", StemmerTokenFilterFactory.class); + filters.put("finnishlightstem", StemmerTokenFilterFactory.class); + filters.put("frenchlightstem", StemmerTokenFilterFactory.class); + filters.put("frenchminimalstem", StemmerTokenFilterFactory.class); + filters.put("galicianminimalstem", StemmerTokenFilterFactory.class); + filters.put("galicianstem", StemmerTokenFilterFactory.class); + filters.put("germanlightstem", StemmerTokenFilterFactory.class); + filters.put("germanminimalstem", StemmerTokenFilterFactory.class); + filters.put("greekstem", StemmerTokenFilterFactory.class); + filters.put("hindistem", StemmerTokenFilterFactory.class); + filters.put("hungarianlightstem", StemmerTokenFilterFactory.class); + filters.put("indonesianstem", StemmerTokenFilterFactory.class); + filters.put("italianlightstem", StemmerTokenFilterFactory.class); + filters.put("latvianstem", StemmerTokenFilterFactory.class); + filters.put("norwegianlightstem", StemmerTokenFilterFactory.class); + filters.put("norwegianminimalstem", StemmerTokenFilterFactory.class); + filters.put("portuguesestem", StemmerTokenFilterFactory.class); + filters.put("portugueselightstem", StemmerTokenFilterFactory.class); + filters.put("portugueseminimalstem", StemmerTokenFilterFactory.class); + filters.put("russianlightstem", 
StemmerTokenFilterFactory.class); + filters.put("soranistem", StemmerTokenFilterFactory.class); + filters.put("spanishlightstem", StemmerTokenFilterFactory.class); + filters.put("swedishlightstem", StemmerTokenFilterFactory.class); + filters.put("stemmeroverride", StemmerOverrideTokenFilterFactory.class); + filters.put("kstem", KStemTokenFilterFactory.class); + filters.put("synonym", SynonymTokenFilterFactory.class); + filters.put("dictionarycompoundword", DictionaryCompoundWordTokenFilterFactory.class); + filters.put("hyphenationcompoundword", HyphenationCompoundWordTokenFilterFactory.class); + filters.put("reversestring", ReverseTokenFilterFactory.class); + filters.put("elision", ElisionTokenFilterFactory.class); + filters.put("truncate", TruncateTokenFilterFactory.class); return filters; } diff --git a/core/src/test/java/org/elasticsearch/index/analysis/CompoundAnalysisTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CompoundAnalysisTests.java similarity index 84% rename from core/src/test/java/org/elasticsearch/index/analysis/CompoundAnalysisTests.java rename to modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CompoundAnalysisTests.java index e8734331167..13b512f86e0 100644 --- a/core/src/test/java/org/elasticsearch/index/analysis/CompoundAnalysisTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CompoundAnalysisTests.java @@ -17,7 +17,7 @@ * under the License. */ -package org.elasticsearch.index.analysis; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; @@ -29,8 +29,9 @@ import org.elasticsearch.common.lucene.all.AllTokenStream; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; -import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory; -import org.elasticsearch.index.analysis.filter1.MyFilterTokenFilterFactory; +import org.elasticsearch.index.analysis.IndexAnalyzers; +import org.elasticsearch.index.analysis.MyFilterTokenFilterFactory; +import org.elasticsearch.index.analysis.TokenFilterFactory; import org.elasticsearch.indices.analysis.AnalysisModule; import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider; import org.elasticsearch.plugins.AnalysisPlugin; @@ -40,10 +41,10 @@ import org.hamcrest.MatcherAssert; import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.Map; -import static java.util.Collections.singletonList; import static java.util.Collections.singletonMap; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.hasItems; @@ -53,12 +54,7 @@ public class CompoundAnalysisTests extends ESTestCase { public void testDefaultsCompoundAnalysis() throws Exception { Settings settings = getJsonSettings(); IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings); - AnalysisModule analysisModule = new AnalysisModule(new Environment(settings), singletonList(new AnalysisPlugin() { - @Override - public Map> getTokenFilters() { - return singletonMap("myfilter", MyFilterTokenFilterFactory::new); - } - })); + AnalysisModule analysisModule = createAnalysisModule(settings); TokenFilterFactory filterFactory = analysisModule.getAnalysisRegistry().buildTokenFilterFactories(idxSettings).get("dict_dec"); MatcherAssert.assertThat(filterFactory, 
instanceOf(DictionaryCompoundWordTokenFilterFactory.class)); } @@ -75,12 +71,7 @@ public class CompoundAnalysisTests extends ESTestCase { private List analyze(Settings settings, String analyzerName, String text) throws IOException { IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings); - AnalysisModule analysisModule = new AnalysisModule(new Environment(settings), singletonList(new AnalysisPlugin() { - @Override - public Map> getTokenFilters() { - return singletonMap("myfilter", MyFilterTokenFilterFactory::new); - } - })); + AnalysisModule analysisModule = createAnalysisModule(settings); IndexAnalyzers indexAnalyzers = analysisModule.getAnalysisRegistry().build(idxSettings); Analyzer analyzer = indexAnalyzers.get(analyzerName).analyzer(); @@ -99,8 +90,18 @@ public class CompoundAnalysisTests extends ESTestCase { return terms; } + private AnalysisModule createAnalysisModule(Settings settings) throws IOException { + CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin(); + return new AnalysisModule(new Environment(settings), Arrays.asList(commonAnalysisPlugin, new AnalysisPlugin() { + @Override + public Map> getTokenFilters() { + return singletonMap("myfilter", MyFilterTokenFilterFactory::new); + } + })); + } + private Settings getJsonSettings() throws IOException { - String json = "/org/elasticsearch/index/analysis/test1.json"; + String json = "/org/elasticsearch/analysis/common/test1.json"; return Settings.builder() .loadFromStream(json, getClass().getResourceAsStream(json)) .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) @@ -109,7 +110,7 @@ public class CompoundAnalysisTests extends ESTestCase { } private Settings getYamlSettings() throws IOException { - String yaml = "/org/elasticsearch/index/analysis/test1.yml"; + String yaml = "/org/elasticsearch/analysis/common/test1.yml"; return Settings.builder() .loadFromStream(yaml, getClass().getResourceAsStream(yaml)) .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) diff --git a/core/src/test/java/org/elasticsearch/index/analysis/StemmerTokenFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java similarity index 90% rename from core/src/test/java/org/elasticsearch/index/analysis/StemmerTokenFilterFactoryTests.java rename to modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java index c4632e57490..10f7653c52c 100644 --- a/core/src/test/java/org/elasticsearch/index/analysis/StemmerTokenFilterFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. 
*/ -package org.elasticsearch.index.analysis; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; @@ -26,6 +26,10 @@ import org.apache.lucene.analysis.snowball.SnowballFilter; import org.elasticsearch.Version; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; +import org.elasticsearch.index.analysis.AnalysisTestsHelper; +import org.elasticsearch.index.analysis.IndexAnalyzers; +import org.elasticsearch.index.analysis.NamedAnalyzer; +import org.elasticsearch.index.analysis.TokenFilterFactory; import org.elasticsearch.test.ESTestCase; import org.elasticsearch.test.ESTokenStreamTestCase; import org.elasticsearch.test.VersionUtils; @@ -38,6 +42,9 @@ import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_VERSION_C import static org.hamcrest.Matchers.instanceOf; public class StemmerTokenFilterFactoryTests extends ESTokenStreamTestCase { + + private static final CommonAnalysisPlugin PLUGIN = new CommonAnalysisPlugin(); + public void testEnglishFilterFactory() throws IOException { int iters = scaledRandomIntBetween(20, 100); for (int i = 0; i < iters; i++) { @@ -51,7 +58,7 @@ public class StemmerTokenFilterFactoryTests extends ESTokenStreamTestCase { .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) .build(); - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings); + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, PLUGIN); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_english"); assertThat(tokenFilter, instanceOf(StemmerTokenFilterFactory.class)); Tokenizer tokenizer = new WhitespaceTokenizer(); @@ -79,7 +86,7 @@ public class StemmerTokenFilterFactoryTests extends ESTokenStreamTestCase { .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) .build(); - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings); + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, PLUGIN); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_porter2"); assertThat(tokenFilter, instanceOf(StemmerTokenFilterFactory.class)); Tokenizer tokenizer = new WhitespaceTokenizer(); diff --git a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/40_token_filters.yml b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/40_token_filters.yml index 1d3075e28f8..2283634a80a 100644 --- a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/40_token_filters.yml +++ b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/40_token_filters.yml @@ -392,3 +392,179 @@ - match: { tokens.1.token: foob } - match: { tokens.2.token: fooba } - match: { tokens.3.token: foobar } + +--- +"kstem": + - do: + indices.create: + index: test + body: + settings: + analysis: + filter: + my_kstem: + type: kstem + - do: + indices.analyze: + index: test + body: + text: bricks + tokenizer: keyword + filter: [my_kstem] + - length: { tokens: 1 } + - match: { tokens.0.token: brick } + + # use preconfigured token filter: + - do: + indices.analyze: + body: + text: bricks + tokenizer: keyword + filter: [kstem] + - length: { tokens: 1 } + - match: { tokens.0.token: brick } + +--- +"reverse": + - do: + indices.create: + index: test + body: + settings: + analysis: + filter: + my_reverse: + type: 
reverse + - do: + indices.analyze: + index: test + body: + text: foobar + tokenizer: keyword + filter: [my_reverse] + - length: { tokens: 1 } + - match: { tokens.0.token: raboof } + + # use preconfigured token filter: + - do: + indices.analyze: + body: + text: foobar + tokenizer: keyword + filter: [reverse] + - length: { tokens: 1 } + - match: { tokens.0.token: raboof } + +--- +"elision": + - do: + indices.create: + index: test + body: + settings: + analysis: + filter: + my_elision: + type: elision + articles: ["l", "m", "t", "qu", "n", "s", "j"] + - do: + indices.analyze: + index: test + body: + text: "l'avion" + tokenizer: keyword + filter: [my_elision] + - length: { tokens: 1 } + - match: { tokens.0.token: avion } + +--- +"stemmer": + - do: + indices.create: + index: test + body: + settings: + analysis: + filter: + my_stemmer: + type: stemmer + language: dutch + - do: + indices.analyze: + index: test + body: + text: zoeken + tokenizer: keyword + filter: [my_stemmer] + - length: { tokens: 1 } + - match: { tokens.0.token: zoek } +--- +"stemmer_override": + - do: + indices.create: + index: test + body: + settings: + analysis: + filter: + my_stemmer: + type: stemmer + language: dutch + my_stemmer_override: + type: stemmer_override + rules: ["zoeken => override"] + - do: + indices.analyze: + index: test + body: + text: zoeken + tokenizer: keyword + filter: [my_stemmer_override, my_stemmer] + - length: { tokens: 1 } + - match: { tokens.0.token: override } + +--- +"decompounder": + - do: + indices.create: + index: test + body: + settings: + analysis: + filter: + my_decompounder: + type: dictionary_decompounder + word_list: [foo, bar] + - do: + indices.analyze: + index: test + body: + text: foobar + tokenizer: keyword + filter: [my_decompounder] + - length: { tokens: 3 } + - match: { tokens.0.token: foobar } + - match: { tokens.1.token: foo } + - match: { tokens.2.token: bar } + +--- +"truncate": + - do: + indices.create: + index: test + body: + settings: + analysis: + filter: + my_truncate: + type: truncate + length: 3 + - do: + indices.analyze: + index: test + body: + text: foobar + tokenizer: keyword + filter: [my_truncate] + - length: { tokens: 1 } + - match: { tokens.0.token: foo } diff --git a/modules/analysis-common/src/test/resources/rest-api-spec/test/search.suggest/20_phrase.yml b/modules/analysis-common/src/test/resources/rest-api-spec/test/search.suggest/20_phrase.yml index cf5ebcea42e..18c3c814625 100644 --- a/modules/analysis-common/src/test/resources/rest-api-spec/test/search.suggest/20_phrase.yml +++ b/modules/analysis-common/src/test/resources/rest-api-spec/test/search.suggest/20_phrase.yml @@ -19,6 +19,9 @@ setup: ngram: tokenizer: standard filter: [lowercase, ngram] + reverse: + tokenizer: standard + filter: [lowercase, reverse] filter: bigram: type: shingle @@ -43,6 +46,9 @@ setup: ngram: type: text analyzer: ngram + reverse: + type: text + analyzer: reverse - do: bulk: @@ -54,6 +60,40 @@ setup: { "body": "Xorr the God-Jewel" } { "index": {} } { "body": "Xorn" } + { "index": {} } + { "body": "Arthur, King of the Britons" } + { "index": {} } + { "body": "Sir Lancelot the Brave" } + { "index": {} } + { "body": "Patsy, Arthur's Servant" } + { "index": {} } + { "body": "Sir Robin the Not-Quite-So-Brave-as-Sir-Lancelot" } + { "index": {} } + { "body": "Sir Bedevere the Wise" } + { "index": {} } + { "body": "Sir Galahad the Pure" } + { "index": {} } + { "body": "Miss Islington, the Witch" } + { "index": {} } + { "body": "Zoot" } + { "index": {} } + { "body": "Leader of Robin's 
Minstrels" } + { "index": {} } + { "body": "Old Crone" } + { "index": {} } + { "body": "Frank, the Historian" } + { "index": {} } + { "body": "Frank's Wife" } + { "index": {} } + { "body": "Dr. Piglet" } + { "index": {} } + { "body": "Dr. Winston" } + { "index": {} } + { "body": "Sir Robin (Stand-in)" } + { "index": {} } + { "body": "Knight Who Says Ni" } + { "index": {} } + { "body": "Police sergeant who stops the film" } --- "sorts by score": @@ -156,3 +196,27 @@ setup: field: body.bigram analyzer: bigram force_unigrams: false + +--- +"reverse suggestions": + - do: + search: + size: 0 + index: test + body: + suggest: + text: Artur, Ging of the Britons + test: + phrase: + field: body.ngram + force_unigrams: true + max_errors: 0.5 + direct_generator: + - field: body.reverse + min_word_length: 1 + suggest_mode: always + pre_filter: reverse + post_filter: reverse + + - match: {suggest.test.0.options.0.text: arthur king of the britons} + diff --git a/core/src/test/java/org/elasticsearch/index/analysis/filter1/MyFilterTokenFilterFactory.java b/test/framework/src/main/java/org/elasticsearch/index/analysis/MyFilterTokenFilterFactory.java similarity index 96% rename from core/src/test/java/org/elasticsearch/index/analysis/filter1/MyFilterTokenFilterFactory.java rename to test/framework/src/main/java/org/elasticsearch/index/analysis/MyFilterTokenFilterFactory.java index 1c9a4798139..921a09e98e6 100644 --- a/core/src/test/java/org/elasticsearch/index/analysis/filter1/MyFilterTokenFilterFactory.java +++ b/test/framework/src/main/java/org/elasticsearch/index/analysis/MyFilterTokenFilterFactory.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package org.elasticsearch.index.analysis.filter1; +package org.elasticsearch.index.analysis; import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.TokenStream; diff --git a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java index 76d170f7c2c..97035623a6c 100644 --- a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java +++ b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java @@ -36,13 +36,11 @@ import org.elasticsearch.index.analysis.CzechStemTokenFilterFactory; import org.elasticsearch.index.analysis.DecimalDigitFilterFactory; import org.elasticsearch.index.analysis.DelimitedPayloadTokenFilterFactory; import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory; -import org.elasticsearch.index.analysis.ElisionTokenFilterFactory; import org.elasticsearch.index.analysis.GermanNormalizationFilterFactory; import org.elasticsearch.index.analysis.GermanStemTokenFilterFactory; import org.elasticsearch.index.analysis.HindiNormalizationFilterFactory; import org.elasticsearch.index.analysis.HunspellTokenFilterFactory; import org.elasticsearch.index.analysis.IndicNormalizationFilterFactory; -import org.elasticsearch.index.analysis.KStemTokenFilterFactory; import org.elasticsearch.index.analysis.KeepTypesFilterFactory; import org.elasticsearch.index.analysis.KeepWordFilterFactory; import org.elasticsearch.index.analysis.KeywordTokenizerFactory; @@ -60,7 +58,6 @@ import org.elasticsearch.index.analysis.PersianNormalizationFilterFactory; import org.elasticsearch.index.analysis.PreConfiguredCharFilter; import org.elasticsearch.index.analysis.PreConfiguredTokenFilter; import 
diff --git a/core/src/test/java/org/elasticsearch/index/analysis/filter1/MyFilterTokenFilterFactory.java b/test/framework/src/main/java/org/elasticsearch/index/analysis/MyFilterTokenFilterFactory.java
similarity index 96%
rename from core/src/test/java/org/elasticsearch/index/analysis/filter1/MyFilterTokenFilterFactory.java
rename to test/framework/src/main/java/org/elasticsearch/index/analysis/MyFilterTokenFilterFactory.java
index 1c9a4798139..921a09e98e6 100644
--- a/core/src/test/java/org/elasticsearch/index/analysis/filter1/MyFilterTokenFilterFactory.java
+++ b/test/framework/src/main/java/org/elasticsearch/index/analysis/MyFilterTokenFilterFactory.java
@@ -16,7 +16,7 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-package org.elasticsearch.index.analysis.filter1;
+package org.elasticsearch.index.analysis;
 
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.TokenStream;
diff --git a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java
index 76d170f7c2c..97035623a6c 100644
--- a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java
+++ b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java
@@ -36,13 +36,11 @@ import org.elasticsearch.index.analysis.CzechStemTokenFilterFactory;
 import org.elasticsearch.index.analysis.DecimalDigitFilterFactory;
 import org.elasticsearch.index.analysis.DelimitedPayloadTokenFilterFactory;
 import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory;
-import org.elasticsearch.index.analysis.ElisionTokenFilterFactory;
 import org.elasticsearch.index.analysis.GermanNormalizationFilterFactory;
 import org.elasticsearch.index.analysis.GermanStemTokenFilterFactory;
 import org.elasticsearch.index.analysis.HindiNormalizationFilterFactory;
 import org.elasticsearch.index.analysis.HunspellTokenFilterFactory;
 import org.elasticsearch.index.analysis.IndicNormalizationFilterFactory;
-import org.elasticsearch.index.analysis.KStemTokenFilterFactory;
 import org.elasticsearch.index.analysis.KeepTypesFilterFactory;
 import org.elasticsearch.index.analysis.KeepWordFilterFactory;
 import org.elasticsearch.index.analysis.KeywordTokenizerFactory;
@@ -60,7 +58,6 @@ import org.elasticsearch.index.analysis.PersianNormalizationFilterFactory;
 import org.elasticsearch.index.analysis.PreConfiguredCharFilter;
 import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
 import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
-import org.elasticsearch.index.analysis.ReverseTokenFilterFactory;
 import org.elasticsearch.index.analysis.ScandinavianFoldingFilterFactory;
 import org.elasticsearch.index.analysis.ScandinavianNormalizationFilterFactory;
 import org.elasticsearch.index.analysis.SerbianNormalizationFilterFactory;
@@ -68,17 +65,12 @@ import org.elasticsearch.index.analysis.ShingleTokenFilterFactory;
 import org.elasticsearch.index.analysis.SoraniNormalizationFilterFactory;
 import org.elasticsearch.index.analysis.StandardTokenFilterFactory;
 import org.elasticsearch.index.analysis.StandardTokenizerFactory;
-import org.elasticsearch.index.analysis.StemmerOverrideTokenFilterFactory;
-import org.elasticsearch.index.analysis.StemmerTokenFilterFactory;
 import org.elasticsearch.index.analysis.StopTokenFilterFactory;
 import org.elasticsearch.index.analysis.SynonymGraphTokenFilterFactory;
 import org.elasticsearch.index.analysis.SynonymTokenFilterFactory;
 import org.elasticsearch.index.analysis.ThaiTokenizerFactory;
-import org.elasticsearch.index.analysis.TruncateTokenFilterFactory;
 import org.elasticsearch.index.analysis.UAX29URLEmailTokenizerFactory;
 import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory;
-import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory;
-import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory;
 import org.elasticsearch.plugins.AnalysisPlugin;
 import org.elasticsearch.test.ESTestCase;
 
@@ -147,7 +139,7 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
         .put("arabicstem", ArabicStemTokenFilterFactory.class)
         .put("asciifolding", MovedToAnalysisCommon.class)
         .put("brazilianstem", BrazilianStemTokenFilterFactory.class)
-        .put("bulgarianstem", StemmerTokenFilterFactory.class)
+        .put("bulgarianstem", MovedToAnalysisCommon.class)
         .put("cjkbigram", CJKBigramFilterFactory.class)
         .put("cjkwidth", CJKWidthFilterFactory.class)
         .put("classic", ClassicFilterFactory.class)
@@ -156,50 +148,50 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
         .put("czechstem", CzechStemTokenFilterFactory.class)
         .put("decimaldigit", DecimalDigitFilterFactory.class)
         .put("delimitedpayload", DelimitedPayloadTokenFilterFactory.class)
-        .put("dictionarycompoundword", DictionaryCompoundWordTokenFilterFactory.class)
+        .put("dictionarycompoundword", MovedToAnalysisCommon.class)
         .put("edgengram", MovedToAnalysisCommon.class)
-        .put("elision", ElisionTokenFilterFactory.class)
-        .put("englishminimalstem", StemmerTokenFilterFactory.class)
-        .put("englishpossessive", StemmerTokenFilterFactory.class)
-        .put("finnishlightstem", StemmerTokenFilterFactory.class)
-        .put("frenchlightstem", StemmerTokenFilterFactory.class)
-        .put("frenchminimalstem", StemmerTokenFilterFactory.class)
-        .put("galicianminimalstem", StemmerTokenFilterFactory.class)
-        .put("galicianstem", StemmerTokenFilterFactory.class)
+        .put("elision", MovedToAnalysisCommon.class)
+        .put("englishminimalstem", MovedToAnalysisCommon.class)
+        .put("englishpossessive", MovedToAnalysisCommon.class)
+        .put("finnishlightstem", MovedToAnalysisCommon.class)
+        .put("frenchlightstem", MovedToAnalysisCommon.class)
+        .put("frenchminimalstem", MovedToAnalysisCommon.class)
+        .put("galicianminimalstem", MovedToAnalysisCommon.class)
+        .put("galicianstem", MovedToAnalysisCommon.class)
         .put("germanstem", GermanStemTokenFilterFactory.class)
-        .put("germanlightstem", StemmerTokenFilterFactory.class)
-        .put("germanminimalstem", StemmerTokenFilterFactory.class)
+        .put("germanlightstem", MovedToAnalysisCommon.class)
+        .put("germanminimalstem", MovedToAnalysisCommon.class)
         .put("germannormalization", GermanNormalizationFilterFactory.class)
         .put("greeklowercase", MovedToAnalysisCommon.class)
-        .put("greekstem", StemmerTokenFilterFactory.class)
+        .put("greekstem", MovedToAnalysisCommon.class)
         .put("hindinormalization", HindiNormalizationFilterFactory.class)
-        .put("hindistem", StemmerTokenFilterFactory.class)
-        .put("hungarianlightstem", StemmerTokenFilterFactory.class)
+        .put("hindistem", MovedToAnalysisCommon.class)
+        .put("hungarianlightstem", MovedToAnalysisCommon.class)
         .put("hunspellstem", HunspellTokenFilterFactory.class)
-        .put("hyphenationcompoundword", HyphenationCompoundWordTokenFilterFactory.class)
+        .put("hyphenationcompoundword", MovedToAnalysisCommon.class)
         .put("indicnormalization", IndicNormalizationFilterFactory.class)
         .put("irishlowercase", MovedToAnalysisCommon.class)
-        .put("indonesianstem", StemmerTokenFilterFactory.class)
-        .put("italianlightstem", StemmerTokenFilterFactory.class)
+        .put("indonesianstem", MovedToAnalysisCommon.class)
+        .put("italianlightstem", MovedToAnalysisCommon.class)
         .put("keepword", KeepWordFilterFactory.class)
         .put("keywordmarker", MovedToAnalysisCommon.class)
-        .put("kstem", KStemTokenFilterFactory.class)
-        .put("latvianstem", StemmerTokenFilterFactory.class)
+        .put("kstem", MovedToAnalysisCommon.class)
+        .put("latvianstem", MovedToAnalysisCommon.class)
         .put("length", MovedToAnalysisCommon.class)
         .put("limittokencount", LimitTokenCountFilterFactory.class)
         .put("lowercase", MovedToAnalysisCommon.class)
         .put("ngram", MovedToAnalysisCommon.class)
-        .put("norwegianlightstem", StemmerTokenFilterFactory.class)
-        .put("norwegianminimalstem", StemmerTokenFilterFactory.class)
+        .put("norwegianlightstem", MovedToAnalysisCommon.class)
+        .put("norwegianminimalstem", MovedToAnalysisCommon.class)
         .put("patterncapturegroup", PatternCaptureGroupTokenFilterFactory.class)
         .put("patternreplace", PatternReplaceTokenFilterFactory.class)
         .put("persiannormalization", PersianNormalizationFilterFactory.class)
         .put("porterstem", MovedToAnalysisCommon.class)
-        .put("portuguesestem", StemmerTokenFilterFactory.class)
-        .put("portugueselightstem", StemmerTokenFilterFactory.class)
-        .put("portugueseminimalstem", StemmerTokenFilterFactory.class)
-        .put("reversestring", ReverseTokenFilterFactory.class)
-        .put("russianlightstem", StemmerTokenFilterFactory.class)
+        .put("portuguesestem", MovedToAnalysisCommon.class)
+        .put("portugueselightstem", MovedToAnalysisCommon.class)
+        .put("portugueseminimalstem", MovedToAnalysisCommon.class)
+        .put("reversestring", MovedToAnalysisCommon.class)
+        .put("russianlightstem", MovedToAnalysisCommon.class)
         .put("scandinavianfolding", ScandinavianFoldingFilterFactory.class)
         .put("scandinaviannormalization", ScandinavianNormalizationFilterFactory.class)
         .put("serbiannormalization", SerbianNormalizationFilterFactory.class)
@@ -207,16 +199,16 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
         .put("minhash", MinHashTokenFilterFactory.class)
         .put("snowballporter", MovedToAnalysisCommon.class)
         .put("soraninormalization", SoraniNormalizationFilterFactory.class)
-        .put("soranistem", StemmerTokenFilterFactory.class)
-        .put("spanishlightstem", StemmerTokenFilterFactory.class)
+        .put("soranistem", MovedToAnalysisCommon.class)
+        .put("spanishlightstem", MovedToAnalysisCommon.class)
         .put("standard", StandardTokenFilterFactory.class)
-        .put("stemmeroverride", StemmerOverrideTokenFilterFactory.class)
+        .put("stemmeroverride", MovedToAnalysisCommon.class)
         .put("stop", StopTokenFilterFactory.class)
-        .put("swedishlightstem", StemmerTokenFilterFactory.class)
+        .put("swedishlightstem", MovedToAnalysisCommon.class)
         .put("synonym", SynonymTokenFilterFactory.class)
         .put("synonymgraph", SynonymGraphTokenFilterFactory.class)
         .put("trim", MovedToAnalysisCommon.class)
-        .put("truncate", TruncateTokenFilterFactory.class)
+        .put("truncate", MovedToAnalysisCommon.class)
         .put("turkishlowercase", MovedToAnalysisCommon.class)
         .put("type", KeepTypesFilterFactory.class)
         .put("uppercase", MovedToAnalysisCommon.class)
diff --git a/test/framework/src/main/resources/org/elasticsearch/analysis/common/test1.json b/test/framework/src/main/resources/org/elasticsearch/analysis/common/test1.json
new file mode 100644
index 00000000000..38937a9b5af
--- /dev/null
+++ b/test/framework/src/main/resources/org/elasticsearch/analysis/common/test1.json
@@ -0,0 +1,54 @@
+{
+    "index":{
+        "analysis":{
+            "tokenizer":{
+                "standard":{
+                    "type":"standard"
+                }
+            },
+            "filter":{
+                "stop":{
+                    "type":"stop",
+                    "stopwords":["test-stop"]
+                },
+                "stop2":{
+                    "type":"stop",
+                    "stopwords":["stop2-1", "stop2-2"]
+                },
+                "my":{
+                    "type":"myfilter"
+                },
+                "dict_dec":{
+                    "type":"dictionary_decompounder",
+                    "word_list":["donau", "dampf", "schiff", "spargel", "creme", "suppe"]
+                }
+            },
+            "analyzer":{
+                "standard":{
+                    "type":"standard",
+                    "stopwords":["test1", "test2", "test3"]
+                },
+                "custom1":{
+                    "tokenizer":"standard",
+                    "filter":["stop", "stop2"]
+                },
+                "custom4":{
+                    "tokenizer":"standard",
+                    "filter":["my"]
+                },
+                "custom6":{
+                    "tokenizer":"standard",
+                    "position_increment_gap": 256
+                },
+                "czechAnalyzerWithStemmer":{
+                    "tokenizer":"standard",
+                    "filter":["standard", "lowercase", "stop", "czech_stem"]
+                },
+                "decompoundingAnalyzer":{
+                    "tokenizer":"standard",
+                    "filter":["dict_dec"]
+                }
+            }
+        }
+    }
+}
diff --git a/test/framework/src/main/resources/org/elasticsearch/analysis/common/test1.yml b/test/framework/src/main/resources/org/elasticsearch/analysis/common/test1.yml
new file mode 100644
index 00000000000..f7a57d14dbe
--- /dev/null
+++ b/test/framework/src/main/resources/org/elasticsearch/analysis/common/test1.yml
@@ -0,0 +1,39 @@
+index :
+    analysis :
+        tokenizer :
+            standard :
+                type : standard
+        filter :
+            stop :
+                type : stop
+                stopwords : [test-stop]
+            stop2 :
+                type : stop
+                stopwords : [stop2-1, stop2-2]
+            my :
+                type : myfilter
+            dict_dec :
+                type : dictionary_decompounder
+                word_list : [donau, dampf, schiff, spargel, creme, suppe]
+        analyzer :
+            standard :
+                type : standard
+                stopwords : [test1, test2, test3]
+            custom1 :
+                tokenizer : standard
+                filter : [stop, stop2]
+            custom4 :
+                tokenizer : standard
+                filter : [my]
+            custom6 :
+                tokenizer : standard
+                position_increment_gap: 256
+            custom7 :
+                type : standard
+                version: 3.6
+            czechAnalyzerWithStemmer :
+                tokenizer : standard
+                filter : [standard, lowercase, stop, czech_stem]
+            decompoundingAnalyzer :
+                tokenizer : standard
+                filter : [dict_dec]
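Editor's note, not part of the patch: both the "decompounder" REST test and the dict_dec filter in the new test1.json/test1.yml exercise the dictionary_decompounder, which keeps the original token and additionally emits each dictionary word it finds inside it (foobar -> foobar, foo, bar). The Elasticsearch factory is a thin wrapper around Lucene's DictionaryCompoundWordTokenFilter, so its behaviour can be reproduced directly against Lucene as in the sketch below; the specific dictionary and the explicit size parameters are illustrative, chosen to mirror the test above.

// Standalone sketch using the underlying Lucene filter; not code from this change.
import java.io.IOException;
import java.io.StringReader;
import java.util.Arrays;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class DecompounderSketch {
    public static void main(String[] args) throws IOException {
        // Dictionary corresponding to word_list: [foo, bar] in the test settings.
        CharArraySet dictionary = new CharArraySet(Arrays.asList("foo", "bar"), true);

        // keyword tokenizer, as in the REST test: the whole input is one token.
        Tokenizer tokenizer = new KeywordTokenizer();
        tokenizer.setReader(new StringReader("foobar"));

        // Sizes spelled out for clarity; they match the usual defaults
        // (min_word_size 5, min_subword_size 2, max_subword_size 15).
        TokenStream stream = new DictionaryCompoundWordTokenFilter(
                tokenizer, dictionary, 5, 2, 15, false);

        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            System.out.println(term.toString()); // prints: foobar, foo, bar
        }
        stream.end();
        stream.close();
    }
}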