From 65f2717ab7a07b8c7881db63cfa44ec429b95f30 Mon Sep 17 00:00:00 2001
From: Nik Everett
Date: Wed, 10 May 2017 22:39:43 -0400
Subject: [PATCH] Make PreConfiguredTokenFilter harder to misuse (#24572)

There are now three public static methods to build instances of
PreConfiguredTokenFilter, and the ctor is private. I chose static
methods instead of constructors because they allow us to swap out the
implementation returned if we so desire.
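
To sketch the new API: a plugin now picks the factory method that
matches how its filter varies, instead of passing a CachingStrategy
explicitly. The first call below is taken from this change; the other
two use a hypothetical ExampleFilter purely for illustration:

    // Built once and shared regardless of version:
    PreConfiguredTokenFilter.singleton("asciifolding", true, ASCIIFoldingFilter::new);
    // Built once per Lucene version (ExampleFilter is hypothetical):
    PreConfiguredTokenFilter.luceneVersion("example", false,
        (tokenStream, luceneVersion) -> new ExampleFilter(tokenStream, luceneVersion));
    // Built once per Elasticsearch version (ExampleFilter is hypothetical):
    PreConfiguredTokenFilter.elasticsearchVersion("example", false,
        (tokenStream, esVersion) -> new ExampleFilter(tokenStream, esVersion));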

Relates to #23658
---
 .../analysis/PreConfiguredTokenFilter.java    | 46 ++++++++----
 .../indices/analysis/AnalysisModule.java      | 13 ++--
 .../analysis/PreBuiltTokenFilters.java        | 15 ----
 .../index/analysis/AnalysisRegistryTests.java |  6 +-
 .../index/analysis/CustomNormalizerTests.java |  3 +-
 .../index/mapper/KeywordFieldMapperTests.java |  3 +-
 .../indices/analysis/AnalysisModuleTests.java | 74 +++++++++++++++++++
 .../analysis/common/CommonAnalysisPlugin.java | 45 ++++++-----
 .../common/CommonAnalysisFactoryTests.java    |  4 +
 .../analysis/AnalysisFactoryTestCase.java     |  4 -
 10 files changed, 142 insertions(+), 71 deletions(-)

diff --git a/core/src/main/java/org/elasticsearch/index/analysis/PreConfiguredTokenFilter.java b/core/src/main/java/org/elasticsearch/index/analysis/PreConfiguredTokenFilter.java
index b410e8fb70e..1d9e4459c7e 100644
--- a/core/src/main/java/org/elasticsearch/index/analysis/PreConfiguredTokenFilter.java
+++ b/core/src/main/java/org/elasticsearch/index/analysis/PreConfiguredTokenFilter.java
@@ -27,6 +27,7 @@ import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.indices.analysis.AnalysisModule;
 import org.elasticsearch.indices.analysis.PreBuiltCacheFactory;
+import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
 
 import java.io.IOException;
 import java.util.function.BiFunction;
@@ -36,31 +37,46 @@ import java.util.function.Function;
  * Provides pre-configured, shared {@link TokenFilter}s.
  */
 public final class PreConfiguredTokenFilter implements AnalysisModule.AnalysisProvider<TokenFilterFactory> {
+    /**
+     * Create a pre-configured token filter that may not vary at all.
+     */
+    public static PreConfiguredTokenFilter singleton(String name, boolean useFilterForMultitermQueries,
+            Function<TokenStream, TokenStream> create) {
+        return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, CachingStrategy.ONE,
+                (tokenStream, version) -> create.apply(tokenStream));
+    }
+
+    /**
+     * Create a pre-configured token filter that may vary based on the Lucene version.
+     */
+    public static PreConfiguredTokenFilter luceneVersion(String name, boolean useFilterForMultitermQueries,
+            BiFunction<TokenStream, org.apache.lucene.util.Version, TokenStream> create) {
+        return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, CachingStrategy.LUCENE,
+                (tokenStream, version) -> create.apply(tokenStream, version.luceneVersion));
+    }
+
+    /**
+     * Create a pre-configured token filter that may vary based on the Elasticsearch version.
+     */
+    public static PreConfiguredTokenFilter elasticsearchVersion(String name, boolean useFilterForMultitermQueries,
+            BiFunction<TokenStream, org.elasticsearch.Version, TokenStream> create) {
+        return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, CachingStrategy.ELASTICSEARCH,
+                (tokenStream, version) -> create.apply(tokenStream, version));
+    }
+
     private final String name;
     private final boolean useFilterForMultitermQueries;
     private final PreBuiltCacheFactory.PreBuiltCache<TokenFilterFactory> cache;
    private final BiFunction<TokenStream, Version, TokenStream> create;
 
-    /**
-     * Standard ctor with all the power.
-     */
-    public PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries,
-            PreBuiltCacheFactory.CachingStrategy cachingStrategy, BiFunction<TokenStream, Version, TokenStream> create) {
+    private PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries,
+            PreBuiltCacheFactory.CachingStrategy cache, BiFunction<TokenStream, Version, TokenStream> create) {
         this.name = name;
         this.useFilterForMultitermQueries = useFilterForMultitermQueries;
-        cache = PreBuiltCacheFactory.getCache(cachingStrategy);
+        this.cache = PreBuiltCacheFactory.getCache(cache);
         this.create = create;
     }
 
-    /**
-     * Convenience ctor for token streams that don't vary based on version.
-     */
-    public PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries,
-            PreBuiltCacheFactory.CachingStrategy cachingStrategy, Function<TokenStream, TokenStream> create) {
-        this(name, useFilterForMultitermQueries, cachingStrategy, (input, version) -> create.apply(input));
-        // TODO why oh why aren't these all CachingStrategy.ONE? They *can't* vary based on version because they don't get it, right?!
-    }
-
     @Override
     public TokenFilterFactory get(IndexSettings indexSettings, Environment environment, String name, Settings settings) throws IOException {
         return getTokenFilterFactory(Version.indexCreated(settings));
diff --git a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java
index 06ef3e315c6..ffe80a0f5f4 100644
--- a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java
+++ b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java
@@ -272,10 +272,8 @@ public final class AnalysisModule {
         NamedRegistry<PreConfiguredTokenFilter> preConfiguredTokenFilters = new NamedRegistry<>("pre-configured token_filter");
 
         // Add filters available in lucene-core
-        preConfiguredTokenFilters.register("lowercase",
-                new PreConfiguredTokenFilter("lowercase", true, CachingStrategy.LUCENE, LowerCaseFilter::new));
-        preConfiguredTokenFilters.register("standard",
-                new PreConfiguredTokenFilter("standard", false, CachingStrategy.LUCENE, StandardFilter::new));
+        preConfiguredTokenFilters.register("lowercase", PreConfiguredTokenFilter.singleton("lowercase", true, LowerCaseFilter::new));
+        preConfiguredTokenFilters.register("standard", PreConfiguredTokenFilter.singleton("standard", false, StandardFilter::new));
         /* Note that "stop" is available in lucene-core but it's pre-built
          * version uses a set of English stop words that are in
          * lucene-analyzers-common so "stop" is defined in the analysis-common
@@ -288,9 +286,12 @@ public final class AnalysisModule {
                     // This has been migrated but has to stick around until PreBuiltTokenizers is removed.
                     continue;
                 default:
+                    if (CachingStrategy.ONE != preBuilt.getCachingStrategy()) {
+                        throw new UnsupportedOperationException("shim not available for " + preBuilt.getCachingStrategy());
+                    }
                     String name = preBuilt.name().toLowerCase(Locale.ROOT);
-                    preConfiguredTokenFilters.register(name,
-                            new PreConfiguredTokenFilter(name, preBuilt.isMultiTermAware(), preBuilt.getCachingStrategy(), preBuilt::create));
+                    preConfiguredTokenFilters.register(name, PreConfiguredTokenFilter.singleton(name, preBuilt.isMultiTermAware(),
+                            tokenStream -> preBuilt.create(tokenStream, Version.CURRENT)));
             }
         }
diff --git a/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java b/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java
index 02f6d8aadc5..427c0431fb5 100644
--- a/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java
+++ b/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java
@@ -30,7 +30,6 @@ import org.apache.lucene.analysis.core.DecimalDigitFilter;
 import org.apache.lucene.analysis.cz.CzechStemFilter;
 import org.apache.lucene.analysis.de.GermanNormalizationFilter;
 import org.apache.lucene.analysis.de.GermanStemFilter;
-import org.apache.lucene.analysis.en.PorterStemFilter;
 import org.apache.lucene.analysis.fa.PersianNormalizationFilter;
 import org.apache.lucene.analysis.fr.FrenchAnalyzer;
 import org.apache.lucene.analysis.hi.HindiNormalizationFilter;
@@ -70,20 +69,6 @@ public enum PreBuiltTokenFilters {
     },
 
     // Extended Token Filters
-    SNOWBALL(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new SnowballFilter(tokenStream, "English");
-        }
-    },
-
-    STEMMER(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new PorterStemFilter(tokenStream);
-        }
-    },
-
     ELISION(CachingStrategy.ONE) {
         @Override
         public TokenStream create(TokenStream tokenStream, Version version) {
diff --git a/core/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java b/core/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java
index 471d6f9cccc..03329667627 100644
--- a/core/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java
+++ b/core/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java
@@ -34,7 +34,6 @@ import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.indices.analysis.AnalysisModule;
 import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
 import org.elasticsearch.indices.analysis.PreBuiltAnalyzers;
-import org.elasticsearch.indices.analysis.PreBuiltCacheFactory;
 import org.elasticsearch.plugins.AnalysisPlugin;
 import org.elasticsearch.test.ESTestCase;
 import org.elasticsearch.test.IndexSettingsModule;
@@ -207,12 +206,11 @@ public class AnalysisRegistryTests extends ESTestCase {
 
     public void testPreConfiguredTokenFiltersAreCached() throws IOException {
         AtomicBoolean built = new AtomicBoolean(false);
-        PreConfiguredTokenFilter assertsBuiltOnce = new PreConfiguredTokenFilter("asserts_built_once", false,
-                PreBuiltCacheFactory.CachingStrategy.ONE, (tokens, version) -> {
+        PreConfiguredTokenFilter assertsBuiltOnce = PreConfiguredTokenFilter.singleton("asserts_built_once", false, tokenStream -> {
             if (false == built.compareAndSet(false, true)) {
                 fail("Attempted to build the token filter twice when it should have been cached");
             }
-            return new MockTokenFilter(tokens, MockTokenFilter.EMPTY_STOPSET);
+            return new MockTokenFilter(tokenStream, MockTokenFilter.EMPTY_STOPSET);
         });
         try (AnalysisRegistry registryWithPreBuiltTokenFilter = new AnalysisRegistry(emptyEnvironment, emptyMap(), emptyMap(),
                 emptyMap(), emptyMap(), emptyMap(), singletonMap("asserts_built_once", assertsBuiltOnce))) {
diff --git a/core/src/test/java/org/elasticsearch/index/analysis/CustomNormalizerTests.java b/core/src/test/java/org/elasticsearch/index/analysis/CustomNormalizerTests.java
index 5cdc5894057..a818d9c7178 100644
--- a/core/src/test/java/org/elasticsearch/index/analysis/CustomNormalizerTests.java
+++ b/core/src/test/java/org/elasticsearch/index/analysis/CustomNormalizerTests.java
@@ -24,7 +24,6 @@ import org.apache.lucene.util.BytesRef;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
-import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
 import org.elasticsearch.plugins.AnalysisPlugin;
 import org.elasticsearch.test.ESTestCase;
 import org.elasticsearch.test.ESTokenStreamTestCase;
@@ -113,7 +112,7 @@ public class CustomNormalizerTests extends ESTokenStreamTestCase {
     private static class MockAnalysisPlugin implements AnalysisPlugin {
         @Override
         public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
-            return singletonList(new PreConfiguredTokenFilter("mock_forbidden", false, CachingStrategy.ONE, MockLowerCaseFilter::new));
+            return singletonList(PreConfiguredTokenFilter.singleton("mock_forbidden", false, MockLowerCaseFilter::new));
         }
 
         @Override
diff --git a/core/src/test/java/org/elasticsearch/index/mapper/KeywordFieldMapperTests.java b/core/src/test/java/org/elasticsearch/index/mapper/KeywordFieldMapperTests.java
index 518f669f81f..3ecef3aa0f5 100644
--- a/core/src/test/java/org/elasticsearch/index/mapper/KeywordFieldMapperTests.java
+++ b/core/src/test/java/org/elasticsearch/index/mapper/KeywordFieldMapperTests.java
@@ -32,7 +32,6 @@ import org.elasticsearch.common.xcontent.XContentType;
 import org.elasticsearch.index.IndexService;
 import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
 import org.elasticsearch.index.mapper.MapperService.MergeReason;
-import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
 import org.elasticsearch.plugins.AnalysisPlugin;
 import org.elasticsearch.plugins.Plugin;
 import org.elasticsearch.test.ESSingleNodeTestCase;
@@ -55,7 +54,7 @@ public class KeywordFieldMapperTests extends ESSingleNodeTestCase {
     public static class MockAnalysisPlugin extends Plugin implements AnalysisPlugin {
         @Override
         public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
-            return singletonList(new PreConfiguredTokenFilter("mock_other_lowercase", true, CachingStrategy.ONE, MockLowerCaseFilter::new));
+            return singletonList(PreConfiguredTokenFilter.singleton("mock_other_lowercase", true, MockLowerCaseFilter::new));
         }
     };
diff --git a/core/src/test/java/org/elasticsearch/indices/analysis/AnalysisModuleTests.java b/core/src/test/java/org/elasticsearch/indices/analysis/AnalysisModuleTests.java
index 3d479ca2da2..2572b7aeb0f 100644
--- a/core/src/test/java/org/elasticsearch/indices/analysis/AnalysisModuleTests.java
+++ b/core/src/test/java/org/elasticsearch/indices/analysis/AnalysisModuleTests.java
@@ -20,6 +20,7 @@
 package org.elasticsearch.indices.analysis;
 
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
@@ -28,6 +29,7 @@ import org.apache.lucene.analysis.fa.PersianNormalizationFilter;
 import org.apache.lucene.analysis.hunspell.Dictionary;
 import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilter;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.SimpleFSDirectory;
 import org.elasticsearch.Version;
@@ -43,6 +45,7 @@ import org.elasticsearch.index.analysis.CharFilterFactory;
 import org.elasticsearch.index.analysis.CustomAnalyzer;
 import org.elasticsearch.index.analysis.IndexAnalyzers;
 import org.elasticsearch.index.analysis.NamedAnalyzer;
+import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
 import org.elasticsearch.index.analysis.StandardTokenizerFactory;
 import org.elasticsearch.index.analysis.StopTokenFilterFactory;
 import org.elasticsearch.index.analysis.TokenFilterFactory;
@@ -61,17 +64,23 @@ import java.io.StringReader;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.util.Arrays;
+import java.util.List;
 import java.util.Map;
 import java.util.Set;
 
 import static java.util.Collections.singletonList;
 import static java.util.Collections.singletonMap;
+import static org.apache.lucene.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;
 import static org.hamcrest.Matchers.either;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.instanceOf;
 import static org.hamcrest.Matchers.is;
 
 public class AnalysisModuleTests extends ESTestCase {
+    private final Settings emptyNodeSettings = Settings.builder()
+            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+            .build();
 
     public IndexAnalyzers getIndexAnalyzers(Settings settings) throws IOException {
         return getIndexAnalyzers(getNewRegistry(settings), settings);
@@ -264,6 +273,71 @@ public class AnalysisModuleTests extends ESTestCase {
         }
     }
 
+    /**
+     * Tests that plugins can register pre-configured token filters that vary in behavior based on Elasticsearch version, Lucene version,
+     * and that do not vary based on version at all.
+     */
+    public void testPluginPreConfiguredTokenFilters() throws IOException {
+        // Simple token filter that appends text to the term
+        final class AppendTokenFilter extends TokenFilter {
+            private final CharTermAttribute term = addAttribute(CharTermAttribute.class);
+            private final char[] appendMe;
+
+            protected AppendTokenFilter(TokenStream input, String appendMe) {
+                super(input);
+                this.appendMe = appendMe.toCharArray();
+            }
+
+            @Override
+            public boolean incrementToken() throws IOException {
+                if (false == input.incrementToken()) {
+                    return false;
+                }
+                term.resizeBuffer(term.length() + appendMe.length);
+                System.arraycopy(appendMe, 0, term.buffer(), term.length(), appendMe.length);
+                term.setLength(term.length() + appendMe.length);
+                return true;
+            }
+        }
+        boolean noVersionSupportsMultiTerm = randomBoolean();
+        boolean luceneVersionSupportsMultiTerm = randomBoolean();
+        boolean elasticsearchVersionSupportsMultiTerm = randomBoolean();
+        AnalysisRegistry registry = new AnalysisModule(new Environment(emptyNodeSettings), singletonList(new AnalysisPlugin() {
+            @Override
+            public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
+                return Arrays.asList(
+                        PreConfiguredTokenFilter.singleton("no_version", noVersionSupportsMultiTerm,
+                                tokenStream -> new AppendTokenFilter(tokenStream, "no_version")),
+                        PreConfiguredTokenFilter.luceneVersion("lucene_version", luceneVersionSupportsMultiTerm,
+                                (tokenStream, luceneVersion) -> new AppendTokenFilter(tokenStream, luceneVersion.toString())),
+                        PreConfiguredTokenFilter.elasticsearchVersion("elasticsearch_version", elasticsearchVersionSupportsMultiTerm,
+                                (tokenStream, esVersion) -> new AppendTokenFilter(tokenStream, esVersion.toString()))
+                );
+            }
+        })).getAnalysisRegistry();
+
+        Version version = VersionUtils.randomVersion(random());
+        IndexAnalyzers analyzers = getIndexAnalyzers(registry, Settings.builder()
+                .put("index.analysis.analyzer.no_version.tokenizer", "keyword")
+                .put("index.analysis.analyzer.no_version.filter", "no_version")
+                .put("index.analysis.analyzer.lucene_version.tokenizer", "keyword")
+                .put("index.analysis.analyzer.lucene_version.filter", "lucene_version")
+                .put("index.analysis.analyzer.elasticsearch_version.tokenizer", "keyword")
+                .put("index.analysis.analyzer.elasticsearch_version.filter", "elasticsearch_version")
+                .put(IndexMetaData.SETTING_VERSION_CREATED, version)
+                .build());
+        assertTokenStreamContents(analyzers.get("no_version").tokenStream("", "test"), new String[] {"testno_version"});
+        assertTokenStreamContents(analyzers.get("lucene_version").tokenStream("", "test"), new String[] {"test" + version.luceneVersion});
+        assertTokenStreamContents(analyzers.get("elasticsearch_version").tokenStream("", "test"), new String[] {"test" + version});
+
+        assertEquals("test" + (noVersionSupportsMultiTerm ? "no_version" : ""),
+                analyzers.get("no_version").normalize("", "test").utf8ToString());
+        assertEquals("test" + (luceneVersionSupportsMultiTerm ? version.luceneVersion.toString() : ""),
+                analyzers.get("lucene_version").normalize("", "test").utf8ToString());
+        assertEquals("test" + (elasticsearchVersionSupportsMultiTerm ? version.toString() : ""),
+                analyzers.get("elasticsearch_version").normalize("", "test").utf8ToString());
+    }
+
     public void testRegisterHunspellDictionary() throws Exception {
         Settings settings = Settings.builder()
                 .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
index afe235ac8a5..1261d15ed65 100644
--- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
@@ -36,13 +36,13 @@ import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
 import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
 import org.apache.lucene.analysis.ngram.NGramTokenFilter;
 import org.apache.lucene.analysis.reverse.ReverseStringFilter;
+import org.apache.lucene.analysis.snowball.SnowballFilter;
 import org.apache.lucene.analysis.standard.ClassicFilter;
 import org.elasticsearch.index.analysis.CharFilterFactory;
 import org.elasticsearch.index.analysis.HtmlStripCharFilterFactory;
 import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
 import org.elasticsearch.index.analysis.TokenFilterFactory;
 import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
-import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
 import org.elasticsearch.plugins.AnalysisPlugin;
 import org.elasticsearch.plugins.Plugin;
 
@@ -73,41 +73,40 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
     @Override
     public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
-        // TODO we should revisit the caching strategies.
         List<PreConfiguredTokenFilter> filters = new ArrayList<>();
-        filters.add(new PreConfiguredTokenFilter("asciifolding", true, CachingStrategy.ONE, input -> new ASCIIFoldingFilter(input)));
-        filters.add(new PreConfiguredTokenFilter("classic", false, CachingStrategy.ONE, ClassicFilter::new));
-        filters.add(new PreConfiguredTokenFilter("common_grams", false, CachingStrategy.LUCENE, input ->
-                new CommonGramsFilter(input, CharArraySet.EMPTY_SET)));
-        filters.add(new PreConfiguredTokenFilter("edge_ngram", false, CachingStrategy.LUCENE, input ->
+        filters.add(PreConfiguredTokenFilter.singleton("asciifolding", true, input -> new ASCIIFoldingFilter(input)));
+        filters.add(PreConfiguredTokenFilter.singleton("classic", false, ClassicFilter::new));
+        filters.add(PreConfiguredTokenFilter.singleton("common_grams", false,
+                input -> new CommonGramsFilter(input, CharArraySet.EMPTY_SET)));
+        filters.add(PreConfiguredTokenFilter.singleton("edge_ngram", false, input ->
                 new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE)));
         // TODO deprecate edgeNGram
-        filters.add(new PreConfiguredTokenFilter("edgeNGram", false, CachingStrategy.LUCENE, input ->
+        filters.add(PreConfiguredTokenFilter.singleton("edgeNGram", false, input ->
                 new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE)));
-        filters.add(new PreConfiguredTokenFilter("kstem", false, CachingStrategy.ONE, KStemFilter::new));
-        filters.add(new PreConfiguredTokenFilter("length", false, CachingStrategy.LUCENE, input ->
+        filters.add(PreConfiguredTokenFilter.singleton("kstem", false, KStemFilter::new));
+        filters.add(PreConfiguredTokenFilter.singleton("length", false, input ->
                 new LengthFilter(input, 0, Integer.MAX_VALUE)));  // TODO this one seems useless
-        filters.add(new PreConfiguredTokenFilter("ngram", false, CachingStrategy.LUCENE, NGramTokenFilter::new));
+        filters.add(PreConfiguredTokenFilter.singleton("ngram", false, NGramTokenFilter::new));
         // TODO deprecate nGram
-        filters.add(new PreConfiguredTokenFilter("nGram", false, CachingStrategy.LUCENE, NGramTokenFilter::new));
-        filters.add(new PreConfiguredTokenFilter("porter_stem", false, CachingStrategy.ONE, PorterStemFilter::new));
-        filters.add(new PreConfiguredTokenFilter("reverse", false, CachingStrategy.LUCENE, input -> new ReverseStringFilter(input)));
+        filters.add(PreConfiguredTokenFilter.singleton("nGram", false, NGramTokenFilter::new));
+        filters.add(PreConfiguredTokenFilter.singleton("porter_stem", false, PorterStemFilter::new));
+        filters.add(PreConfiguredTokenFilter.singleton("reverse", false, input -> new ReverseStringFilter(input)));
+        filters.add(PreConfiguredTokenFilter.singleton("snowball", false, input -> new SnowballFilter(input, "English")));
+        filters.add(PreConfiguredTokenFilter.singleton("stemmer", false, PorterStemFilter::new));
         // The stop filter is in lucene-core but the English stop words set is in lucene-analyzers-common
-        filters.add(new PreConfiguredTokenFilter("stop", false, CachingStrategy.LUCENE, input ->
-                new StopFilter(input, StopAnalyzer.ENGLISH_STOP_WORDS_SET)));
-        filters.add(new PreConfiguredTokenFilter("trim", false, CachingStrategy.LUCENE, TrimFilter::new));
-        filters.add(new PreConfiguredTokenFilter("truncate", false, CachingStrategy.ONE, input ->
-                new TruncateTokenFilter(input, 10)));
-        filters.add(new PreConfiguredTokenFilter("unique", false, CachingStrategy.ONE, input -> new UniqueTokenFilter(input)));
PreConfiguredTokenFilter("uppercase", true, CachingStrategy.LUCENE, UpperCaseFilter::new)); - filters.add(new PreConfiguredTokenFilter("word_delimiter", false, CachingStrategy.ONE, input -> + filters.add(PreConfiguredTokenFilter.singleton("stop", false, input -> new StopFilter(input, StopAnalyzer.ENGLISH_STOP_WORDS_SET))); + filters.add(PreConfiguredTokenFilter.singleton("trim", false, TrimFilter::new)); + filters.add(PreConfiguredTokenFilter.singleton("truncate", false, input -> new TruncateTokenFilter(input, 10))); + filters.add(PreConfiguredTokenFilter.singleton("unique", false, input -> new UniqueTokenFilter(input))); + filters.add(PreConfiguredTokenFilter.singleton("uppercase", true, UpperCaseFilter::new)); + filters.add(PreConfiguredTokenFilter.singleton("word_delimiter", false, input -> new WordDelimiterFilter(input, WordDelimiterFilter.GENERATE_WORD_PARTS | WordDelimiterFilter.GENERATE_NUMBER_PARTS | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE | WordDelimiterFilter.SPLIT_ON_NUMERICS | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE, null))); - filters.add(new PreConfiguredTokenFilter("word_delimiter_graph", false, CachingStrategy.ONE, input -> + filters.add(PreConfiguredTokenFilter.singleton("word_delimiter_graph", false, input -> new WordDelimiterGraphFilter(input, WordDelimiterGraphFilter.GENERATE_WORD_PARTS | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java index 73a6c3d2732..cf78f6646a2 100644 --- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java @@ -19,7 +19,9 @@ package org.elasticsearch.analysis.common; +import org.apache.lucene.analysis.en.PorterStemFilterFactory; import org.apache.lucene.analysis.reverse.ReverseStringFilterFactory; +import org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory; import org.elasticsearch.index.analysis.HtmlStripCharFilterFactory; import org.elasticsearch.indices.analysis.AnalysisFactoryTestCase; @@ -77,6 +79,8 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase { filters.put("nGram", null); filters.put("porter_stem", null); filters.put("reverse", ReverseStringFilterFactory.class); + filters.put("snowball", SnowballPorterFilterFactory.class); + filters.put("stemmer", PorterStemFilterFactory.class); filters.put("stop", null); filters.put("trim", null); filters.put("truncate", null); diff --git a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java index 534db0be39f..fe22734d974 100644 --- a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java +++ b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java @@ -350,15 +350,11 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase { case LOWERCASE: // This has been migrated but has to stick around until PreBuiltTokenizers is removed. 
                 continue;
-            case SNOWBALL:
             case DUTCH_STEM:
             case FRENCH_STEM:
             case RUSSIAN_STEM:
                 luceneFactoryClass = SnowballPorterFilterFactory.class;
                 break;
-            case STEMMER:
-                luceneFactoryClass = PorterStemFilterFactory.class;
-                break;
             case DELIMITED_PAYLOAD_FILTER:
                 luceneFactoryClass = org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory.class;
                 break;