diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java index ea0b69c678b..94f5de8278f 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java @@ -401,7 +401,7 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin, Scri filters.add(PreConfiguredTokenFilter.singleton("cjk_bigram", false, CJKBigramFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("cjk_width", true, CJKWidthFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("classic", false, ClassicFilter::new)); - filters.add(PreConfiguredTokenFilter.singleton("common_grams", false, + filters.add(PreConfiguredTokenFilter.singleton("common_grams", false, false, input -> new CommonGramsFilter(input, CharArraySet.EMPTY_SET))); filters.add(PreConfiguredTokenFilter.singleton("czech_stem", false, CzechStemFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("decimal_digit", true, DecimalDigitFilter::new)); @@ -422,9 +422,9 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin, Scri DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER, DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER))); filters.add(PreConfiguredTokenFilter.singleton("dutch_stem", false, input -> new SnowballFilter(input, new DutchStemmer()))); - filters.add(PreConfiguredTokenFilter.singleton("edge_ngram", false, input -> + filters.add(PreConfiguredTokenFilter.singleton("edge_ngram", false, false, input -> new EdgeNGramTokenFilter(input, 1))); - filters.add(PreConfiguredTokenFilter.singletonWithVersion("edgeNGram", false, (reader, version) -> { + filters.add(PreConfiguredTokenFilter.singletonWithVersion("edgeNGram", false, false, (reader, version) -> { if (version.onOrAfter(org.elasticsearch.Version.V_7_0_0)) { throw new IllegalArgumentException( "The [edgeNGram] token filter name was deprecated in 6.4 and cannot be used in new indices. " @@ -451,8 +451,8 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin, Scri new LimitTokenCountFilter(input, LimitTokenCountFilterFactory.DEFAULT_MAX_TOKEN_COUNT, LimitTokenCountFilterFactory.DEFAULT_CONSUME_ALL_TOKENS))); - filters.add(PreConfiguredTokenFilter.singleton("ngram", false, reader -> new NGramTokenFilter(reader, 1, 2, false))); - filters.add(PreConfiguredTokenFilter.singletonWithVersion("nGram", false, (reader, version) -> { + filters.add(PreConfiguredTokenFilter.singleton("ngram", false, false, reader -> new NGramTokenFilter(reader, 1, 2, false))); + filters.add(PreConfiguredTokenFilter.singletonWithVersion("nGram", false, false, (reader, version) -> { if (version.onOrAfter(org.elasticsearch.Version.V_7_0_0)) { throw new IllegalArgumentException("The [nGram] token filter name was deprecated in 6.4 and cannot be used in new indices. " + "Please change the filter name to [ngram] instead."); @@ -469,7 +469,7 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin, Scri filters.add(PreConfiguredTokenFilter.singleton("russian_stem", false, input -> new SnowballFilter(input, "Russian"))); filters.add(PreConfiguredTokenFilter.singleton("scandinavian_folding", true, ScandinavianFoldingFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("scandinavian_normalization", true, ScandinavianNormalizationFilter::new)); - filters.add(PreConfiguredTokenFilter.singleton("shingle", false, input -> { + filters.add(PreConfiguredTokenFilter.singleton("shingle", false, false, input -> { TokenStream ts = new ShingleFilter(input); /** * We disable the graph analysis on this token stream @@ -491,14 +491,14 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin, Scri filters.add(PreConfiguredTokenFilter.singleton("type_as_payload", false, TypeAsPayloadTokenFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("unique", false, UniqueTokenFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("uppercase", true, UpperCaseFilter::new)); - filters.add(PreConfiguredTokenFilter.singleton("word_delimiter", false, input -> + filters.add(PreConfiguredTokenFilter.singleton("word_delimiter", false, false, input -> new WordDelimiterFilter(input, WordDelimiterFilter.GENERATE_WORD_PARTS | WordDelimiterFilter.GENERATE_NUMBER_PARTS | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE | WordDelimiterFilter.SPLIT_ON_NUMERICS | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE, null))); - filters.add(PreConfiguredTokenFilter.singletonWithVersion("word_delimiter_graph", false, (input, version) -> { + filters.add(PreConfiguredTokenFilter.singletonWithVersion("word_delimiter_graph", false, false, (input, version) -> { boolean adjustOffsets = version.onOrAfter(Version.V_7_3_0); return new WordDelimiterGraphFilter(input, adjustOffsets, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, WordDelimiterGraphFilter.GENERATE_WORD_PARTS diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/SynonymsAnalysisTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/SynonymsAnalysisTests.java index a63dd975688..6582188f33c 100644 --- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/SynonymsAnalysisTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/SynonymsAnalysisTests.java @@ -30,6 +30,7 @@ import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.analysis.IndexAnalyzers; +import org.elasticsearch.index.analysis.PreConfiguredTokenFilter; import org.elasticsearch.index.analysis.TokenFilterFactory; import org.elasticsearch.index.analysis.TokenizerFactory; import org.elasticsearch.test.ESTestCase; @@ -42,8 +43,11 @@ import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; +import java.util.HashSet; import java.util.List; +import java.util.Set; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.instanceOf; @@ -163,23 +167,21 @@ public class SynonymsAnalysisTests extends ESTestCase { new int[]{ 1, 0 }); } - public void testKeywordRepeatAndSynonyms() throws IOException { + public void testPreconfigured() throws IOException { Settings settings = Settings.builder() .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) .put("path.home", createTempDir().toString()) .put("index.analysis.filter.synonyms.type", "synonym") - .putList("index.analysis.filter.synonyms.synonyms", "programmer, developer") - .put("index.analysis.filter.my_english.type", "stemmer") - .put("index.analysis.filter.my_english.language", "porter2") - .put("index.analysis.analyzer.synonymAnalyzer.tokenizer", "standard") - .putList("index.analysis.analyzer.synonymAnalyzer.filter", "lowercase", "keyword_repeat", "my_english", "synonyms") + .putList("index.analysis.filter.synonyms.synonyms", "würst, sausage") + .put("index.analysis.analyzer.my_analyzer.tokenizer", "standard") + .putList("index.analysis.analyzer.my_analyzer.filter", "lowercase", "asciifolding", "synonyms") .build(); IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings); indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers; - BaseTokenStreamTestCase.assertAnalyzesTo(indexAnalyzers.get("synonymAnalyzer"), "programmers", - new String[]{ "programmers", "programm", "develop" }, - new int[]{ 1, 0, 0 }); + BaseTokenStreamTestCase.assertAnalyzesTo(indexAnalyzers.get("my_analyzer"), "würst", + new String[]{ "wurst", "sausage"}, + new int[]{ 1, 0 }); } public void testChainedSynonymFilters() throws IOException { @@ -248,6 +250,58 @@ public class SynonymsAnalysisTests extends ESTestCase { } + public void testPreconfiguredTokenFilters() throws IOException { + Set disallowedFilters = new HashSet<>(Arrays.asList( + "common_grams", "edge_ngram", "edgeNGram", "keyword_repeat", "ngram", "nGram", + "shingle", "word_delimiter", "word_delimiter_graph" + )); + + Settings settings = Settings.builder() + .put(IndexMetaData.SETTING_VERSION_CREATED, + VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, Version.CURRENT)) + .put("path.home", createTempDir().toString()) + .build(); + IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings); + + CommonAnalysisPlugin plugin = new CommonAnalysisPlugin(); + + for (PreConfiguredTokenFilter tf : plugin.getPreConfiguredTokenFilters()) { + if (disallowedFilters.contains(tf.getName())) { + IllegalArgumentException e = expectThrows(IllegalArgumentException.class, + "Expected exception for factory " + tf.getName(), () -> { + tf.get(idxSettings, null, tf.getName(), settings).getSynonymFilter(); + }); + assertEquals(tf.getName(), "Token filter [" + tf.getName() + + "] cannot be used to parse synonyms", + e.getMessage()); + } + else { + tf.get(idxSettings, null, tf.getName(), settings).getSynonymFilter(); + } + } + + Settings settings2 = Settings.builder() + .put(IndexMetaData.SETTING_VERSION_CREATED, + VersionUtils.randomVersionBetween(random(), Version.V_6_0_0, VersionUtils.getPreviousVersion(Version.V_7_0_0))) + .put("path.home", createTempDir().toString()) + .putList("common_words", "a", "b") + .put("output_unigrams", "true") + .build(); + IndexSettings idxSettings2 = IndexSettingsModule.newIndexSettings("index", settings2); + + List expectedWarnings = new ArrayList<>(); + for (PreConfiguredTokenFilter tf : plugin.getPreConfiguredTokenFilters()) { + if (disallowedFilters.contains(tf.getName())) { + tf.get(idxSettings2, null, tf.getName(), settings2).getSynonymFilter(); + expectedWarnings.add("Token filter [" + tf.getName() + "] will not be usable to parse synonyms after v7.0"); + } + else { + tf.get(idxSettings2, null, tf.getName(), settings2).getSynonymFilter(); + } + } + assertWarnings(expectedWarnings.toArray(new String[0])); + } + public void testDisallowedTokenFilters() throws IOException { Settings settings = Settings.builder() diff --git a/server/src/main/java/org/elasticsearch/index/analysis/PreConfiguredTokenFilter.java b/server/src/main/java/org/elasticsearch/index/analysis/PreConfiguredTokenFilter.java index 123802c9510..5776edd69fc 100644 --- a/server/src/main/java/org/elasticsearch/index/analysis/PreConfiguredTokenFilter.java +++ b/server/src/main/java/org/elasticsearch/index/analysis/PreConfiguredTokenFilter.java @@ -19,9 +19,11 @@ package org.elasticsearch.index.analysis; +import org.apache.logging.log4j.LogManager; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.elasticsearch.Version; +import org.elasticsearch.common.logging.DeprecationLogger; import org.elasticsearch.indices.analysis.PreBuiltCacheFactory; import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy; @@ -32,12 +34,16 @@ import java.util.function.Function; * Provides pre-configured, shared {@link TokenFilter}s. */ public final class PreConfiguredTokenFilter extends PreConfiguredAnalysisComponent { + + private static final DeprecationLogger DEPRECATION_LOGGER + = new DeprecationLogger(LogManager.getLogger(PreConfiguredTokenFilter.class)); + /** * Create a pre-configured token filter that may not vary at all. */ public static PreConfiguredTokenFilter singleton(String name, boolean useFilterForMultitermQueries, Function create) { - return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, false, CachingStrategy.ONE, + return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, true, CachingStrategy.ONE, (tokenStream, version) -> create.apply(tokenStream)); } @@ -45,27 +51,37 @@ public final class PreConfiguredTokenFilter extends PreConfiguredAnalysisCompone * Create a pre-configured token filter that may not vary at all. */ public static PreConfiguredTokenFilter singleton(String name, boolean useFilterForMultitermQueries, - boolean useFilterForParsingSynonyms, + boolean allowForSynonymParsing, Function create) { - return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, useFilterForParsingSynonyms, CachingStrategy.ONE, + return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, allowForSynonymParsing, CachingStrategy.ONE, (tokenStream, version) -> create.apply(tokenStream)); } /** - * Create a pre-configured token filter that may not vary at all. + * Create a pre-configured token filter that may vary based on the Elasticsearch version. */ public static PreConfiguredTokenFilter singletonWithVersion(String name, boolean useFilterForMultitermQueries, BiFunction create) { - return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, false, CachingStrategy.ONE, + return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, true, CachingStrategy.ONE, (tokenStream, version) -> create.apply(tokenStream, version)); } + /** + * Create a pre-configured token filter that may vary based on the Elasticsearch version. + */ + public static PreConfiguredTokenFilter singletonWithVersion(String name, boolean useFilterForMultitermQueries, + boolean useFilterForParsingSynonyms, + BiFunction create) { + return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, useFilterForParsingSynonyms, CachingStrategy.ONE, + (tokenStream, version) -> create.apply(tokenStream, version)); + } + /** * Create a pre-configured token filter that may vary based on the Lucene version. */ public static PreConfiguredTokenFilter luceneVersion(String name, boolean useFilterForMultitermQueries, BiFunction create) { - return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, false, CachingStrategy.LUCENE, + return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, true, CachingStrategy.LUCENE, (tokenStream, version) -> create.apply(tokenStream, version.luceneVersion)); } @@ -74,18 +90,18 @@ public final class PreConfiguredTokenFilter extends PreConfiguredAnalysisCompone */ public static PreConfiguredTokenFilter elasticsearchVersion(String name, boolean useFilterForMultitermQueries, BiFunction create) { - return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, false, CachingStrategy.ELASTICSEARCH, create); + return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, true, CachingStrategy.ELASTICSEARCH, create); } private final boolean useFilterForMultitermQueries; - private final boolean useFilterForParsingSynonyms; + private final boolean allowForSynonymParsing; private final BiFunction create; - private PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries, boolean useFilterForParsingSynonyms, + private PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries, boolean allowForSynonymParsing, PreBuiltCacheFactory.CachingStrategy cache, BiFunction create) { super(name, cache); this.useFilterForMultitermQueries = useFilterForMultitermQueries; - this.useFilterForParsingSynonyms = useFilterForParsingSynonyms; + this.allowForSynonymParsing = allowForSynonymParsing; this.create = create; } @@ -118,10 +134,17 @@ public final class PreConfiguredTokenFilter extends PreConfiguredAnalysisCompone @Override public TokenFilterFactory getSynonymFilter() { - if (useFilterForParsingSynonyms) { + if (allowForSynonymParsing) { + return this; + } + if (version.onOrAfter(Version.V_7_0_0)) { + throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms"); + } + else { + DEPRECATION_LOGGER.deprecatedAndMaybeLog(name(), "Token filter [" + name() + + "] will not be usable to parse synonyms after v7.0"); return this; } - return IDENTITY_FILTER; } }; } @@ -138,10 +161,17 @@ public final class PreConfiguredTokenFilter extends PreConfiguredAnalysisCompone @Override public TokenFilterFactory getSynonymFilter() { - if (useFilterForParsingSynonyms) { + if (allowForSynonymParsing) { + return this; + } + if (version.onOrAfter(Version.V_7_0_0)) { + throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms"); + } + else { + DEPRECATION_LOGGER.deprecatedAndMaybeLog(name(), "Token filter [" + name() + + "] will not be usable to parse synonyms after v7.0"); return this; } - return IDENTITY_FILTER; } }; }