From a646f85a999c087668ef088387f1b918f4f3a8a3 Mon Sep 17 00:00:00 2001
From: Alan Woodward
Date: Thu, 29 Nov 2018 10:35:38 +0000
Subject: [PATCH] Ensure TokenFilters only produce single tokens when parsing synonyms (#34331)

A number of token filters can produce multiple tokens at the same position. This
is a problem when using token filter chains to parse synonym files, as the
SynonymMap requires that there are no stacked tokens in its input. This commit
ensures that, when used to parse synonyms, these token filters either produce a
single version of their input token or throw an error when mappings are
generated. In indices created in Elasticsearch 6.x, deprecation warnings are
emitted in place of the error.

* asciifolding and cjk_bigram produce only the folded or bigrammed token
* decompounders, synonyms and keyword_repeat are skipped
* n-grams, word-delimiter-filter, multiplexer, fingerprint and phonetic throw errors

Fixes #34298
---
 .../synonym-graph-tokenfilter.asciidoc | 12 ++
 .../tokenfilters/synonym-tokenfilter.asciidoc | 13 ++
 .../ASCIIFoldingTokenFilterFactory.java | 21 ++
 ...bstractCompoundWordTokenFilterFactory.java | 6 +
 .../common/CJKBigramFilterFactory.java | 21 ++
 .../analysis/common/CommonAnalysisPlugin.java | 2 +-
 .../common/CommonGramsTokenFilterFactory.java | 19 ++
 .../common/EdgeNGramTokenFilterFactory.java | 19 ++
 .../common/FingerprintTokenFilterFactory.java | 19 ++
 .../common/MultiplexerTokenFilterFactory.java | 35 +++-
 .../common/NGramTokenFilterFactory.java | 20 +-
 .../SynonymGraphTokenFilterFactory.java | 2 +-
 .../common/SynonymTokenFilterFactory.java | 19 +-
 .../WordDelimiterGraphTokenFilterFactory.java | 19 ++
 .../WordDelimiterTokenFilterFactory.java | 19 ++
 .../common/SynonymsAnalysisTests.java | 183 +++++++++++++++++-
 .../analysis/PhoneticTokenFilterFactory.java | 19 ++
 .../AnalysisPhoneticFactoryTests.java | 36 ++++
 .../analysis/PreConfiguredTokenFilter.java | 37 +++-
 .../analysis/ShingleTokenFilterFactory.java | 22 ++-
 .../index/analysis/TokenFilterFactory.java | 3 +-
 21 files changed, 529 insertions(+), 17 deletions(-)

diff --git a/docs/reference/analysis/tokenfilters/synonym-graph-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/synonym-graph-tokenfilter.asciidoc
index 8be5647e10f..2a555d7d044 100644
--- a/docs/reference/analysis/tokenfilters/synonym-graph-tokenfilter.asciidoc
+++ b/docs/reference/analysis/tokenfilters/synonym-graph-tokenfilter.asciidoc
@@ -175,3 +175,15 @@ PUT /test_index
 Using `synonyms_path` to define WordNet synonyms in a file is supported as well.
+
+=== Parsing synonym files
+
+Elasticsearch will use the token filters preceding the synonym filter
+in a tokenizer chain to parse the entries in a synonym file. So, for example, if a
+synonym filter is placed after a stemmer, then the stemmer will also be applied
+to the synonym entries. Because entries in the synonym map cannot have stacked
+positions, some token filters may cause issues here. Token filters that produce
+multiple versions of a token may choose which version of the token to emit when
+parsing synonyms; for example, `asciifolding` will only produce the folded version of the
+token. Others, such as `multiplexer`, `word_delimiter_graph` or `ngram`, will throw an
+error.
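To make the new contract concrete, here is a minimal sketch of the pattern this commit applies; it is not part of the patch, and the `folding_example` factory name is invented. At index and search time the filter may stack the original token on top of the folded one, but the variant returned from `getSynonymFilter()` emits exactly one token per position, so the `SynonymMap` built from a synonym file never sees stacked input. `TokenFilterFactory` and Lucene's `ASCIIFoldingFilter` are the real APIs used elsewhere in this patch.

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.elasticsearch.index.analysis.TokenFilterFactory;

// Hypothetical factory, mirroring the asciifolding change below: stack tokens during
// normal analysis, but emit only the folded token when parsing synonyms.
public class FoldingExampleFilterFactory implements TokenFilterFactory {

    @Override
    public String name() {
        return "folding_example";
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        // preserveOriginal = true keeps the unfolded token at the same position
        return new ASCIIFoldingFilter(tokenStream, true);
    }

    @Override
    public TokenFilterFactory getSynonymFilter() {
        // single-token variant used only while parsing synonym files
        return new TokenFilterFactory() {
            @Override
            public String name() {
                return FoldingExampleFilterFactory.this.name();
            }

            @Override
            public TokenStream create(TokenStream tokenStream) {
                return new ASCIIFoldingFilter(tokenStream, false);
            }
        };
    }
}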
diff --git a/docs/reference/analysis/tokenfilters/synonym-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/synonym-tokenfilter.asciidoc
index 2d8fa93147a..d0659f3425d 100644
--- a/docs/reference/analysis/tokenfilters/synonym-tokenfilter.asciidoc
+++ b/docs/reference/analysis/tokenfilters/synonym-tokenfilter.asciidoc
@@ -163,3 +163,16 @@ PUT /test_index
 Using `synonyms_path` to define WordNet synonyms in a file is supported as well.
+
+
+=== Parsing synonym files
+
+Elasticsearch will use the token filters preceding the synonym filter
+in a tokenizer chain to parse the entries in a synonym file. So, for example, if a
+synonym filter is placed after a stemmer, then the stemmer will also be applied
+to the synonym entries. Because entries in the synonym map cannot have stacked
+positions, some token filters may cause issues here. Token filters that produce
+multiple versions of a token may choose which version of the token to emit when
+parsing synonyms; for example, `asciifolding` will only produce the folded version of the
+token. Others, such as `multiplexer`, `word_delimiter_graph` or `ngram`, will throw an
+error.
\ No newline at end of file
diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ASCIIFoldingTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ASCIIFoldingTokenFilterFactory.java
index 83e71d5d858..24fed9ae5ab 100644
--- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ASCIIFoldingTokenFilterFactory.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ASCIIFoldingTokenFilterFactory.java
@@ -27,6 +27,7 @@ import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
 import org.elasticsearch.index.analysis.NormalizingTokenFilterFactory;
+import org.elasticsearch.index.analysis.TokenFilterFactory;
 
 /**
  * Factory for ASCIIFoldingFilter.
@@ -51,8 +52,28 @@ public class ASCIIFoldingTokenFilterFactory extends AbstractTokenFilterFactory } @Override + public TokenFilterFactory getSynonymFilter() { + if (preserveOriginal == false) { + return this; + } else { + // See https://issues.apache.org/jira/browse/LUCENE-7536 for the reasoning + return new TokenFilterFactory() { + @Override + public String name() { + return ASCIIFoldingTokenFilterFactory.this.name(); + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new ASCIIFoldingFilter(tokenStream, false); + } + }; + } + } + public TokenStream normalize(TokenStream tokenStream) { // Normalization should only emit a single token, so always turn off preserveOriginal return new ASCIIFoldingFilter(tokenStream, false); } + } diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/AbstractCompoundWordTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/AbstractCompoundWordTokenFilterFactory.java index 92d32c57150..f061ff4dc22 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/AbstractCompoundWordTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/AbstractCompoundWordTokenFilterFactory.java @@ -26,6 +26,7 @@ import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; import org.elasticsearch.index.analysis.Analysis; +import org.elasticsearch.index.analysis.TokenFilterFactory; /** * Contains the common configuration settings between subclasses of this class. @@ -50,4 +51,9 @@ public abstract class AbstractCompoundWordTokenFilterFactory extends AbstractTok throw new IllegalArgumentException("word_list must be provided for [" + name + "], either as a path to a file, or directly"); } } + + @Override + public TokenFilterFactory getSynonymFilter() { + return IDENTITY_FILTER; // don't decompound synonym file + } } diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CJKBigramFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CJKBigramFilterFactory.java index be1f2495f0b..a794c409e4e 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CJKBigramFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CJKBigramFilterFactory.java @@ -19,13 +19,17 @@ package org.elasticsearch.analysis.common; +import org.apache.logging.log4j.LogManager; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.cjk.CJKBigramFilter; import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute; +import org.elasticsearch.Version; +import org.elasticsearch.common.logging.DeprecationLogger; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; +import org.elasticsearch.index.analysis.TokenFilterFactory; import java.util.Arrays; import java.util.HashSet; @@ -48,6 +52,9 @@ import java.util.Set; */ public final class CJKBigramFilterFactory extends AbstractTokenFilterFactory { + private static final DeprecationLogger DEPRECATION_LOGGER + = new DeprecationLogger(LogManager.getLogger(CJKBigramFilterFactory.class)); + private final int flags; private final boolean outputUnigrams; @@ -89,4 +96,18 @@ public final class 
CJKBigramFilterFactory extends AbstractTokenFilterFactory { return filter; } + @Override + public TokenFilterFactory getSynonymFilter() { + if (outputUnigrams) { + if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0)) { + throw new IllegalArgumentException("Token filter [" + name() + + "] cannot be used to parse synonyms"); + } + else { + DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name() + + "] will not be usable to parse synonyms after v7.0"); + } + } + return this; + } } diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java index a394efdfeb6..ad0e59c2767 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java @@ -426,7 +426,7 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin, Scri filters.add(PreConfiguredTokenFilter.singleton("german_stem", false, GermanStemFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("hindi_normalization", true, HindiNormalizationFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("indic_normalization", true, IndicNormalizationFilter::new)); - filters.add(PreConfiguredTokenFilter.singleton("keyword_repeat", false, KeywordRepeatFilter::new)); + filters.add(PreConfiguredTokenFilter.singleton("keyword_repeat", false, false, KeywordRepeatFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("kstem", false, KStemFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("length", false, input -> new LengthFilter(input, 0, Integer.MAX_VALUE))); // TODO this one seems useless diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonGramsTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonGramsTokenFilterFactory.java index 8de6dcacb73..933a02f74e7 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonGramsTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonGramsTokenFilterFactory.java @@ -19,18 +19,25 @@ package org.elasticsearch.analysis.common; +import org.apache.logging.log4j.LogManager; import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.commongrams.CommonGramsFilter; import org.apache.lucene.analysis.commongrams.CommonGramsQueryFilter; +import org.elasticsearch.Version; +import org.elasticsearch.common.logging.DeprecationLogger; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; import org.elasticsearch.index.analysis.Analysis; +import org.elasticsearch.index.analysis.TokenFilterFactory; public class CommonGramsTokenFilterFactory extends AbstractTokenFilterFactory { + private static final DeprecationLogger DEPRECATION_LOGGER + = new DeprecationLogger(LogManager.getLogger(CommonGramsTokenFilterFactory.class)); + private final CharArraySet words; private final boolean ignoreCase; @@ -58,5 +65,17 @@ public class CommonGramsTokenFilterFactory extends AbstractTokenFilterFactory { return filter; } } + + @Override + public TokenFilterFactory 
getSynonymFilter() { + if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0)) { + throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms"); + } else { + DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name() + + "] will not be usable to parse synonyms after v7.0"); + } + + return this; + } } diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EdgeNGramTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EdgeNGramTokenFilterFactory.java index 6bcd2b737fe..128b3d1cf82 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EdgeNGramTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EdgeNGramTokenFilterFactory.java @@ -19,17 +19,24 @@ package org.elasticsearch.analysis.common; +import org.apache.logging.log4j.LogManager; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter; import org.apache.lucene.analysis.reverse.ReverseStringFilter; +import org.elasticsearch.Version; +import org.elasticsearch.common.logging.DeprecationLogger; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; +import org.elasticsearch.index.analysis.TokenFilterFactory; public class EdgeNGramTokenFilterFactory extends AbstractTokenFilterFactory { + private static final DeprecationLogger DEPRECATION_LOGGER + = new DeprecationLogger(LogManager.getLogger(EdgeNGramTokenFilterFactory.class)); + private final int minGram; private final int maxGram; @@ -77,4 +84,16 @@ public class EdgeNGramTokenFilterFactory extends AbstractTokenFilterFactory { public boolean breaksFastVectorHighlighter() { return true; } + + @Override + public TokenFilterFactory getSynonymFilter() { + if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0)) { + throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms"); + } + else { + DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name() + + "] will not be usable to parse synonyms after v7.0"); + return this; + } + } } diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/FingerprintTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/FingerprintTokenFilterFactory.java index f41fb1207c6..433fa4d7dee 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/FingerprintTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/FingerprintTokenFilterFactory.java @@ -19,18 +19,25 @@ package org.elasticsearch.analysis.common; +import org.apache.logging.log4j.LogManager; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.miscellaneous.FingerprintFilter; +import org.elasticsearch.Version; +import org.elasticsearch.common.logging.DeprecationLogger; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; +import org.elasticsearch.index.analysis.TokenFilterFactory; import static 
org.elasticsearch.analysis.common.FingerprintAnalyzerProvider.DEFAULT_MAX_OUTPUT_SIZE; import static org.elasticsearch.analysis.common.FingerprintAnalyzerProvider.MAX_OUTPUT_SIZE; public class FingerprintTokenFilterFactory extends AbstractTokenFilterFactory { + private static final DeprecationLogger DEPRECATION_LOGGER + = new DeprecationLogger(LogManager.getLogger(FingerprintTokenFilterFactory.class)); + private final char separator; private final int maxOutputSize; @@ -47,4 +54,16 @@ public class FingerprintTokenFilterFactory extends AbstractTokenFilterFactory { return result; } + @Override + public TokenFilterFactory getSynonymFilter() { + if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0)) { + throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms"); + } + else { + DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name() + + "] will not be usable to parse synonyms after v7.0"); + return this; + } + } + } diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/MultiplexerTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/MultiplexerTokenFilterFactory.java index c3e0d5133c3..9c53fc1f63e 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/MultiplexerTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/MultiplexerTokenFilterFactory.java @@ -19,12 +19,15 @@ package org.elasticsearch.analysis.common; +import org.apache.logging.log4j.LogManager; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter; import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.elasticsearch.Version; import org.elasticsearch.common.Strings; +import org.elasticsearch.common.logging.DeprecationLogger; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; @@ -40,6 +43,9 @@ import java.util.function.Function; public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory { + private static final DeprecationLogger DEPRECATION_LOGGER + = new DeprecationLogger(LogManager.getLogger(MultiplexerTokenFilterFactory.class)); + private List filterNames; private final boolean preserveOriginal; @@ -54,6 +60,22 @@ public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory { throw new UnsupportedOperationException("TokenFilterFactory.getChainAwareTokenFilterFactory() must be called first"); } + @Override + public TokenFilterFactory getSynonymFilter() { + if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0)) { + throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms"); + } + else { + if (preserveOriginal) { + DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name() + + "] will not be usable to parse synonyms after v7.0"); + return IDENTITY_FILTER; + } + throw new IllegalArgumentException("Token filter [" + name() + + "] cannot be used to parse synonyms unless [preserve_original] is [true]"); + } + } + @Override public TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer, List charFilters, List previousTokenFilters, @@ -98,7 +120,18 
@@ public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory { @Override public TokenFilterFactory getSynonymFilter() { - return IDENTITY_FILTER; + if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0)) { + throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms"); + } + else { + if (preserveOriginal) { + DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name() + + "] will not be usable to parse synonyms after v7.0"); + return IDENTITY_FILTER; + } + throw new IllegalArgumentException("Token filter [" + name() + + "] cannot be used to parse synonyms unless [preserve_original] is [true]"); + } } }; } diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/NGramTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/NGramTokenFilterFactory.java index 72649239a30..6abf2cbd37c 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/NGramTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/NGramTokenFilterFactory.java @@ -19,23 +19,27 @@ package org.elasticsearch.analysis.common; +import org.apache.logging.log4j.LogManager; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.ngram.NGramTokenFilter; +import org.elasticsearch.common.logging.DeprecationLogger; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; import org.elasticsearch.Version; - +import org.elasticsearch.index.analysis.TokenFilterFactory; public class NGramTokenFilterFactory extends AbstractTokenFilterFactory { + private static final DeprecationLogger DEPRECATION_LOGGER + = new DeprecationLogger(LogManager.getLogger(NGramTokenFilterFactory.class)); + private final int minGram; private final int maxGram; - NGramTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { super(indexSettings, name, settings); int maxAllowedNgramDiff = indexSettings.getMaxNgramDiff(); @@ -60,4 +64,16 @@ public class NGramTokenFilterFactory extends AbstractTokenFilterFactory { // TODO: Expose preserveOriginal return new NGramTokenFilter(tokenStream, minGram, maxGram, false); } + + @Override + public TokenFilterFactory getSynonymFilter() { + if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0)) { + throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms"); + } + else { + DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name() + + "] will not be usable to parse synonyms after v7.0"); + return this; + } + } } diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SynonymGraphTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SynonymGraphTokenFilterFactory.java index cccfdc8d7b7..e4fd18bcba6 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SynonymGraphTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SynonymGraphTokenFilterFactory.java @@ -49,7 +49,7 @@ public class SynonymGraphTokenFilterFactory extends SynonymTokenFilterFactory { public TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer, 
List charFilters, List previousTokenFilters, Function allFilters) { - final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters); + final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters, allFilters); final SynonymMap synonyms = buildSynonyms(analyzer, getRulesFromSettings(environment)); final String name = name(); return new TokenFilterFactory() { diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SynonymTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SynonymTokenFilterFactory.java index 99810432cd8..75d4eca4254 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SynonymTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SynonymTokenFilterFactory.java @@ -19,10 +19,12 @@ package org.elasticsearch.analysis.common; +import org.apache.logging.log4j.LogManager; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.synonym.SynonymFilter; import org.apache.lucene.analysis.synonym.SynonymMap; +import org.elasticsearch.common.logging.DeprecationLogger; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; @@ -40,6 +42,9 @@ import java.util.function.Function; public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory { + private static final DeprecationLogger DEPRECATION_LOGGER + = new DeprecationLogger(LogManager.getLogger(SynonymTokenFilterFactory.class)); + private final String format; private final boolean expand; private final boolean lenient; @@ -52,7 +57,7 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory { this.settings = settings; if (settings.get("ignore_case") != null) { - deprecationLogger.deprecated( + DEPRECATION_LOGGER.deprecated( "The ignore_case option on the synonym_graph filter is deprecated. " + "Instead, insert a lowercase filter in the filter chain before the synonym_graph filter."); } @@ -72,7 +77,7 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory { public TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer, List charFilters, List previousTokenFilters, Function allFilters) { - final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters); + final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters, allFilters); final SynonymMap synonyms = buildSynonyms(analyzer, getRulesFromSettings(environment)); final String name = name(); return new TokenFilterFactory() { @@ -85,11 +90,19 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory { public TokenStream create(TokenStream tokenStream) { return synonyms.fst == null ? 
tokenStream : new SynonymFilter(tokenStream, synonyms, false); } + + @Override + public TokenFilterFactory getSynonymFilter() { + // In order to allow chained synonym filters, we return IDENTITY here to + // ensure that synonyms don't get applied to the synonym map itself, + // which doesn't support stacked input tokens + return IDENTITY_FILTER; + } }; } Analyzer buildSynonymAnalyzer(TokenizerFactory tokenizer, List charFilters, - List tokenFilters) { + List tokenFilters, Function allFilters) { return new CustomAnalyzer("synonyms", tokenizer, charFilters.toArray(new CharFilterFactory[0]), tokenFilters.stream() .map(TokenFilterFactory::getSynonymFilter) diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactory.java index 6173cfdc84a..6948eaf01e4 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactory.java @@ -19,15 +19,19 @@ package org.elasticsearch.analysis.common; +import org.apache.logging.log4j.LogManager; import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter; import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator; +import org.elasticsearch.Version; +import org.elasticsearch.common.logging.DeprecationLogger; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; import org.elasticsearch.index.analysis.Analysis; +import org.elasticsearch.index.analysis.TokenFilterFactory; import java.util.List; import java.util.Set; @@ -45,6 +49,9 @@ import static org.elasticsearch.analysis.common.WordDelimiterTokenFilterFactory. 
public class WordDelimiterGraphTokenFilterFactory extends AbstractTokenFilterFactory { + private static final DeprecationLogger DEPRECATION_LOGGER = + new DeprecationLogger(LogManager.getLogger(WordDelimiterGraphTokenFilterFactory.class)); + private final byte[] charTypeTable; private final int flags; private final CharArraySet protoWords; @@ -95,6 +102,18 @@ public class WordDelimiterGraphTokenFilterFactory extends AbstractTokenFilterFac return new WordDelimiterGraphFilter(tokenStream, charTypeTable, flags, protoWords); } + @Override + public TokenFilterFactory getSynonymFilter() { + if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0)) { + throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms"); + } + else { + DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name() + + "] will not be usable to parse synonyms after v7.0"); + return this; + } + } + private int getFlag(int flag, Settings settings, String key, boolean defaultValue) { if (settings.getAsBoolean(key, defaultValue)) { return flag; diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WordDelimiterTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WordDelimiterTokenFilterFactory.java index 93677d0898f..85f94a86c6c 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WordDelimiterTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WordDelimiterTokenFilterFactory.java @@ -19,15 +19,19 @@ package org.elasticsearch.analysis.common; +import org.apache.logging.log4j.LogManager; import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter; import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator; +import org.elasticsearch.Version; +import org.elasticsearch.common.logging.DeprecationLogger; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; import org.elasticsearch.index.analysis.Analysis; +import org.elasticsearch.index.analysis.TokenFilterFactory; import java.util.Collection; import java.util.List; @@ -50,6 +54,9 @@ import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.STEM_ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory { + private static final DeprecationLogger DEPRECATION_LOGGER = + new DeprecationLogger(LogManager.getLogger(WordDelimiterTokenFilterFactory.class)); + private final byte[] charTypeTable; private final int flags; private final CharArraySet protoWords; @@ -103,6 +110,18 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory protoWords); } + @Override + public TokenFilterFactory getSynonymFilter() { + if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0)) { + throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms"); + } + else { + DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name() + + "] will not be usable to parse synonyms after v7.0"); + return this; + } + } + public int getFlag(int flag, Settings settings, String key, boolean defaultValue) { if (settings.getAsBoolean(key, defaultValue)) { return flag; diff --git 
a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/SynonymsAnalysisTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/SynonymsAnalysisTests.java index 942b4876077..a63dd975688 100644 --- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/SynonymsAnalysisTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/SynonymsAnalysisTests.java @@ -22,6 +22,7 @@ package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.core.KeywordTokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.elasticsearch.Version; import org.elasticsearch.cluster.metadata.IndexMetaData; @@ -29,14 +30,20 @@ import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.analysis.IndexAnalyzers; +import org.elasticsearch.index.analysis.TokenFilterFactory; +import org.elasticsearch.index.analysis.TokenizerFactory; import org.elasticsearch.test.ESTestCase; import org.elasticsearch.test.IndexSettingsModule; +import org.elasticsearch.test.VersionUtils; import org.hamcrest.MatcherAssert; import java.io.IOException; import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Path; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.instanceOf; @@ -118,7 +125,7 @@ public class SynonymsAnalysisTests extends ESTestCase { } } - public void testSynonymsWithMultiplexer() throws IOException { + public void testSynonymsWrappedByMultiplexer() throws IOException { Settings settings = Settings.builder() .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) .put("path.home", createTempDir().toString()) @@ -139,6 +146,180 @@ public class SynonymsAnalysisTests extends ESTestCase { new int[]{ 1, 1, 0, 0, 1, 1 }); } + public void testAsciiFoldingFilterForSynonyms() throws IOException { + Settings settings = Settings.builder() + .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) + .put("path.home", createTempDir().toString()) + .put("index.analysis.filter.synonyms.type", "synonym") + .putList("index.analysis.filter.synonyms.synonyms", "hoj, height") + .put("index.analysis.analyzer.synonymAnalyzer.tokenizer", "standard") + .putList("index.analysis.analyzer.synonymAnalyzer.filter", "lowercase", "asciifolding", "synonyms") + .build(); + IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings); + indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers; + + BaseTokenStreamTestCase.assertAnalyzesTo(indexAnalyzers.get("synonymAnalyzer"), "høj", + new String[]{ "hoj", "height" }, + new int[]{ 1, 0 }); + } + + public void testKeywordRepeatAndSynonyms() throws IOException { + Settings settings = Settings.builder() + .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) + .put("path.home", createTempDir().toString()) + .put("index.analysis.filter.synonyms.type", "synonym") + .putList("index.analysis.filter.synonyms.synonyms", "programmer, developer") + .put("index.analysis.filter.my_english.type", "stemmer") + .put("index.analysis.filter.my_english.language", "porter2") + 
.put("index.analysis.analyzer.synonymAnalyzer.tokenizer", "standard") + .putList("index.analysis.analyzer.synonymAnalyzer.filter", "lowercase", "keyword_repeat", "my_english", "synonyms") + .build(); + IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings); + indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers; + + BaseTokenStreamTestCase.assertAnalyzesTo(indexAnalyzers.get("synonymAnalyzer"), "programmers", + new String[]{ "programmers", "programm", "develop" }, + new int[]{ 1, 0, 0 }); + } + + public void testChainedSynonymFilters() throws IOException { + Settings settings = Settings.builder() + .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) + .put("path.home", createTempDir().toString()) + .put("index.analysis.filter.synonyms1.type", "synonym") + .putList("index.analysis.filter.synonyms1.synonyms", "term1, term2") + .put("index.analysis.filter.synonyms2.type", "synonym") + .putList("index.analysis.filter.synonyms2.synonyms", "term1, term3") + .put("index.analysis.analyzer.syn.tokenizer", "standard") + .putList("index.analysis.analyzer.syn.filter", "lowercase", "synonyms1", "synonyms2") + .build(); + IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings); + indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers; + + BaseTokenStreamTestCase.assertAnalyzesTo(indexAnalyzers.get("syn"), "term1", + new String[]{ "term1", "term3", "term2" }, new int[]{ 1, 0, 0 }); + } + + public void testShingleFilters() { + + Settings settings = Settings.builder() + .put(IndexMetaData.SETTING_VERSION_CREATED, + VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, Version.CURRENT)) + .put("path.home", createTempDir().toString()) + .put("index.analysis.filter.synonyms.type", "synonym") + .putList("index.analysis.filter.synonyms.synonyms", "programmer, developer") + .put("index.analysis.filter.my_shingle.type", "shingle") + .put("index.analysis.analyzer.my_analyzer.tokenizer", "standard") + .putList("index.analysis.analyzer.my_analyzer.filter", "my_shingle", "synonyms") + .build(); + IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings); + + expectThrows(IllegalArgumentException.class, () -> { + indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers; + }); + + } + + public void testTokenFiltersBypassSynonymAnalysis() throws IOException { + + Settings settings = Settings.builder() + .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) + .put("path.home", createTempDir().toString()) + .putList("word_list", "a") + .put("hyphenation_patterns_path", "foo") + .build(); + IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings); + + String[] bypassingFactories = new String[]{ + "dictionary_decompounder" + }; + + CommonAnalysisPlugin plugin = new CommonAnalysisPlugin(); + for (String factory : bypassingFactories) { + TokenFilterFactory tff = plugin.getTokenFilters().get(factory).get(idxSettings, null, factory, settings); + TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, null, "keyword", settings); + SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, null, "synonym", settings); + Analyzer analyzer = stff.buildSynonymAnalyzer(tok, Collections.emptyList(), Collections.singletonList(tff), null); + + try (TokenStream ts = analyzer.tokenStream("field", "text")) { + assertThat(ts, 
instanceOf(KeywordTokenizer.class)); + } + } + + } + + public void testDisallowedTokenFilters() throws IOException { + + Settings settings = Settings.builder() + .put(IndexMetaData.SETTING_VERSION_CREATED, + VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, Version.CURRENT)) + .put("path.home", createTempDir().toString()) + .putList("common_words", "a", "b") + .put("output_unigrams", "true") + .build(); + IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings); + CommonAnalysisPlugin plugin = new CommonAnalysisPlugin(); + + String[] disallowedFactories = new String[]{ + "multiplexer", "cjk_bigram", "common_grams", "ngram", "edge_ngram", + "word_delimiter", "word_delimiter_graph", "fingerprint" + }; + + for (String factory : disallowedFactories) { + TokenFilterFactory tff = plugin.getTokenFilters().get(factory).get(idxSettings, null, factory, settings); + TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, null, "keyword", settings); + SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, null, "synonym", settings); + + IllegalArgumentException e = expectThrows(IllegalArgumentException.class, + "Expected IllegalArgumentException for factory " + factory, + () -> stff.buildSynonymAnalyzer(tok, Collections.emptyList(), Collections.singletonList(tff), null)); + + assertEquals(factory, "Token filter [" + factory + + "] cannot be used to parse synonyms", + e.getMessage()); + } + + settings = Settings.builder() + .put(IndexMetaData.SETTING_VERSION_CREATED, + VersionUtils.randomVersionBetween(random(), Version.V_6_0_0, VersionUtils.getPreviousVersion(Version.V_7_0_0))) + .put("path.home", createTempDir().toString()) + .putList("common_words", "a", "b") + .put("output_unigrams", "true") + .build(); + idxSettings = IndexSettingsModule.newIndexSettings("index", settings); + + List expectedWarnings = new ArrayList<>(); + for (String factory : disallowedFactories) { + TokenFilterFactory tff = plugin.getTokenFilters().get(factory).get(idxSettings, null, factory, settings); + TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, null, "keyword", settings); + SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, null, "synonym", settings); + + stff.buildSynonymAnalyzer(tok, Collections.emptyList(), Collections.singletonList(tff), null); + expectedWarnings.add("Token filter [" + factory + + "] will not be usable to parse synonyms after v7.0"); + } + + assertWarnings(expectedWarnings.toArray(new String[0])); + + settings = Settings.builder() + .put(IndexMetaData.SETTING_VERSION_CREATED, + VersionUtils.randomVersionBetween(random(), Version.V_6_0_0, VersionUtils.getPreviousVersion(Version.V_7_0_0))) + .put("path.home", createTempDir().toString()) + .put("preserve_original", "false") + .build(); + idxSettings = IndexSettingsModule.newIndexSettings("index", settings); + TokenFilterFactory tff = plugin.getTokenFilters().get("multiplexer").get(idxSettings, null, "multiplexer", settings); + TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, null, "keyword", settings); + SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, null, "synonym", settings); + + IllegalArgumentException e = expectThrows(IllegalArgumentException.class, + () -> stff.buildSynonymAnalyzer(tok, Collections.emptyList(), Collections.singletonList(tff), null)); + + assertEquals("Token filter [multiplexer] cannot be used to parse synonyms unless [preserve_original] is [true]", + e.getMessage()); + + } + 
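As a rough illustration of the user-facing configuration these tests guard against, the settings sketch below (inside an `ESTestCase` like the one above; all index, filter and analyzer names are invented) puts an `ngram` filter ahead of a synonym filter. On an index created on or after 7.0, building the analyzers now fails with `Token filter [my_ngram] cannot be used to parse synonyms`; on a 6.x index the same chain only emits the deprecation warning.

// Hypothetical settings: an ngram filter ahead of a synonym filter in the same chain.
Settings settings = Settings.builder()
    .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
    .put("path.home", createTempDir().toString())
    .put("index.analysis.filter.my_ngram.type", "ngram")
    .put("index.analysis.filter.my_synonyms.type", "synonym")
    .putList("index.analysis.filter.my_synonyms.synonyms", "programmer, developer")
    .put("index.analysis.analyzer.broken_syn.tokenizer", "standard")
    .putList("index.analysis.analyzer.broken_syn.filter", "lowercase", "my_ngram", "my_synonyms")
    .build();
// Building the IndexAnalyzers from these settings via createTestAnalysis, as in
// testShingleFilters above, is what triggers the new check.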
private void match(String analyzerName, String source, String target) throws IOException { Analyzer analyzer = indexAnalyzers.get(analyzerName).analyzer(); diff --git a/plugins/analysis-phonetic/src/main/java/org/elasticsearch/index/analysis/PhoneticTokenFilterFactory.java b/plugins/analysis-phonetic/src/main/java/org/elasticsearch/index/analysis/PhoneticTokenFilterFactory.java index d7b619cc311..1022b826475 100644 --- a/plugins/analysis-phonetic/src/main/java/org/elasticsearch/index/analysis/PhoneticTokenFilterFactory.java +++ b/plugins/analysis-phonetic/src/main/java/org/elasticsearch/index/analysis/PhoneticTokenFilterFactory.java @@ -30,11 +30,14 @@ import org.apache.commons.codec.language.bm.Languages.LanguageSet; import org.apache.commons.codec.language.bm.NameType; import org.apache.commons.codec.language.bm.PhoneticEngine; import org.apache.commons.codec.language.bm.RuleType; +import org.apache.logging.log4j.LogManager; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.phonetic.BeiderMorseFilter; import org.apache.lucene.analysis.phonetic.DaitchMokotoffSoundexFilter; import org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilter; import org.apache.lucene.analysis.phonetic.PhoneticFilter; +import org.elasticsearch.Version; +import org.elasticsearch.common.logging.DeprecationLogger; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; @@ -47,6 +50,10 @@ import java.util.List; public class PhoneticTokenFilterFactory extends AbstractTokenFilterFactory { + + private static final DeprecationLogger DEPRECATION_LOGGER + = new DeprecationLogger(LogManager.getLogger(PhoneticTokenFilterFactory.class)); + private final Encoder encoder; private final boolean replace; private int maxcodelength; @@ -138,4 +145,16 @@ public class PhoneticTokenFilterFactory extends AbstractTokenFilterFactory { } throw new IllegalArgumentException("encoder error"); } + + @Override + public TokenFilterFactory getSynonymFilter() { + if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0)) { + throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms"); + } + else { + DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name() + + "] will not be usable to parse synonyms after v7.0"); + return this; + } + } } diff --git a/plugins/analysis-phonetic/src/test/java/org/elasticsearch/index/analysis/AnalysisPhoneticFactoryTests.java b/plugins/analysis-phonetic/src/test/java/org/elasticsearch/index/analysis/AnalysisPhoneticFactoryTests.java index 8c551aee919..2092d63fd23 100644 --- a/plugins/analysis-phonetic/src/test/java/org/elasticsearch/index/analysis/AnalysisPhoneticFactoryTests.java +++ b/plugins/analysis-phonetic/src/test/java/org/elasticsearch/index/analysis/AnalysisPhoneticFactoryTests.java @@ -19,9 +19,16 @@ package org.elasticsearch.index.analysis; +import org.elasticsearch.Version; +import org.elasticsearch.cluster.metadata.IndexMetaData; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.IndexSettings; import org.elasticsearch.indices.analysis.AnalysisFactoryTestCase; import org.elasticsearch.plugin.analysis.AnalysisPhoneticPlugin; +import org.elasticsearch.test.IndexSettingsModule; +import org.elasticsearch.test.VersionUtils; +import java.io.IOException; import java.util.HashMap; import java.util.Map; @@ -38,4 +45,33 @@ public class AnalysisPhoneticFactoryTests extends 
AnalysisFactoryTestCase { filters.put("phonetic", PhoneticTokenFilterFactory.class); return filters; } + + public void testDisallowedWithSynonyms() throws IOException { + + AnalysisPhoneticPlugin plugin = new AnalysisPhoneticPlugin(); + + Settings settings = Settings.builder() + .put(IndexMetaData.SETTING_VERSION_CREATED, VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, Version.CURRENT)) + .put("path.home", createTempDir().toString()) + .build(); + IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings); + + TokenFilterFactory tff + = plugin.getTokenFilters().get("phonetic").get(idxSettings, null, "phonetic", settings); + IllegalArgumentException e = expectThrows(IllegalArgumentException.class, tff::getSynonymFilter); + assertEquals("Token filter [phonetic] cannot be used to parse synonyms", e.getMessage()); + + settings = Settings.builder() + .put(IndexMetaData.SETTING_VERSION_CREATED, VersionUtils.randomVersionBetween(random(), + Version.V_6_0_0, VersionUtils.getPreviousVersion(Version.V_7_0_0))) + .put("path.home", createTempDir().toString()) + .build(); + idxSettings = IndexSettingsModule.newIndexSettings("index", settings); + + tff = plugin.getTokenFilters().get("phonetic").get(idxSettings, null, "phonetic", settings); + tff.getSynonymFilter(); + + assertWarnings("Token filter [phonetic] will not be usable to parse synonyms after v7.0"); + } + } diff --git a/server/src/main/java/org/elasticsearch/index/analysis/PreConfiguredTokenFilter.java b/server/src/main/java/org/elasticsearch/index/analysis/PreConfiguredTokenFilter.java index bd70d929555..123802c9510 100644 --- a/server/src/main/java/org/elasticsearch/index/analysis/PreConfiguredTokenFilter.java +++ b/server/src/main/java/org/elasticsearch/index/analysis/PreConfiguredTokenFilter.java @@ -37,16 +37,26 @@ public final class PreConfiguredTokenFilter extends PreConfiguredAnalysisCompone */ public static PreConfiguredTokenFilter singleton(String name, boolean useFilterForMultitermQueries, Function create) { - return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, CachingStrategy.ONE, + return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, false, CachingStrategy.ONE, (tokenStream, version) -> create.apply(tokenStream)); } + /** + * Create a pre-configured token filter that may not vary at all. + */ + public static PreConfiguredTokenFilter singleton(String name, boolean useFilterForMultitermQueries, + boolean useFilterForParsingSynonyms, + Function create) { + return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, useFilterForParsingSynonyms, CachingStrategy.ONE, + (tokenStream, version) -> create.apply(tokenStream)); + } + /** * Create a pre-configured token filter that may not vary at all. 
*/ public static PreConfiguredTokenFilter singletonWithVersion(String name, boolean useFilterForMultitermQueries, BiFunction create) { - return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, CachingStrategy.ONE, + return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, false, CachingStrategy.ONE, (tokenStream, version) -> create.apply(tokenStream, version)); } @@ -55,7 +65,7 @@ public final class PreConfiguredTokenFilter extends PreConfiguredAnalysisCompone */ public static PreConfiguredTokenFilter luceneVersion(String name, boolean useFilterForMultitermQueries, BiFunction create) { - return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, CachingStrategy.LUCENE, + return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, false, CachingStrategy.LUCENE, (tokenStream, version) -> create.apply(tokenStream, version.luceneVersion)); } @@ -64,16 +74,18 @@ public final class PreConfiguredTokenFilter extends PreConfiguredAnalysisCompone */ public static PreConfiguredTokenFilter elasticsearchVersion(String name, boolean useFilterForMultitermQueries, BiFunction create) { - return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, CachingStrategy.ELASTICSEARCH, create); + return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, false, CachingStrategy.ELASTICSEARCH, create); } private final boolean useFilterForMultitermQueries; + private final boolean useFilterForParsingSynonyms; private final BiFunction create; - private PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries, + private PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries, boolean useFilterForParsingSynonyms, PreBuiltCacheFactory.CachingStrategy cache, BiFunction create) { super(name, cache); this.useFilterForMultitermQueries = useFilterForMultitermQueries; + this.useFilterForParsingSynonyms = useFilterForParsingSynonyms; this.create = create; } @@ -104,6 +116,13 @@ public final class PreConfiguredTokenFilter extends PreConfiguredAnalysisCompone return create.apply(tokenStream, version); } + @Override + public TokenFilterFactory getSynonymFilter() { + if (useFilterForParsingSynonyms) { + return this; + } + return IDENTITY_FILTER; + } }; } return new TokenFilterFactory() { @@ -116,6 +135,14 @@ public final class PreConfiguredTokenFilter extends PreConfiguredAnalysisCompone public TokenStream create(TokenStream tokenStream) { return create.apply(tokenStream, version); } + + @Override + public TokenFilterFactory getSynonymFilter() { + if (useFilterForParsingSynonyms) { + return this; + } + return IDENTITY_FILTER; + } }; } } diff --git a/server/src/main/java/org/elasticsearch/index/analysis/ShingleTokenFilterFactory.java b/server/src/main/java/org/elasticsearch/index/analysis/ShingleTokenFilterFactory.java index 6ab9a6d51cf..37a163eac97 100644 --- a/server/src/main/java/org/elasticsearch/index/analysis/ShingleTokenFilterFactory.java +++ b/server/src/main/java/org/elasticsearch/index/analysis/ShingleTokenFilterFactory.java @@ -19,16 +19,21 @@ package org.elasticsearch.index.analysis; +import org.apache.logging.log4j.LogManager; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute; import org.apache.lucene.analysis.shingle.ShingleFilter; import org.elasticsearch.Version; +import org.elasticsearch.common.logging.DeprecationLogger; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import 
org.elasticsearch.index.IndexSettings;
 
 public class ShingleTokenFilterFactory extends AbstractTokenFilterFactory {
 
+    private static final DeprecationLogger DEPRECATION_LOGGER =
+        new DeprecationLogger(LogManager.getLogger(ShingleTokenFilterFactory.class));
+
     private final Factory factory;
 
     public ShingleTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
@@ -54,8 +59,8 @@ public class ShingleTokenFilterFactory extends AbstractTokenFilterFactory {
         Boolean outputUnigramsIfNoShingles = settings.getAsBoolean("output_unigrams_if_no_shingles", false);
         String tokenSeparator = settings.get("token_separator", ShingleFilter.DEFAULT_TOKEN_SEPARATOR);
         String fillerToken = settings.get("filler_token", ShingleFilter.DEFAULT_FILLER_TOKEN);
-        factory = new Factory("shingle", minShingleSize, maxShingleSize, outputUnigrams, outputUnigramsIfNoShingles, tokenSeparator, fillerToken);
+        factory = new Factory("shingle", minShingleSize, maxShingleSize,
+            outputUnigrams, outputUnigramsIfNoShingles, tokenSeparator, fillerToken);
     }
 
@@ -64,6 +69,19 @@ public class ShingleTokenFilterFactory extends AbstractTokenFilterFactory {
         return factory.create(tokenStream);
     }
 
+    @Override
+    public TokenFilterFactory getSynonymFilter() {
+        if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0)) {
+            throw new IllegalArgumentException("Token filter [" + name()
+                + "] cannot be used to parse synonyms");
+        }
+        else {
+            DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name()
+                + "] will not be usable to parse synonyms after v7.0");
+        }
+        return this;
+
+    }
 
     public Factory getInnerFactory() {
         return this.factory;
diff --git a/server/src/main/java/org/elasticsearch/index/analysis/TokenFilterFactory.java b/server/src/main/java/org/elasticsearch/index/analysis/TokenFilterFactory.java
index a400755c860..b7ed6fd9e9e 100644
--- a/server/src/main/java/org/elasticsearch/index/analysis/TokenFilterFactory.java
+++ b/server/src/main/java/org/elasticsearch/index/analysis/TokenFilterFactory.java
@@ -67,7 +67,8 @@ public interface TokenFilterFactory {
      * Return a version of this TokenFilterFactory appropriate for synonym parsing
      *
      * Filters that should not be applied to synonyms (for example, those that produce
-     * multiple tokens) can return {@link #IDENTITY_FILTER}
+     * multiple tokens) should throw an exception
+     *
      */
     default TokenFilterFactory getSynonymFilter() {
         return this;
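For completeness, the updated javadoc leaves a stacking filter two options: throw, as the graph-producing filters above now do, or step out of the chain entirely. Below is a hypothetical factory showing the second option, in the same spirit as the decompounder and `keyword_repeat` handling in this patch; the class and filter name are invented, while `KeywordRepeatFilter` and `IDENTITY_FILTER` are real.

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilter;
import org.elasticsearch.index.analysis.TokenFilterFactory;

// Hypothetical factory for a filter that stacks a duplicate token at each position.
public class StackingExampleFilterFactory implements TokenFilterFactory {

    @Override
    public String name() {
        return "stacking_example";
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        // emits every token twice at the same position during normal analysis
        return new KeywordRepeatFilter(tokenStream);
    }

    @Override
    public TokenFilterFactory getSynonymFilter() {
        // stacked tokens would break SynonymMap parsing, so skip this filter entirely
        return IDENTITY_FILTER;
    }
}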