diff --git a/docs/reference/analysis/tokenfilters/synonym-graph-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/synonym-graph-tokenfilter.asciidoc index 8be5647e10f..2a555d7d044 100644 --- a/docs/reference/analysis/tokenfilters/synonym-graph-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/synonym-graph-tokenfilter.asciidoc @@ -175,3 +175,15 @@ PUT /test_index Using `synonyms_path` to define WordNet synonyms in a file is supported as well. + +=== Parsing synonym files + +Elasticsearch will use the token filters preceding the synonym filter +in a tokenizer chain to parse the entries in a synonym file. So, for example, if a +synonym filter is placed after a stemmer, then the stemmer will also be applied +to the synonym entries. Because entries in the synonym map cannot have stacked +positions, some token filters may cause issues here. Token filters that produce +multiple versions of a token may choose which version of the token to emit when +parsing synonyms, e.g. `asciifolding` will only produce the folded version of the +token. Others, e.g. `multiplexer`, `word_delimiter_graph` or `ngram` will throw an +error. diff --git a/docs/reference/analysis/tokenfilters/synonym-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/synonym-tokenfilter.asciidoc index 2d8fa93147a..d0659f3425d 100644 --- a/docs/reference/analysis/tokenfilters/synonym-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/synonym-tokenfilter.asciidoc @@ -163,3 +163,16 @@ PUT /test_index Using `synonyms_path` to define WordNet synonyms in a file is supported as well. + + +=== Parsing synonym files + +Elasticsearch will use the token filters preceding the synonym filter +in a tokenizer chain to parse the entries in a synonym file. So, for example, if a +synonym filter is placed after a stemmer, then the stemmer will also be applied +to the synonym entries. Because entries in the synonym map cannot have stacked +positions, some token filters may cause issues here. Token filters that produce +multiple versions of a token may choose which version of the token to emit when +parsing synonyms, e.g. `asciifolding` will only produce the folded version of the +token. Others, e.g. `multiplexer`, `word_delimiter_graph` or `ngram` will throw an +error. \ No newline at end of file diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ASCIIFoldingTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ASCIIFoldingTokenFilterFactory.java index 83e71d5d858..24fed9ae5ab 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ASCIIFoldingTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ASCIIFoldingTokenFilterFactory.java @@ -27,6 +27,7 @@ import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; import org.elasticsearch.index.analysis.NormalizingTokenFilterFactory; +import org.elasticsearch.index.analysis.TokenFilterFactory; /** * Factory for ASCIIFoldingFilter. 
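The behaviour described in the new "Parsing synonym files" sections can be pictured with a minimal, self-contained Lucene sketch: the analyzer handed to the synonym rule parser contains the filters that precede the synonym filter, so an `asciifolding` step (with original-token preservation switched off) folds `høj` to `hoj` while the rule `høj, height` is read. This is only an illustration under that assumption — the class name, rule text, and use of `SolrSynonymParser` here are illustrative and not part of the patch itself.

[source,java]
----
import java.io.IOException;
import java.io.StringReader;
import java.text.ParseException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.synonym.SolrSynonymParser;
import org.apache.lucene.analysis.synonym.SynonymMap;

public class SynonymRuleParsingSketch {

    // Analyzer used only to parse synonym rules; it mirrors the filters that
    // precede the synonym filter in the analysis chain (lowercase + asciifolding).
    static Analyzer ruleParsingAnalyzer() {
        return new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer source = new StandardTokenizer();
                TokenStream sink = new LowerCaseFilter(source);
                sink = new ASCIIFoldingFilter(sink, false); // emit only the folded token
                return new TokenStreamComponents(source, sink);
            }
        };
    }

    public static void main(String[] args) throws IOException, ParseException {
        SolrSynonymParser parser = new SolrSynonymParser(true, true, ruleParsingAnalyzer());
        // "høj" is folded to "hoj" while the rule is parsed, so the resulting map
        // matches the folded tokens that the index-time chain actually produces.
        parser.parse(new StringReader("høj, height"));
        SynonymMap map = parser.build();
        System.out.println("parsed synonym terms: " + map.words.size());
    }
}
----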
@@ -51,8 +52,28 @@ public class ASCIIFoldingTokenFilterFactory extends AbstractTokenFilterFactory } @Override + public TokenFilterFactory getSynonymFilter() { + if (preserveOriginal == false) { + return this; + } else { + // See https://issues.apache.org/jira/browse/LUCENE-7536 for the reasoning + return new TokenFilterFactory() { + @Override + public String name() { + return ASCIIFoldingTokenFilterFactory.this.name(); + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new ASCIIFoldingFilter(tokenStream, false); + } + }; + } + } + public TokenStream normalize(TokenStream tokenStream) { // Normalization should only emit a single token, so always turn off preserveOriginal return new ASCIIFoldingFilter(tokenStream, false); } + } diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/AbstractCompoundWordTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/AbstractCompoundWordTokenFilterFactory.java index 92d32c57150..f061ff4dc22 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/AbstractCompoundWordTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/AbstractCompoundWordTokenFilterFactory.java @@ -26,6 +26,7 @@ import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; import org.elasticsearch.index.analysis.Analysis; +import org.elasticsearch.index.analysis.TokenFilterFactory; /** * Contains the common configuration settings between subclasses of this class. @@ -50,4 +51,9 @@ public abstract class AbstractCompoundWordTokenFilterFactory extends AbstractTok throw new IllegalArgumentException("word_list must be provided for [" + name + "], either as a path to a file, or directly"); } } + + @Override + public TokenFilterFactory getSynonymFilter() { + return IDENTITY_FILTER; // don't decompound synonym file + } } diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CJKBigramFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CJKBigramFilterFactory.java index be1f2495f0b..a794c409e4e 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CJKBigramFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CJKBigramFilterFactory.java @@ -19,13 +19,17 @@ package org.elasticsearch.analysis.common; +import org.apache.logging.log4j.LogManager; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.cjk.CJKBigramFilter; import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute; +import org.elasticsearch.Version; +import org.elasticsearch.common.logging.DeprecationLogger; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; +import org.elasticsearch.index.analysis.TokenFilterFactory; import java.util.Arrays; import java.util.HashSet; @@ -48,6 +52,9 @@ import java.util.Set; */ public final class CJKBigramFilterFactory extends AbstractTokenFilterFactory { + private static final DeprecationLogger DEPRECATION_LOGGER + = new DeprecationLogger(LogManager.getLogger(CJKBigramFilterFactory.class)); + private final int flags; private final boolean outputUnigrams; @@ -89,4 +96,18 @@ public final class 
CJKBigramFilterFactory extends AbstractTokenFilterFactory { return filter; } + @Override + public TokenFilterFactory getSynonymFilter() { + if (outputUnigrams) { + if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0)) { + throw new IllegalArgumentException("Token filter [" + name() + + "] cannot be used to parse synonyms"); + } + else { + DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name() + + "] will not be usable to parse synonyms after v7.0"); + } + } + return this; + } } diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java index a394efdfeb6..ad0e59c2767 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java @@ -426,7 +426,7 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin, Scri filters.add(PreConfiguredTokenFilter.singleton("german_stem", false, GermanStemFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("hindi_normalization", true, HindiNormalizationFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("indic_normalization", true, IndicNormalizationFilter::new)); - filters.add(PreConfiguredTokenFilter.singleton("keyword_repeat", false, KeywordRepeatFilter::new)); + filters.add(PreConfiguredTokenFilter.singleton("keyword_repeat", false, false, KeywordRepeatFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("kstem", false, KStemFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("length", false, input -> new LengthFilter(input, 0, Integer.MAX_VALUE))); // TODO this one seems useless diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonGramsTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonGramsTokenFilterFactory.java index 8de6dcacb73..933a02f74e7 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonGramsTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonGramsTokenFilterFactory.java @@ -19,18 +19,25 @@ package org.elasticsearch.analysis.common; +import org.apache.logging.log4j.LogManager; import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.commongrams.CommonGramsFilter; import org.apache.lucene.analysis.commongrams.CommonGramsQueryFilter; +import org.elasticsearch.Version; +import org.elasticsearch.common.logging.DeprecationLogger; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; import org.elasticsearch.index.analysis.Analysis; +import org.elasticsearch.index.analysis.TokenFilterFactory; public class CommonGramsTokenFilterFactory extends AbstractTokenFilterFactory { + private static final DeprecationLogger DEPRECATION_LOGGER + = new DeprecationLogger(LogManager.getLogger(CommonGramsTokenFilterFactory.class)); + private final CharArraySet words; private final boolean ignoreCase; @@ -58,5 +65,17 @@ public class CommonGramsTokenFilterFactory extends AbstractTokenFilterFactory { return filter; } } + + @Override + public TokenFilterFactory 
getSynonymFilter() { + if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0)) { + throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms"); + } else { + DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name() + + "] will not be usable to parse synonyms after v7.0"); + } + + return this; + } } diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EdgeNGramTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EdgeNGramTokenFilterFactory.java index 6bcd2b737fe..128b3d1cf82 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EdgeNGramTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EdgeNGramTokenFilterFactory.java @@ -19,17 +19,24 @@ package org.elasticsearch.analysis.common; +import org.apache.logging.log4j.LogManager; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter; import org.apache.lucene.analysis.reverse.ReverseStringFilter; +import org.elasticsearch.Version; +import org.elasticsearch.common.logging.DeprecationLogger; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; +import org.elasticsearch.index.analysis.TokenFilterFactory; public class EdgeNGramTokenFilterFactory extends AbstractTokenFilterFactory { + private static final DeprecationLogger DEPRECATION_LOGGER + = new DeprecationLogger(LogManager.getLogger(EdgeNGramTokenFilterFactory.class)); + private final int minGram; private final int maxGram; @@ -77,4 +84,16 @@ public class EdgeNGramTokenFilterFactory extends AbstractTokenFilterFactory { public boolean breaksFastVectorHighlighter() { return true; } + + @Override + public TokenFilterFactory getSynonymFilter() { + if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0)) { + throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms"); + } + else { + DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name() + + "] will not be usable to parse synonyms after v7.0"); + return this; + } + } } diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/FingerprintTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/FingerprintTokenFilterFactory.java index f41fb1207c6..433fa4d7dee 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/FingerprintTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/FingerprintTokenFilterFactory.java @@ -19,18 +19,25 @@ package org.elasticsearch.analysis.common; +import org.apache.logging.log4j.LogManager; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.miscellaneous.FingerprintFilter; +import org.elasticsearch.Version; +import org.elasticsearch.common.logging.DeprecationLogger; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; +import org.elasticsearch.index.analysis.TokenFilterFactory; import static 
org.elasticsearch.analysis.common.FingerprintAnalyzerProvider.DEFAULT_MAX_OUTPUT_SIZE; import static org.elasticsearch.analysis.common.FingerprintAnalyzerProvider.MAX_OUTPUT_SIZE; public class FingerprintTokenFilterFactory extends AbstractTokenFilterFactory { + private static final DeprecationLogger DEPRECATION_LOGGER + = new DeprecationLogger(LogManager.getLogger(FingerprintTokenFilterFactory.class)); + private final char separator; private final int maxOutputSize; @@ -47,4 +54,16 @@ public class FingerprintTokenFilterFactory extends AbstractTokenFilterFactory { return result; } + @Override + public TokenFilterFactory getSynonymFilter() { + if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0)) { + throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms"); + } + else { + DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name() + + "] will not be usable to parse synonyms after v7.0"); + return this; + } + } + } diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/MultiplexerTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/MultiplexerTokenFilterFactory.java index c3e0d5133c3..9c53fc1f63e 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/MultiplexerTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/MultiplexerTokenFilterFactory.java @@ -19,12 +19,15 @@ package org.elasticsearch.analysis.common; +import org.apache.logging.log4j.LogManager; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter; import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.elasticsearch.Version; import org.elasticsearch.common.Strings; +import org.elasticsearch.common.logging.DeprecationLogger; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; @@ -40,6 +43,9 @@ import java.util.function.Function; public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory { + private static final DeprecationLogger DEPRECATION_LOGGER + = new DeprecationLogger(LogManager.getLogger(MultiplexerTokenFilterFactory.class)); + private List filterNames; private final boolean preserveOriginal; @@ -54,6 +60,22 @@ public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory { throw new UnsupportedOperationException("TokenFilterFactory.getChainAwareTokenFilterFactory() must be called first"); } + @Override + public TokenFilterFactory getSynonymFilter() { + if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0)) { + throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms"); + } + else { + if (preserveOriginal) { + DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name() + + "] will not be usable to parse synonyms after v7.0"); + return IDENTITY_FILTER; + } + throw new IllegalArgumentException("Token filter [" + name() + + "] cannot be used to parse synonyms unless [preserve_original] is [true]"); + } + } + @Override public TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer, List charFilters, List previousTokenFilters, @@ -98,7 +120,18 
@@ public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory { @Override public TokenFilterFactory getSynonymFilter() { - return IDENTITY_FILTER; + if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0)) { + throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms"); + } + else { + if (preserveOriginal) { + DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name() + + "] will not be usable to parse synonyms after v7.0"); + return IDENTITY_FILTER; + } + throw new IllegalArgumentException("Token filter [" + name() + + "] cannot be used to parse synonyms unless [preserve_original] is [true]"); + } } }; } diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/NGramTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/NGramTokenFilterFactory.java index 72649239a30..6abf2cbd37c 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/NGramTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/NGramTokenFilterFactory.java @@ -19,23 +19,27 @@ package org.elasticsearch.analysis.common; +import org.apache.logging.log4j.LogManager; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.ngram.NGramTokenFilter; +import org.elasticsearch.common.logging.DeprecationLogger; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; import org.elasticsearch.Version; - +import org.elasticsearch.index.analysis.TokenFilterFactory; public class NGramTokenFilterFactory extends AbstractTokenFilterFactory { + private static final DeprecationLogger DEPRECATION_LOGGER + = new DeprecationLogger(LogManager.getLogger(NGramTokenFilterFactory.class)); + private final int minGram; private final int maxGram; - NGramTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { super(indexSettings, name, settings); int maxAllowedNgramDiff = indexSettings.getMaxNgramDiff(); @@ -60,4 +64,16 @@ public class NGramTokenFilterFactory extends AbstractTokenFilterFactory { // TODO: Expose preserveOriginal return new NGramTokenFilter(tokenStream, minGram, maxGram, false); } + + @Override + public TokenFilterFactory getSynonymFilter() { + if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0)) { + throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms"); + } + else { + DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name() + + "] will not be usable to parse synonyms after v7.0"); + return this; + } + } } diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SynonymGraphTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SynonymGraphTokenFilterFactory.java index cccfdc8d7b7..e4fd18bcba6 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SynonymGraphTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SynonymGraphTokenFilterFactory.java @@ -49,7 +49,7 @@ public class SynonymGraphTokenFilterFactory extends SynonymTokenFilterFactory { public TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer, 
List charFilters, List previousTokenFilters, Function allFilters) { - final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters); + final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters, allFilters); final SynonymMap synonyms = buildSynonyms(analyzer, getRulesFromSettings(environment)); final String name = name(); return new TokenFilterFactory() { diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SynonymTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SynonymTokenFilterFactory.java index 99810432cd8..75d4eca4254 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SynonymTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SynonymTokenFilterFactory.java @@ -19,10 +19,12 @@ package org.elasticsearch.analysis.common; +import org.apache.logging.log4j.LogManager; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.synonym.SynonymFilter; import org.apache.lucene.analysis.synonym.SynonymMap; +import org.elasticsearch.common.logging.DeprecationLogger; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; @@ -40,6 +42,9 @@ import java.util.function.Function; public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory { + private static final DeprecationLogger DEPRECATION_LOGGER + = new DeprecationLogger(LogManager.getLogger(SynonymTokenFilterFactory.class)); + private final String format; private final boolean expand; private final boolean lenient; @@ -52,7 +57,7 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory { this.settings = settings; if (settings.get("ignore_case") != null) { - deprecationLogger.deprecated( + DEPRECATION_LOGGER.deprecated( "The ignore_case option on the synonym_graph filter is deprecated. " + "Instead, insert a lowercase filter in the filter chain before the synonym_graph filter."); } @@ -72,7 +77,7 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory { public TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer, List charFilters, List previousTokenFilters, Function allFilters) { - final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters); + final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters, allFilters); final SynonymMap synonyms = buildSynonyms(analyzer, getRulesFromSettings(environment)); final String name = name(); return new TokenFilterFactory() { @@ -85,11 +90,19 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory { public TokenStream create(TokenStream tokenStream) { return synonyms.fst == null ? 
tokenStream : new SynonymFilter(tokenStream, synonyms, false); } + + @Override + public TokenFilterFactory getSynonymFilter() { + // In order to allow chained synonym filters, we return IDENTITY here to + // ensure that synonyms don't get applied to the synonym map itself, + // which doesn't support stacked input tokens + return IDENTITY_FILTER; + } }; } Analyzer buildSynonymAnalyzer(TokenizerFactory tokenizer, List charFilters, - List tokenFilters) { + List tokenFilters, Function allFilters) { return new CustomAnalyzer("synonyms", tokenizer, charFilters.toArray(new CharFilterFactory[0]), tokenFilters.stream() .map(TokenFilterFactory::getSynonymFilter) diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactory.java index 6173cfdc84a..6948eaf01e4 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactory.java @@ -19,15 +19,19 @@ package org.elasticsearch.analysis.common; +import org.apache.logging.log4j.LogManager; import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter; import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator; +import org.elasticsearch.Version; +import org.elasticsearch.common.logging.DeprecationLogger; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; import org.elasticsearch.index.analysis.Analysis; +import org.elasticsearch.index.analysis.TokenFilterFactory; import java.util.List; import java.util.Set; @@ -45,6 +49,9 @@ import static org.elasticsearch.analysis.common.WordDelimiterTokenFilterFactory. 
public class WordDelimiterGraphTokenFilterFactory extends AbstractTokenFilterFactory { + private static final DeprecationLogger DEPRECATION_LOGGER = + new DeprecationLogger(LogManager.getLogger(WordDelimiterGraphTokenFilterFactory.class)); + private final byte[] charTypeTable; private final int flags; private final CharArraySet protoWords; @@ -95,6 +102,18 @@ public class WordDelimiterGraphTokenFilterFactory extends AbstractTokenFilterFac return new WordDelimiterGraphFilter(tokenStream, charTypeTable, flags, protoWords); } + @Override + public TokenFilterFactory getSynonymFilter() { + if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0)) { + throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms"); + } + else { + DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name() + + "] will not be usable to parse synonyms after v7.0"); + return this; + } + } + private int getFlag(int flag, Settings settings, String key, boolean defaultValue) { if (settings.getAsBoolean(key, defaultValue)) { return flag; diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WordDelimiterTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WordDelimiterTokenFilterFactory.java index 93677d0898f..85f94a86c6c 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WordDelimiterTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WordDelimiterTokenFilterFactory.java @@ -19,15 +19,19 @@ package org.elasticsearch.analysis.common; +import org.apache.logging.log4j.LogManager; import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter; import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator; +import org.elasticsearch.Version; +import org.elasticsearch.common.logging.DeprecationLogger; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; import org.elasticsearch.index.analysis.Analysis; +import org.elasticsearch.index.analysis.TokenFilterFactory; import java.util.Collection; import java.util.List; @@ -50,6 +54,9 @@ import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.STEM_ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory { + private static final DeprecationLogger DEPRECATION_LOGGER = + new DeprecationLogger(LogManager.getLogger(WordDelimiterTokenFilterFactory.class)); + private final byte[] charTypeTable; private final int flags; private final CharArraySet protoWords; @@ -103,6 +110,18 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory protoWords); } + @Override + public TokenFilterFactory getSynonymFilter() { + if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0)) { + throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms"); + } + else { + DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name() + + "] will not be usable to parse synonyms after v7.0"); + return this; + } + } + public int getFlag(int flag, Settings settings, String key, boolean defaultValue) { if (settings.getAsBoolean(key, defaultValue)) { return flag; diff --git 
a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/SynonymsAnalysisTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/SynonymsAnalysisTests.java index 942b4876077..a63dd975688 100644 --- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/SynonymsAnalysisTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/SynonymsAnalysisTests.java @@ -22,6 +22,7 @@ package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.core.KeywordTokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.elasticsearch.Version; import org.elasticsearch.cluster.metadata.IndexMetaData; @@ -29,14 +30,20 @@ import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.analysis.IndexAnalyzers; +import org.elasticsearch.index.analysis.TokenFilterFactory; +import org.elasticsearch.index.analysis.TokenizerFactory; import org.elasticsearch.test.ESTestCase; import org.elasticsearch.test.IndexSettingsModule; +import org.elasticsearch.test.VersionUtils; import org.hamcrest.MatcherAssert; import java.io.IOException; import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Path; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.instanceOf; @@ -118,7 +125,7 @@ public class SynonymsAnalysisTests extends ESTestCase { } } - public void testSynonymsWithMultiplexer() throws IOException { + public void testSynonymsWrappedByMultiplexer() throws IOException { Settings settings = Settings.builder() .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) .put("path.home", createTempDir().toString()) @@ -139,6 +146,180 @@ public class SynonymsAnalysisTests extends ESTestCase { new int[]{ 1, 1, 0, 0, 1, 1 }); } + public void testAsciiFoldingFilterForSynonyms() throws IOException { + Settings settings = Settings.builder() + .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) + .put("path.home", createTempDir().toString()) + .put("index.analysis.filter.synonyms.type", "synonym") + .putList("index.analysis.filter.synonyms.synonyms", "hoj, height") + .put("index.analysis.analyzer.synonymAnalyzer.tokenizer", "standard") + .putList("index.analysis.analyzer.synonymAnalyzer.filter", "lowercase", "asciifolding", "synonyms") + .build(); + IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings); + indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers; + + BaseTokenStreamTestCase.assertAnalyzesTo(indexAnalyzers.get("synonymAnalyzer"), "høj", + new String[]{ "hoj", "height" }, + new int[]{ 1, 0 }); + } + + public void testKeywordRepeatAndSynonyms() throws IOException { + Settings settings = Settings.builder() + .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) + .put("path.home", createTempDir().toString()) + .put("index.analysis.filter.synonyms.type", "synonym") + .putList("index.analysis.filter.synonyms.synonyms", "programmer, developer") + .put("index.analysis.filter.my_english.type", "stemmer") + .put("index.analysis.filter.my_english.language", "porter2") + 
.put("index.analysis.analyzer.synonymAnalyzer.tokenizer", "standard") + .putList("index.analysis.analyzer.synonymAnalyzer.filter", "lowercase", "keyword_repeat", "my_english", "synonyms") + .build(); + IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings); + indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers; + + BaseTokenStreamTestCase.assertAnalyzesTo(indexAnalyzers.get("synonymAnalyzer"), "programmers", + new String[]{ "programmers", "programm", "develop" }, + new int[]{ 1, 0, 0 }); + } + + public void testChainedSynonymFilters() throws IOException { + Settings settings = Settings.builder() + .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) + .put("path.home", createTempDir().toString()) + .put("index.analysis.filter.synonyms1.type", "synonym") + .putList("index.analysis.filter.synonyms1.synonyms", "term1, term2") + .put("index.analysis.filter.synonyms2.type", "synonym") + .putList("index.analysis.filter.synonyms2.synonyms", "term1, term3") + .put("index.analysis.analyzer.syn.tokenizer", "standard") + .putList("index.analysis.analyzer.syn.filter", "lowercase", "synonyms1", "synonyms2") + .build(); + IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings); + indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers; + + BaseTokenStreamTestCase.assertAnalyzesTo(indexAnalyzers.get("syn"), "term1", + new String[]{ "term1", "term3", "term2" }, new int[]{ 1, 0, 0 }); + } + + public void testShingleFilters() { + + Settings settings = Settings.builder() + .put(IndexMetaData.SETTING_VERSION_CREATED, + VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, Version.CURRENT)) + .put("path.home", createTempDir().toString()) + .put("index.analysis.filter.synonyms.type", "synonym") + .putList("index.analysis.filter.synonyms.synonyms", "programmer, developer") + .put("index.analysis.filter.my_shingle.type", "shingle") + .put("index.analysis.analyzer.my_analyzer.tokenizer", "standard") + .putList("index.analysis.analyzer.my_analyzer.filter", "my_shingle", "synonyms") + .build(); + IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings); + + expectThrows(IllegalArgumentException.class, () -> { + indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers; + }); + + } + + public void testTokenFiltersBypassSynonymAnalysis() throws IOException { + + Settings settings = Settings.builder() + .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) + .put("path.home", createTempDir().toString()) + .putList("word_list", "a") + .put("hyphenation_patterns_path", "foo") + .build(); + IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings); + + String[] bypassingFactories = new String[]{ + "dictionary_decompounder" + }; + + CommonAnalysisPlugin plugin = new CommonAnalysisPlugin(); + for (String factory : bypassingFactories) { + TokenFilterFactory tff = plugin.getTokenFilters().get(factory).get(idxSettings, null, factory, settings); + TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, null, "keyword", settings); + SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, null, "synonym", settings); + Analyzer analyzer = stff.buildSynonymAnalyzer(tok, Collections.emptyList(), Collections.singletonList(tff), null); + + try (TokenStream ts = analyzer.tokenStream("field", "text")) { + assertThat(ts, 
instanceOf(KeywordTokenizer.class)); + } + } + + } + + public void testDisallowedTokenFilters() throws IOException { + + Settings settings = Settings.builder() + .put(IndexMetaData.SETTING_VERSION_CREATED, + VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, Version.CURRENT)) + .put("path.home", createTempDir().toString()) + .putList("common_words", "a", "b") + .put("output_unigrams", "true") + .build(); + IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings); + CommonAnalysisPlugin plugin = new CommonAnalysisPlugin(); + + String[] disallowedFactories = new String[]{ + "multiplexer", "cjk_bigram", "common_grams", "ngram", "edge_ngram", + "word_delimiter", "word_delimiter_graph", "fingerprint" + }; + + for (String factory : disallowedFactories) { + TokenFilterFactory tff = plugin.getTokenFilters().get(factory).get(idxSettings, null, factory, settings); + TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, null, "keyword", settings); + SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, null, "synonym", settings); + + IllegalArgumentException e = expectThrows(IllegalArgumentException.class, + "Expected IllegalArgumentException for factory " + factory, + () -> stff.buildSynonymAnalyzer(tok, Collections.emptyList(), Collections.singletonList(tff), null)); + + assertEquals(factory, "Token filter [" + factory + + "] cannot be used to parse synonyms", + e.getMessage()); + } + + settings = Settings.builder() + .put(IndexMetaData.SETTING_VERSION_CREATED, + VersionUtils.randomVersionBetween(random(), Version.V_6_0_0, VersionUtils.getPreviousVersion(Version.V_7_0_0))) + .put("path.home", createTempDir().toString()) + .putList("common_words", "a", "b") + .put("output_unigrams", "true") + .build(); + idxSettings = IndexSettingsModule.newIndexSettings("index", settings); + + List expectedWarnings = new ArrayList<>(); + for (String factory : disallowedFactories) { + TokenFilterFactory tff = plugin.getTokenFilters().get(factory).get(idxSettings, null, factory, settings); + TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, null, "keyword", settings); + SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, null, "synonym", settings); + + stff.buildSynonymAnalyzer(tok, Collections.emptyList(), Collections.singletonList(tff), null); + expectedWarnings.add("Token filter [" + factory + + "] will not be usable to parse synonyms after v7.0"); + } + + assertWarnings(expectedWarnings.toArray(new String[0])); + + settings = Settings.builder() + .put(IndexMetaData.SETTING_VERSION_CREATED, + VersionUtils.randomVersionBetween(random(), Version.V_6_0_0, VersionUtils.getPreviousVersion(Version.V_7_0_0))) + .put("path.home", createTempDir().toString()) + .put("preserve_original", "false") + .build(); + idxSettings = IndexSettingsModule.newIndexSettings("index", settings); + TokenFilterFactory tff = plugin.getTokenFilters().get("multiplexer").get(idxSettings, null, "multiplexer", settings); + TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, null, "keyword", settings); + SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, null, "synonym", settings); + + IllegalArgumentException e = expectThrows(IllegalArgumentException.class, + () -> stff.buildSynonymAnalyzer(tok, Collections.emptyList(), Collections.singletonList(tff), null)); + + assertEquals("Token filter [multiplexer] cannot be used to parse synonyms unless [preserve_original] is [true]", + e.getMessage()); + + } + 
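These tests exercise the same pattern the patch applies across the bundled factories: a filter that stacks multiple tokens rejects synonym parsing outright for indices created on or after 7.0, and only logs a deprecation warning for older indices. As a condensed, hedged illustration of that contract for a third-party factory, something along these lines would follow the same convention — the class name and the choice of `EdgeNGramTokenFilter` are hypothetical examples, not code from this change:

[source,java]
----
import org.apache.logging.log4j.LogManager;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.elasticsearch.Version;
import org.elasticsearch.common.logging.DeprecationLogger;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;

/**
 * Hypothetical plugin factory for a filter that emits several tokens per input
 * token, and therefore cannot safely be applied while parsing synonym rules.
 */
public class MyMultiTokenFilterFactory extends AbstractTokenFilterFactory {

    private static final DeprecationLogger DEPRECATION_LOGGER =
        new DeprecationLogger(LogManager.getLogger(MyMultiTokenFilterFactory.class));

    public MyMultiTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
        super(indexSettings, name, settings);
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        // Example of a multi-token-producing filter; any stacking filter behaves the same way here
        return new EdgeNGramTokenFilter(tokenStream, 1, 2, false);
    }

    @Override
    public TokenFilterFactory getSynonymFilter() {
        if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0)) {
            // Indices created on or after 7.0 reject this filter in a synonym chain
            throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms");
        }
        // Older indices keep the lenient behaviour but emit a deprecation warning
        DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters",
            "Token filter [" + name() + "] will not be usable to parse synonyms after v7.0");
        return this;
    }
}
----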
private void match(String analyzerName, String source, String target) throws IOException { Analyzer analyzer = indexAnalyzers.get(analyzerName).analyzer(); diff --git a/plugins/analysis-phonetic/src/main/java/org/elasticsearch/index/analysis/PhoneticTokenFilterFactory.java b/plugins/analysis-phonetic/src/main/java/org/elasticsearch/index/analysis/PhoneticTokenFilterFactory.java index d7b619cc311..1022b826475 100644 --- a/plugins/analysis-phonetic/src/main/java/org/elasticsearch/index/analysis/PhoneticTokenFilterFactory.java +++ b/plugins/analysis-phonetic/src/main/java/org/elasticsearch/index/analysis/PhoneticTokenFilterFactory.java @@ -30,11 +30,14 @@ import org.apache.commons.codec.language.bm.Languages.LanguageSet; import org.apache.commons.codec.language.bm.NameType; import org.apache.commons.codec.language.bm.PhoneticEngine; import org.apache.commons.codec.language.bm.RuleType; +import org.apache.logging.log4j.LogManager; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.phonetic.BeiderMorseFilter; import org.apache.lucene.analysis.phonetic.DaitchMokotoffSoundexFilter; import org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilter; import org.apache.lucene.analysis.phonetic.PhoneticFilter; +import org.elasticsearch.Version; +import org.elasticsearch.common.logging.DeprecationLogger; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; @@ -47,6 +50,10 @@ import java.util.List; public class PhoneticTokenFilterFactory extends AbstractTokenFilterFactory { + + private static final DeprecationLogger DEPRECATION_LOGGER + = new DeprecationLogger(LogManager.getLogger(PhoneticTokenFilterFactory.class)); + private final Encoder encoder; private final boolean replace; private int maxcodelength; @@ -138,4 +145,16 @@ public class PhoneticTokenFilterFactory extends AbstractTokenFilterFactory { } throw new IllegalArgumentException("encoder error"); } + + @Override + public TokenFilterFactory getSynonymFilter() { + if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0)) { + throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms"); + } + else { + DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name() + + "] will not be usable to parse synonyms after v7.0"); + return this; + } + } } diff --git a/plugins/analysis-phonetic/src/test/java/org/elasticsearch/index/analysis/AnalysisPhoneticFactoryTests.java b/plugins/analysis-phonetic/src/test/java/org/elasticsearch/index/analysis/AnalysisPhoneticFactoryTests.java index 8c551aee919..2092d63fd23 100644 --- a/plugins/analysis-phonetic/src/test/java/org/elasticsearch/index/analysis/AnalysisPhoneticFactoryTests.java +++ b/plugins/analysis-phonetic/src/test/java/org/elasticsearch/index/analysis/AnalysisPhoneticFactoryTests.java @@ -19,9 +19,16 @@ package org.elasticsearch.index.analysis; +import org.elasticsearch.Version; +import org.elasticsearch.cluster.metadata.IndexMetaData; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.IndexSettings; import org.elasticsearch.indices.analysis.AnalysisFactoryTestCase; import org.elasticsearch.plugin.analysis.AnalysisPhoneticPlugin; +import org.elasticsearch.test.IndexSettingsModule; +import org.elasticsearch.test.VersionUtils; +import java.io.IOException; import java.util.HashMap; import java.util.Map; @@ -38,4 +45,33 @@ public class AnalysisPhoneticFactoryTests extends 
AnalysisFactoryTestCase { filters.put("phonetic", PhoneticTokenFilterFactory.class); return filters; } + + public void testDisallowedWithSynonyms() throws IOException { + + AnalysisPhoneticPlugin plugin = new AnalysisPhoneticPlugin(); + + Settings settings = Settings.builder() + .put(IndexMetaData.SETTING_VERSION_CREATED, VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, Version.CURRENT)) + .put("path.home", createTempDir().toString()) + .build(); + IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings); + + TokenFilterFactory tff + = plugin.getTokenFilters().get("phonetic").get(idxSettings, null, "phonetic", settings); + IllegalArgumentException e = expectThrows(IllegalArgumentException.class, tff::getSynonymFilter); + assertEquals("Token filter [phonetic] cannot be used to parse synonyms", e.getMessage()); + + settings = Settings.builder() + .put(IndexMetaData.SETTING_VERSION_CREATED, VersionUtils.randomVersionBetween(random(), + Version.V_6_0_0, VersionUtils.getPreviousVersion(Version.V_7_0_0))) + .put("path.home", createTempDir().toString()) + .build(); + idxSettings = IndexSettingsModule.newIndexSettings("index", settings); + + tff = plugin.getTokenFilters().get("phonetic").get(idxSettings, null, "phonetic", settings); + tff.getSynonymFilter(); + + assertWarnings("Token filter [phonetic] will not be usable to parse synonyms after v7.0"); + } + } diff --git a/server/src/main/java/org/elasticsearch/index/analysis/PreConfiguredTokenFilter.java b/server/src/main/java/org/elasticsearch/index/analysis/PreConfiguredTokenFilter.java index bd70d929555..123802c9510 100644 --- a/server/src/main/java/org/elasticsearch/index/analysis/PreConfiguredTokenFilter.java +++ b/server/src/main/java/org/elasticsearch/index/analysis/PreConfiguredTokenFilter.java @@ -37,16 +37,26 @@ public final class PreConfiguredTokenFilter extends PreConfiguredAnalysisCompone */ public static PreConfiguredTokenFilter singleton(String name, boolean useFilterForMultitermQueries, Function create) { - return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, CachingStrategy.ONE, + return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, false, CachingStrategy.ONE, (tokenStream, version) -> create.apply(tokenStream)); } + /** + * Create a pre-configured token filter that may not vary at all. + */ + public static PreConfiguredTokenFilter singleton(String name, boolean useFilterForMultitermQueries, + boolean useFilterForParsingSynonyms, + Function create) { + return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, useFilterForParsingSynonyms, CachingStrategy.ONE, + (tokenStream, version) -> create.apply(tokenStream)); + } + /** * Create a pre-configured token filter that may not vary at all. 
*/ public static PreConfiguredTokenFilter singletonWithVersion(String name, boolean useFilterForMultitermQueries, BiFunction create) { - return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, CachingStrategy.ONE, + return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, false, CachingStrategy.ONE, (tokenStream, version) -> create.apply(tokenStream, version)); } @@ -55,7 +65,7 @@ public final class PreConfiguredTokenFilter extends PreConfiguredAnalysisCompone */ public static PreConfiguredTokenFilter luceneVersion(String name, boolean useFilterForMultitermQueries, BiFunction create) { - return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, CachingStrategy.LUCENE, + return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, false, CachingStrategy.LUCENE, (tokenStream, version) -> create.apply(tokenStream, version.luceneVersion)); } @@ -64,16 +74,18 @@ public final class PreConfiguredTokenFilter extends PreConfiguredAnalysisCompone */ public static PreConfiguredTokenFilter elasticsearchVersion(String name, boolean useFilterForMultitermQueries, BiFunction create) { - return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, CachingStrategy.ELASTICSEARCH, create); + return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, false, CachingStrategy.ELASTICSEARCH, create); } private final boolean useFilterForMultitermQueries; + private final boolean useFilterForParsingSynonyms; private final BiFunction create; - private PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries, + private PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries, boolean useFilterForParsingSynonyms, PreBuiltCacheFactory.CachingStrategy cache, BiFunction create) { super(name, cache); this.useFilterForMultitermQueries = useFilterForMultitermQueries; + this.useFilterForParsingSynonyms = useFilterForParsingSynonyms; this.create = create; } @@ -104,6 +116,13 @@ public final class PreConfiguredTokenFilter extends PreConfiguredAnalysisCompone return create.apply(tokenStream, version); } + @Override + public TokenFilterFactory getSynonymFilter() { + if (useFilterForParsingSynonyms) { + return this; + } + return IDENTITY_FILTER; + } }; } return new TokenFilterFactory() { @@ -116,6 +135,14 @@ public final class PreConfiguredTokenFilter extends PreConfiguredAnalysisCompone public TokenStream create(TokenStream tokenStream) { return create.apply(tokenStream, version); } + + @Override + public TokenFilterFactory getSynonymFilter() { + if (useFilterForParsingSynonyms) { + return this; + } + return IDENTITY_FILTER; + } }; } } diff --git a/server/src/main/java/org/elasticsearch/index/analysis/ShingleTokenFilterFactory.java b/server/src/main/java/org/elasticsearch/index/analysis/ShingleTokenFilterFactory.java index 6ab9a6d51cf..37a163eac97 100644 --- a/server/src/main/java/org/elasticsearch/index/analysis/ShingleTokenFilterFactory.java +++ b/server/src/main/java/org/elasticsearch/index/analysis/ShingleTokenFilterFactory.java @@ -19,16 +19,21 @@ package org.elasticsearch.index.analysis; +import org.apache.logging.log4j.LogManager; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute; import org.apache.lucene.analysis.shingle.ShingleFilter; import org.elasticsearch.Version; +import org.elasticsearch.common.logging.DeprecationLogger; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import 
org.elasticsearch.index.IndexSettings; public class ShingleTokenFilterFactory extends AbstractTokenFilterFactory { + private static final DeprecationLogger DEPRECATION_LOGGER = + new DeprecationLogger(LogManager.getLogger(ShingleTokenFilterFactory.class)); + private final Factory factory; public ShingleTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { @@ -54,8 +59,8 @@ public class ShingleTokenFilterFactory extends AbstractTokenFilterFactory { Boolean outputUnigramsIfNoShingles = settings.getAsBoolean("output_unigrams_if_no_shingles", false); String tokenSeparator = settings.get("token_separator", ShingleFilter.DEFAULT_TOKEN_SEPARATOR); String fillerToken = settings.get("filler_token", ShingleFilter.DEFAULT_FILLER_TOKEN); - factory = new Factory("shingle", minShingleSize, maxShingleSize, outputUnigrams, outputUnigramsIfNoShingles, - tokenSeparator, fillerToken); + factory = new Factory("shingle", minShingleSize, maxShingleSize, + outputUnigrams, outputUnigramsIfNoShingles, tokenSeparator, fillerToken); } @@ -64,6 +69,19 @@ public class ShingleTokenFilterFactory extends AbstractTokenFilterFactory { return factory.create(tokenStream); } + @Override + public TokenFilterFactory getSynonymFilter() { + if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0)) { + throw new IllegalArgumentException("Token filter [" + name() + + "] cannot be used to parse synonyms"); + } + else { + DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name() + + "] will not be usable to parse synonyms after v7.0"); + } + return this; + + } public Factory getInnerFactory() { return this.factory; diff --git a/server/src/main/java/org/elasticsearch/index/analysis/TokenFilterFactory.java b/server/src/main/java/org/elasticsearch/index/analysis/TokenFilterFactory.java index a400755c860..b7ed6fd9e9e 100644 --- a/server/src/main/java/org/elasticsearch/index/analysis/TokenFilterFactory.java +++ b/server/src/main/java/org/elasticsearch/index/analysis/TokenFilterFactory.java @@ -67,7 +67,8 @@ public interface TokenFilterFactory { * Return a version of this TokenFilterFactory appropriate for synonym parsing * * Filters that should not be applied to synonyms (for example, those that produce - * multiple tokens) can return {@link #IDENTITY_FILTER} + * multiple tokens) should throw an exception + * */ default TokenFilterFactory getSynonymFilter() { return this;