Ensure TokenFilters only produce single tokens when parsing synonyms (#34331)
A number of tokenfilters can produce multiple tokens at the same position. This is a problem when using token chains to parse synonym files, as the SynonymMap requires that there are no stacked tokens in its input.

This commit ensures that, when used to parse synonyms, these tokenfilters either produce a single version of their input token, or throw an error when mappings are generated. In indexes created in Elasticsearch 6.x, deprecation warnings are emitted in place of the error.

* asciifolding and cjk_bigram produce only the folded or bigrammed token
* decompounders, synonyms and keyword_repeat are skipped
* n-grams, word-delimiter-filter, multiplexer, fingerprint and phonetic throw errors

Fixes #34298
This commit is contained in:
parent
c63d0af913
commit
a646f85a99
|
@ -175,3 +175,15 @@ PUT /test_index
|
|||
|
||||
Using `synonyms_path` to define WordNet synonyms in a file is supported
|
||||
as well.
|
||||
|
||||
=== Parsing synonym files
|
||||
|
||||
Elasticsearch will use the token filters preceding the synonym filter
|
||||
in a tokenizer chain to parse the entries in a synonym file. So, for example, if a
|
||||
synonym filter is placed after a stemmer, then the stemmer will also be applied
|
||||
to the synonym entries. Because entries in the synonym map cannot have stacked
|
||||
positions, some token filters may cause issues here. Token filters that produce
|
||||
multiple versions of a token may choose which version of the token to emit when
|
||||
parsing synonyms, e.g. `asciifolding` will only produce the folded version of the
|
||||
token. Others, e.g. `multiplexer`, `word_delimiter_graph` or `ngram` will throw an
|
||||
error.
|
||||
|
|
|
@ -163,3 +163,16 @@ PUT /test_index
|
|||
|
||||
Using `synonyms_path` to define WordNet synonyms in a file is supported
|
||||
as well.
|
||||
|
||||
|
||||
=== Parsing synonym files
|
||||
|
||||
Elasticsearch will use the token filters preceding the synonym filter
|
||||
in a tokenizer chain to parse the entries in a synonym file. So, for example, if a
|
||||
synonym filter is placed after a stemmer, then the stemmer will also be applied
|
||||
to the synonym entries. Because entries in the synonym map cannot have stacked
|
||||
positions, some token filters may cause issues here. Token filters that produce
|
||||
multiple versions of a token may choose which version of the token to emit when
|
||||
parsing synonyms, e.g. `asciifolding` will only produce the folded version of the
|
||||
token. Others, e.g. `multiplexer`, `word_delimiter_graph` or `ngram` will throw an
|
||||
error.
|
|
@ -27,6 +27,7 @@ import org.elasticsearch.env.Environment;
|
|||
import org.elasticsearch.index.IndexSettings;
|
||||
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.NormalizingTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
||||
|
||||
/**
|
||||
* Factory for ASCIIFoldingFilter.
|
||||
|
@ -51,8 +52,28 @@ public class ASCIIFoldingTokenFilterFactory extends AbstractTokenFilterFactory
|
|||
}
|
||||
|
||||
@Override
|
||||
public TokenFilterFactory getSynonymFilter() {
|
||||
if (preserveOriginal == false) {
|
||||
return this;
|
||||
} else {
|
||||
// See https://issues.apache.org/jira/browse/LUCENE-7536 for the reasoning
|
||||
return new TokenFilterFactory() {
|
||||
@Override
|
||||
public String name() {
|
||||
return ASCIIFoldingTokenFilterFactory.this.name();
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream) {
|
||||
return new ASCIIFoldingFilter(tokenStream, false);
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
public TokenStream normalize(TokenStream tokenStream) {
|
||||
// Normalization should only emit a single token, so always turn off preserveOriginal
|
||||
return new ASCIIFoldingFilter(tokenStream, false);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -26,6 +26,7 @@ import org.elasticsearch.env.Environment;
|
|||
import org.elasticsearch.index.IndexSettings;
|
||||
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.Analysis;
|
||||
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
||||
|
||||
/**
|
||||
* Contains the common configuration settings between subclasses of this class.
|
||||
|
@ -50,4 +51,9 @@ public abstract class AbstractCompoundWordTokenFilterFactory extends AbstractTok
|
|||
throw new IllegalArgumentException("word_list must be provided for [" + name + "], either as a path to a file, or directly");
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Returns the factory to use in place of this one when parsing a synonym file.
 * Always returns {@code IDENTITY_FILTER}, so decompounding is not applied to synonym entries.
 */
@Override
|
||||
public TokenFilterFactory getSynonymFilter() {
|
||||
return IDENTITY_FILTER; // don't decompound synonym file
|
||||
}
|
||||
}
|
||||
|
|
|
@ -19,13 +19,17 @@
|
|||
|
||||
package org.elasticsearch.analysis.common;
|
||||
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
|
||||
import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
|
||||
import org.elasticsearch.Version;
|
||||
import org.elasticsearch.common.logging.DeprecationLogger;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
|
@ -48,6 +52,9 @@ import java.util.Set;
|
|||
*/
|
||||
public final class CJKBigramFilterFactory extends AbstractTokenFilterFactory {
|
||||
|
||||
private static final DeprecationLogger DEPRECATION_LOGGER
|
||||
= new DeprecationLogger(LogManager.getLogger(CJKBigramFilterFactory.class));
|
||||
|
||||
private final int flags;
|
||||
private final boolean outputUnigrams;
|
||||
|
||||
|
@ -89,4 +96,18 @@ public final class CJKBigramFilterFactory extends AbstractTokenFilterFactory {
|
|||
return filter;
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenFilterFactory getSynonymFilter() {
|
||||
if (outputUnigrams) {
|
||||
if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0)) {
|
||||
throw new IllegalArgumentException("Token filter [" + name() +
|
||||
"] cannot be used to parse synonyms");
|
||||
}
|
||||
else {
|
||||
DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name()
|
||||
+ "] will not be usable to parse synonyms after v7.0");
|
||||
}
|
||||
}
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -426,7 +426,7 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin, Scri
|
|||
filters.add(PreConfiguredTokenFilter.singleton("german_stem", false, GermanStemFilter::new));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("hindi_normalization", true, HindiNormalizationFilter::new));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("indic_normalization", true, IndicNormalizationFilter::new));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("keyword_repeat", false, KeywordRepeatFilter::new));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("keyword_repeat", false, false, KeywordRepeatFilter::new));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("kstem", false, KStemFilter::new));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("length", false, input ->
|
||||
new LengthFilter(input, 0, Integer.MAX_VALUE))); // TODO this one seems useless
|
||||
|
|
|
@ -19,18 +19,25 @@
|
|||
|
||||
package org.elasticsearch.analysis.common;
|
||||
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
|
||||
import org.apache.lucene.analysis.commongrams.CommonGramsQueryFilter;
|
||||
import org.elasticsearch.Version;
|
||||
import org.elasticsearch.common.logging.DeprecationLogger;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.Analysis;
|
||||
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
||||
|
||||
public class CommonGramsTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||
|
||||
private static final DeprecationLogger DEPRECATION_LOGGER
|
||||
= new DeprecationLogger(LogManager.getLogger(CommonGramsTokenFilterFactory.class));
|
||||
|
||||
private final CharArraySet words;
|
||||
|
||||
private final boolean ignoreCase;
|
||||
|
@ -58,5 +65,17 @@ public class CommonGramsTokenFilterFactory extends AbstractTokenFilterFactory {
|
|||
return filter;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenFilterFactory getSynonymFilter() {
|
||||
if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0)) {
|
||||
throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms");
|
||||
} else {
|
||||
DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name()
|
||||
+ "] will not be usable to parse synonyms after v7.0");
|
||||
}
|
||||
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -19,17 +19,24 @@
|
|||
|
||||
package org.elasticsearch.analysis.common;
|
||||
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
|
||||
import org.apache.lucene.analysis.reverse.ReverseStringFilter;
|
||||
import org.elasticsearch.Version;
|
||||
import org.elasticsearch.common.logging.DeprecationLogger;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
||||
|
||||
|
||||
public class EdgeNGramTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||
|
||||
private static final DeprecationLogger DEPRECATION_LOGGER
|
||||
= new DeprecationLogger(LogManager.getLogger(EdgeNGramTokenFilterFactory.class));
|
||||
|
||||
private final int minGram;
|
||||
|
||||
private final int maxGram;
|
||||
|
@ -77,4 +84,16 @@ public class EdgeNGramTokenFilterFactory extends AbstractTokenFilterFactory {
|
|||
public boolean breaksFastVectorHighlighter() {
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenFilterFactory getSynonymFilter() {
|
||||
if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0)) {
|
||||
throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms");
|
||||
}
|
||||
else {
|
||||
DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name()
|
||||
+ "] will not be usable to parse synonyms after v7.0");
|
||||
return this;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -19,18 +19,25 @@
|
|||
|
||||
package org.elasticsearch.analysis.common;
|
||||
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.miscellaneous.FingerprintFilter;
|
||||
import org.elasticsearch.Version;
|
||||
import org.elasticsearch.common.logging.DeprecationLogger;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
||||
|
||||
import static org.elasticsearch.analysis.common.FingerprintAnalyzerProvider.DEFAULT_MAX_OUTPUT_SIZE;
|
||||
import static org.elasticsearch.analysis.common.FingerprintAnalyzerProvider.MAX_OUTPUT_SIZE;
|
||||
|
||||
public class FingerprintTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||
|
||||
private static final DeprecationLogger DEPRECATION_LOGGER
|
||||
= new DeprecationLogger(LogManager.getLogger(FingerprintTokenFilterFactory.class));
|
||||
|
||||
private final char separator;
|
||||
private final int maxOutputSize;
|
||||
|
||||
|
@ -47,4 +54,16 @@ public class FingerprintTokenFilterFactory extends AbstractTokenFilterFactory {
|
|||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenFilterFactory getSynonymFilter() {
|
||||
if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0)) {
|
||||
throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms");
|
||||
}
|
||||
else {
|
||||
DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name()
|
||||
+ "] will not be usable to parse synonyms after v7.0");
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -19,12 +19,15 @@
|
|||
|
||||
package org.elasticsearch.analysis.common;
|
||||
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter;
|
||||
import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.elasticsearch.Version;
|
||||
import org.elasticsearch.common.Strings;
|
||||
import org.elasticsearch.common.logging.DeprecationLogger;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
|
@ -40,6 +43,9 @@ import java.util.function.Function;
|
|||
|
||||
public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||
|
||||
private static final DeprecationLogger DEPRECATION_LOGGER
|
||||
= new DeprecationLogger(LogManager.getLogger(MultiplexerTokenFilterFactory.class));
|
||||
|
||||
private List<String> filterNames;
|
||||
private final boolean preserveOriginal;
|
||||
|
||||
|
@ -54,6 +60,22 @@ public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory {
|
|||
throw new UnsupportedOperationException("TokenFilterFactory.getChainAwareTokenFilterFactory() must be called first");
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenFilterFactory getSynonymFilter() {
|
||||
if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0)) {
|
||||
throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms");
|
||||
}
|
||||
else {
|
||||
if (preserveOriginal) {
|
||||
DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name()
|
||||
+ "] will not be usable to parse synonyms after v7.0");
|
||||
return IDENTITY_FILTER;
|
||||
}
|
||||
throw new IllegalArgumentException("Token filter [" + name()
|
||||
+ "] cannot be used to parse synonyms unless [preserve_original] is [true]");
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
|
||||
List<TokenFilterFactory> previousTokenFilters,
|
||||
|
@ -98,7 +120,18 @@ public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory {
|
|||
|
||||
@Override
|
||||
public TokenFilterFactory getSynonymFilter() {
|
||||
return IDENTITY_FILTER;
|
||||
if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0)) {
|
||||
throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms");
|
||||
}
|
||||
else {
|
||||
if (preserveOriginal) {
|
||||
DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name()
|
||||
+ "] will not be usable to parse synonyms after v7.0");
|
||||
return IDENTITY_FILTER;
|
||||
}
|
||||
throw new IllegalArgumentException("Token filter [" + name()
|
||||
+ "] cannot be used to parse synonyms unless [preserve_original] is [true]");
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
|
|
@ -19,23 +19,27 @@
|
|||
|
||||
package org.elasticsearch.analysis.common;
|
||||
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
|
||||
import org.elasticsearch.common.logging.DeprecationLogger;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
|
||||
import org.elasticsearch.Version;
|
||||
|
||||
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
||||
|
||||
|
||||
public class NGramTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||
|
||||
private static final DeprecationLogger DEPRECATION_LOGGER
|
||||
= new DeprecationLogger(LogManager.getLogger(NGramTokenFilterFactory.class));
|
||||
|
||||
private final int minGram;
|
||||
|
||||
private final int maxGram;
|
||||
|
||||
|
||||
NGramTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
||||
super(indexSettings, name, settings);
|
||||
int maxAllowedNgramDiff = indexSettings.getMaxNgramDiff();
|
||||
|
@ -60,4 +64,16 @@ public class NGramTokenFilterFactory extends AbstractTokenFilterFactory {
|
|||
// TODO: Expose preserveOriginal
|
||||
return new NGramTokenFilter(tokenStream, minGram, maxGram, false);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenFilterFactory getSynonymFilter() {
|
||||
if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0)) {
|
||||
throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms");
|
||||
}
|
||||
else {
|
||||
DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name()
|
||||
+ "] will not be usable to parse synonyms after v7.0");
|
||||
return this;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -49,7 +49,7 @@ public class SynonymGraphTokenFilterFactory extends SynonymTokenFilterFactory {
|
|||
public TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
|
||||
List<TokenFilterFactory> previousTokenFilters,
|
||||
Function<String, TokenFilterFactory> allFilters) {
|
||||
final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters);
|
||||
final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters, allFilters);
|
||||
final SynonymMap synonyms = buildSynonyms(analyzer, getRulesFromSettings(environment));
|
||||
final String name = name();
|
||||
return new TokenFilterFactory() {
|
||||
|
|
|
@ -19,10 +19,12 @@
|
|||
|
||||
package org.elasticsearch.analysis.common;
|
||||
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.synonym.SynonymFilter;
|
||||
import org.apache.lucene.analysis.synonym.SynonymMap;
|
||||
import org.elasticsearch.common.logging.DeprecationLogger;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
|
@ -40,6 +42,9 @@ import java.util.function.Function;
|
|||
|
||||
public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||
|
||||
private static final DeprecationLogger DEPRECATION_LOGGER
|
||||
= new DeprecationLogger(LogManager.getLogger(SynonymTokenFilterFactory.class));
|
||||
|
||||
private final String format;
|
||||
private final boolean expand;
|
||||
private final boolean lenient;
|
||||
|
@ -52,7 +57,7 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
|
|||
this.settings = settings;
|
||||
|
||||
if (settings.get("ignore_case") != null) {
|
||||
deprecationLogger.deprecated(
|
||||
DEPRECATION_LOGGER.deprecated(
|
||||
"The ignore_case option on the synonym_graph filter is deprecated. " +
|
||||
"Instead, insert a lowercase filter in the filter chain before the synonym_graph filter.");
|
||||
}
|
||||
|
@ -72,7 +77,7 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
|
|||
public TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
|
||||
List<TokenFilterFactory> previousTokenFilters,
|
||||
Function<String, TokenFilterFactory> allFilters) {
|
||||
final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters);
|
||||
final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters, allFilters);
|
||||
final SynonymMap synonyms = buildSynonyms(analyzer, getRulesFromSettings(environment));
|
||||
final String name = name();
|
||||
return new TokenFilterFactory() {
|
||||
|
@ -85,11 +90,19 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
|
|||
public TokenStream create(TokenStream tokenStream) {
|
||||
return synonyms.fst == null ? tokenStream : new SynonymFilter(tokenStream, synonyms, false);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenFilterFactory getSynonymFilter() {
|
||||
// In order to allow chained synonym filters, we return IDENTITY here to
|
||||
// ensure that synonyms don't get applied to the synonym map itself,
|
||||
// which doesn't support stacked input tokens
|
||||
return IDENTITY_FILTER;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
Analyzer buildSynonymAnalyzer(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
|
||||
List<TokenFilterFactory> tokenFilters) {
|
||||
List<TokenFilterFactory> tokenFilters, Function<String, TokenFilterFactory> allFilters) {
|
||||
return new CustomAnalyzer("synonyms", tokenizer, charFilters.toArray(new CharFilterFactory[0]),
|
||||
tokenFilters.stream()
|
||||
.map(TokenFilterFactory::getSynonymFilter)
|
||||
|
|
|
@ -19,15 +19,19 @@
|
|||
|
||||
package org.elasticsearch.analysis.common;
|
||||
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
|
||||
import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator;
|
||||
import org.elasticsearch.Version;
|
||||
import org.elasticsearch.common.logging.DeprecationLogger;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.Analysis;
|
||||
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
@ -45,6 +49,9 @@ import static org.elasticsearch.analysis.common.WordDelimiterTokenFilterFactory.
|
|||
|
||||
public class WordDelimiterGraphTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||
|
||||
private static final DeprecationLogger DEPRECATION_LOGGER =
|
||||
new DeprecationLogger(LogManager.getLogger(WordDelimiterGraphTokenFilterFactory.class));
|
||||
|
||||
private final byte[] charTypeTable;
|
||||
private final int flags;
|
||||
private final CharArraySet protoWords;
|
||||
|
@ -95,6 +102,18 @@ public class WordDelimiterGraphTokenFilterFactory extends AbstractTokenFilterFac
|
|||
return new WordDelimiterGraphFilter(tokenStream, charTypeTable, flags, protoWords);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenFilterFactory getSynonymFilter() {
|
||||
if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0)) {
|
||||
throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms");
|
||||
}
|
||||
else {
|
||||
DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name()
|
||||
+ "] will not be usable to parse synonyms after v7.0");
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
||||
private int getFlag(int flag, Settings settings, String key, boolean defaultValue) {
|
||||
if (settings.getAsBoolean(key, defaultValue)) {
|
||||
return flag;
|
||||
|
|
|
@ -19,15 +19,19 @@
|
|||
|
||||
package org.elasticsearch.analysis.common;
|
||||
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
|
||||
import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator;
|
||||
import org.elasticsearch.Version;
|
||||
import org.elasticsearch.common.logging.DeprecationLogger;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.Analysis;
|
||||
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
|
@ -50,6 +54,9 @@ import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.STEM_
|
|||
|
||||
public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||
|
||||
private static final DeprecationLogger DEPRECATION_LOGGER =
|
||||
new DeprecationLogger(LogManager.getLogger(WordDelimiterTokenFilterFactory.class));
|
||||
|
||||
private final byte[] charTypeTable;
|
||||
private final int flags;
|
||||
private final CharArraySet protoWords;
|
||||
|
@ -103,6 +110,18 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory
|
|||
protoWords);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenFilterFactory getSynonymFilter() {
|
||||
if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0)) {
|
||||
throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms");
|
||||
}
|
||||
else {
|
||||
DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name()
|
||||
+ "] will not be usable to parse synonyms after v7.0");
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
||||
public int getFlag(int flag, Settings settings, String key, boolean defaultValue) {
|
||||
if (settings.getAsBoolean(key, defaultValue)) {
|
||||
return flag;
|
||||
|
|
|
@ -22,6 +22,7 @@ package org.elasticsearch.analysis.common;
|
|||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.elasticsearch.Version;
|
||||
import org.elasticsearch.cluster.metadata.IndexMetaData;
|
||||
|
@ -29,14 +30,20 @@ import org.elasticsearch.common.settings.Settings;
|
|||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
import org.elasticsearch.index.analysis.IndexAnalyzers;
|
||||
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.TokenizerFactory;
|
||||
import org.elasticsearch.test.ESTestCase;
|
||||
import org.elasticsearch.test.IndexSettingsModule;
|
||||
import org.elasticsearch.test.VersionUtils;
|
||||
import org.hamcrest.MatcherAssert;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
import static org.hamcrest.Matchers.equalTo;
|
||||
import static org.hamcrest.Matchers.instanceOf;
|
||||
|
@ -118,7 +125,7 @@ public class SynonymsAnalysisTests extends ESTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
public void testSynonymsWithMultiplexer() throws IOException {
|
||||
public void testSynonymsWrappedByMultiplexer() throws IOException {
|
||||
Settings settings = Settings.builder()
|
||||
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
|
||||
.put("path.home", createTempDir().toString())
|
||||
|
@ -139,6 +146,180 @@ public class SynonymsAnalysisTests extends ESTestCase {
|
|||
new int[]{ 1, 1, 0, 0, 1, 1 });
|
||||
}
|
||||
|
||||
public void testAsciiFoldingFilterForSynonyms() throws IOException {
|
||||
Settings settings = Settings.builder()
|
||||
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
|
||||
.put("path.home", createTempDir().toString())
|
||||
.put("index.analysis.filter.synonyms.type", "synonym")
|
||||
.putList("index.analysis.filter.synonyms.synonyms", "hoj, height")
|
||||
.put("index.analysis.analyzer.synonymAnalyzer.tokenizer", "standard")
|
||||
.putList("index.analysis.analyzer.synonymAnalyzer.filter", "lowercase", "asciifolding", "synonyms")
|
||||
.build();
|
||||
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
|
||||
indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers;
|
||||
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(indexAnalyzers.get("synonymAnalyzer"), "høj",
|
||||
new String[]{ "hoj", "height" },
|
||||
new int[]{ 1, 0 });
|
||||
}
|
||||
|
||||
public void testKeywordRepeatAndSynonyms() throws IOException {
|
||||
Settings settings = Settings.builder()
|
||||
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
|
||||
.put("path.home", createTempDir().toString())
|
||||
.put("index.analysis.filter.synonyms.type", "synonym")
|
||||
.putList("index.analysis.filter.synonyms.synonyms", "programmer, developer")
|
||||
.put("index.analysis.filter.my_english.type", "stemmer")
|
||||
.put("index.analysis.filter.my_english.language", "porter2")
|
||||
.put("index.analysis.analyzer.synonymAnalyzer.tokenizer", "standard")
|
||||
.putList("index.analysis.analyzer.synonymAnalyzer.filter", "lowercase", "keyword_repeat", "my_english", "synonyms")
|
||||
.build();
|
||||
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
|
||||
indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers;
|
||||
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(indexAnalyzers.get("synonymAnalyzer"), "programmers",
|
||||
new String[]{ "programmers", "programm", "develop" },
|
||||
new int[]{ 1, 0, 0 });
|
||||
}
|
||||
|
||||
public void testChainedSynonymFilters() throws IOException {
|
||||
Settings settings = Settings.builder()
|
||||
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
|
||||
.put("path.home", createTempDir().toString())
|
||||
.put("index.analysis.filter.synonyms1.type", "synonym")
|
||||
.putList("index.analysis.filter.synonyms1.synonyms", "term1, term2")
|
||||
.put("index.analysis.filter.synonyms2.type", "synonym")
|
||||
.putList("index.analysis.filter.synonyms2.synonyms", "term1, term3")
|
||||
.put("index.analysis.analyzer.syn.tokenizer", "standard")
|
||||
.putList("index.analysis.analyzer.syn.filter", "lowercase", "synonyms1", "synonyms2")
|
||||
.build();
|
||||
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
|
||||
indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers;
|
||||
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(indexAnalyzers.get("syn"), "term1",
|
||||
new String[]{ "term1", "term3", "term2" }, new int[]{ 1, 0, 0 });
|
||||
}
|
||||
|
||||
public void testShingleFilters() {
|
||||
|
||||
Settings settings = Settings.builder()
|
||||
.put(IndexMetaData.SETTING_VERSION_CREATED,
|
||||
VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, Version.CURRENT))
|
||||
.put("path.home", createTempDir().toString())
|
||||
.put("index.analysis.filter.synonyms.type", "synonym")
|
||||
.putList("index.analysis.filter.synonyms.synonyms", "programmer, developer")
|
||||
.put("index.analysis.filter.my_shingle.type", "shingle")
|
||||
.put("index.analysis.analyzer.my_analyzer.tokenizer", "standard")
|
||||
.putList("index.analysis.analyzer.my_analyzer.filter", "my_shingle", "synonyms")
|
||||
.build();
|
||||
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
|
||||
|
||||
expectThrows(IllegalArgumentException.class, () -> {
|
||||
indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers;
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
public void testTokenFiltersBypassSynonymAnalysis() throws IOException {
|
||||
|
||||
Settings settings = Settings.builder()
|
||||
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
|
||||
.put("path.home", createTempDir().toString())
|
||||
.putList("word_list", "a")
|
||||
.put("hyphenation_patterns_path", "foo")
|
||||
.build();
|
||||
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
|
||||
|
||||
String[] bypassingFactories = new String[]{
|
||||
"dictionary_decompounder"
|
||||
};
|
||||
|
||||
CommonAnalysisPlugin plugin = new CommonAnalysisPlugin();
|
||||
for (String factory : bypassingFactories) {
|
||||
TokenFilterFactory tff = plugin.getTokenFilters().get(factory).get(idxSettings, null, factory, settings);
|
||||
TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, null, "keyword", settings);
|
||||
SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, null, "synonym", settings);
|
||||
Analyzer analyzer = stff.buildSynonymAnalyzer(tok, Collections.emptyList(), Collections.singletonList(tff), null);
|
||||
|
||||
try (TokenStream ts = analyzer.tokenStream("field", "text")) {
|
||||
assertThat(ts, instanceOf(KeywordTokenizer.class));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public void testDisallowedTokenFilters() throws IOException {
|
||||
|
||||
Settings settings = Settings.builder()
|
||||
.put(IndexMetaData.SETTING_VERSION_CREATED,
|
||||
VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, Version.CURRENT))
|
||||
.put("path.home", createTempDir().toString())
|
||||
.putList("common_words", "a", "b")
|
||||
.put("output_unigrams", "true")
|
||||
.build();
|
||||
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
|
||||
CommonAnalysisPlugin plugin = new CommonAnalysisPlugin();
|
||||
|
||||
String[] disallowedFactories = new String[]{
|
||||
"multiplexer", "cjk_bigram", "common_grams", "ngram", "edge_ngram",
|
||||
"word_delimiter", "word_delimiter_graph", "fingerprint"
|
||||
};
|
||||
|
||||
for (String factory : disallowedFactories) {
|
||||
TokenFilterFactory tff = plugin.getTokenFilters().get(factory).get(idxSettings, null, factory, settings);
|
||||
TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, null, "keyword", settings);
|
||||
SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, null, "synonym", settings);
|
||||
|
||||
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
|
||||
"Expected IllegalArgumentException for factory " + factory,
|
||||
() -> stff.buildSynonymAnalyzer(tok, Collections.emptyList(), Collections.singletonList(tff), null));
|
||||
|
||||
assertEquals(factory, "Token filter [" + factory
|
||||
+ "] cannot be used to parse synonyms",
|
||||
e.getMessage());
|
||||
}
|
||||
|
||||
settings = Settings.builder()
|
||||
.put(IndexMetaData.SETTING_VERSION_CREATED,
|
||||
VersionUtils.randomVersionBetween(random(), Version.V_6_0_0, VersionUtils.getPreviousVersion(Version.V_7_0_0)))
|
||||
.put("path.home", createTempDir().toString())
|
||||
.putList("common_words", "a", "b")
|
||||
.put("output_unigrams", "true")
|
||||
.build();
|
||||
idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
|
||||
|
||||
List<String> expectedWarnings = new ArrayList<>();
|
||||
for (String factory : disallowedFactories) {
|
||||
TokenFilterFactory tff = plugin.getTokenFilters().get(factory).get(idxSettings, null, factory, settings);
|
||||
TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, null, "keyword", settings);
|
||||
SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, null, "synonym", settings);
|
||||
|
||||
stff.buildSynonymAnalyzer(tok, Collections.emptyList(), Collections.singletonList(tff), null);
|
||||
expectedWarnings.add("Token filter [" + factory
|
||||
+ "] will not be usable to parse synonyms after v7.0");
|
||||
}
|
||||
|
||||
assertWarnings(expectedWarnings.toArray(new String[0]));
|
||||
|
||||
settings = Settings.builder()
|
||||
.put(IndexMetaData.SETTING_VERSION_CREATED,
|
||||
VersionUtils.randomVersionBetween(random(), Version.V_6_0_0, VersionUtils.getPreviousVersion(Version.V_7_0_0)))
|
||||
.put("path.home", createTempDir().toString())
|
||||
.put("preserve_original", "false")
|
||||
.build();
|
||||
idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
|
||||
TokenFilterFactory tff = plugin.getTokenFilters().get("multiplexer").get(idxSettings, null, "multiplexer", settings);
|
||||
TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, null, "keyword", settings);
|
||||
SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, null, "synonym", settings);
|
||||
|
||||
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
|
||||
() -> stff.buildSynonymAnalyzer(tok, Collections.emptyList(), Collections.singletonList(tff), null));
|
||||
|
||||
assertEquals("Token filter [multiplexer] cannot be used to parse synonyms unless [preserve_original] is [true]",
|
||||
e.getMessage());
|
||||
|
||||
}
|
||||
|
||||
private void match(String analyzerName, String source, String target) throws IOException {
|
||||
Analyzer analyzer = indexAnalyzers.get(analyzerName).analyzer();
|
||||
|
||||
|
|
|
@ -30,11 +30,14 @@ import org.apache.commons.codec.language.bm.Languages.LanguageSet;
|
|||
import org.apache.commons.codec.language.bm.NameType;
|
||||
import org.apache.commons.codec.language.bm.PhoneticEngine;
|
||||
import org.apache.commons.codec.language.bm.RuleType;
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.phonetic.BeiderMorseFilter;
|
||||
import org.apache.lucene.analysis.phonetic.DaitchMokotoffSoundexFilter;
|
||||
import org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilter;
|
||||
import org.apache.lucene.analysis.phonetic.PhoneticFilter;
|
||||
import org.elasticsearch.Version;
|
||||
import org.elasticsearch.common.logging.DeprecationLogger;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
|
@ -47,6 +50,10 @@ import java.util.List;
|
|||
|
||||
public class PhoneticTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||
|
||||
|
||||
private static final DeprecationLogger DEPRECATION_LOGGER
|
||||
= new DeprecationLogger(LogManager.getLogger(PhoneticTokenFilterFactory.class));
|
||||
|
||||
private final Encoder encoder;
|
||||
private final boolean replace;
|
||||
private int maxcodelength;
|
||||
|
@ -138,4 +145,16 @@ public class PhoneticTokenFilterFactory extends AbstractTokenFilterFactory {
|
|||
}
|
||||
throw new IllegalArgumentException("encoder error");
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenFilterFactory getSynonymFilter() {
|
||||
if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0)) {
|
||||
throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms");
|
||||
}
|
||||
else {
|
||||
DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name()
|
||||
+ "] will not be usable to parse synonyms after v7.0");
|
||||
return this;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -19,9 +19,16 @@
|
|||
|
||||
package org.elasticsearch.index.analysis;
|
||||
|
||||
import org.elasticsearch.Version;
|
||||
import org.elasticsearch.cluster.metadata.IndexMetaData;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
import org.elasticsearch.indices.analysis.AnalysisFactoryTestCase;
|
||||
import org.elasticsearch.plugin.analysis.AnalysisPhoneticPlugin;
|
||||
import org.elasticsearch.test.IndexSettingsModule;
|
||||
import org.elasticsearch.test.VersionUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
|
@ -38,4 +45,33 @@ public class AnalysisPhoneticFactoryTests extends AnalysisFactoryTestCase {
|
|||
filters.put("phonetic", PhoneticTokenFilterFactory.class);
|
||||
return filters;
|
||||
}
|
||||
|
||||
public void testDisallowedWithSynonyms() throws IOException {
|
||||
|
||||
AnalysisPhoneticPlugin plugin = new AnalysisPhoneticPlugin();
|
||||
|
||||
Settings settings = Settings.builder()
|
||||
.put(IndexMetaData.SETTING_VERSION_CREATED, VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, Version.CURRENT))
|
||||
.put("path.home", createTempDir().toString())
|
||||
.build();
|
||||
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
|
||||
|
||||
TokenFilterFactory tff
|
||||
= plugin.getTokenFilters().get("phonetic").get(idxSettings, null, "phonetic", settings);
|
||||
IllegalArgumentException e = expectThrows(IllegalArgumentException.class, tff::getSynonymFilter);
|
||||
assertEquals("Token filter [phonetic] cannot be used to parse synonyms", e.getMessage());
|
||||
|
||||
settings = Settings.builder()
|
||||
.put(IndexMetaData.SETTING_VERSION_CREATED, VersionUtils.randomVersionBetween(random(),
|
||||
Version.V_6_0_0, VersionUtils.getPreviousVersion(Version.V_7_0_0)))
|
||||
.put("path.home", createTempDir().toString())
|
||||
.build();
|
||||
idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
|
||||
|
||||
tff = plugin.getTokenFilters().get("phonetic").get(idxSettings, null, "phonetic", settings);
|
||||
tff.getSynonymFilter();
|
||||
|
||||
assertWarnings("Token filter [phonetic] will not be usable to parse synonyms after v7.0");
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -37,16 +37,26 @@ public final class PreConfiguredTokenFilter extends PreConfiguredAnalysisCompone
|
|||
*/
|
||||
public static PreConfiguredTokenFilter singleton(String name, boolean useFilterForMultitermQueries,
|
||||
Function<TokenStream, TokenStream> create) {
|
||||
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, CachingStrategy.ONE,
|
||||
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, false, CachingStrategy.ONE,
|
||||
(tokenStream, version) -> create.apply(tokenStream));
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a pre-configured token filter that may not vary at all.
|
||||
*/
|
||||
public static PreConfiguredTokenFilter singleton(String name, boolean useFilterForMultitermQueries,
|
||||
boolean useFilterForParsingSynonyms,
|
||||
Function<TokenStream, TokenStream> create) {
|
||||
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, useFilterForParsingSynonyms, CachingStrategy.ONE,
|
||||
(tokenStream, version) -> create.apply(tokenStream));
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a pre-configured token filter that may not vary at all.
|
||||
*/
|
||||
public static PreConfiguredTokenFilter singletonWithVersion(String name, boolean useFilterForMultitermQueries,
|
||||
BiFunction<TokenStream, Version, TokenStream> create) {
|
||||
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, CachingStrategy.ONE,
|
||||
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, false, CachingStrategy.ONE,
|
||||
(tokenStream, version) -> create.apply(tokenStream, version));
|
||||
}
|
||||
|
||||
|
@ -55,7 +65,7 @@ public final class PreConfiguredTokenFilter extends PreConfiguredAnalysisCompone
|
|||
*/
|
||||
public static PreConfiguredTokenFilter luceneVersion(String name, boolean useFilterForMultitermQueries,
|
||||
BiFunction<TokenStream, org.apache.lucene.util.Version, TokenStream> create) {
|
||||
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, CachingStrategy.LUCENE,
|
||||
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, false, CachingStrategy.LUCENE,
|
||||
(tokenStream, version) -> create.apply(tokenStream, version.luceneVersion));
|
||||
}
|
||||
|
||||
|
@ -64,16 +74,18 @@ public final class PreConfiguredTokenFilter extends PreConfiguredAnalysisCompone
|
|||
*/
|
||||
public static PreConfiguredTokenFilter elasticsearchVersion(String name, boolean useFilterForMultitermQueries,
|
||||
BiFunction<TokenStream, org.elasticsearch.Version, TokenStream> create) {
|
||||
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, CachingStrategy.ELASTICSEARCH, create);
|
||||
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, false, CachingStrategy.ELASTICSEARCH, create);
|
||||
}
|
||||
|
||||
private final boolean useFilterForMultitermQueries;
|
||||
private final boolean useFilterForParsingSynonyms;
|
||||
private final BiFunction<TokenStream, Version, TokenStream> create;
|
||||
|
||||
/**
 * @param name                         name passed to the superclass
 * @param useFilterForMultitermQueries stored as-is on the instance
 * @param useFilterForParsingSynonyms  whether {@code getSynonymFilter()} returns the filter itself
 *                                     rather than the identity filter
 * @param cache                        caching strategy passed to the superclass
 * @param create                       builds the filter around a token stream for a given version
 */
private PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries, boolean useFilterForParsingSynonyms,
        PreBuiltCacheFactory.CachingStrategy cache, BiFunction<TokenStream, Version, TokenStream> create) {
    // Single signature line: the duplicated former 4-parameter header has been removed.
    super(name, cache);
    this.useFilterForMultitermQueries = useFilterForMultitermQueries;
    this.useFilterForParsingSynonyms = useFilterForParsingSynonyms;
    this.create = create;
}
|
||||
|
||||
|
@ -104,6 +116,13 @@ public final class PreConfiguredTokenFilter extends PreConfiguredAnalysisCompone
|
|||
return create.apply(tokenStream, version);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenFilterFactory getSynonymFilter() {
|
||||
if (useFilterForParsingSynonyms) {
|
||||
return this;
|
||||
}
|
||||
return IDENTITY_FILTER;
|
||||
}
|
||||
};
|
||||
}
|
||||
return new TokenFilterFactory() {
|
||||
|
@ -116,6 +135,14 @@ public final class PreConfiguredTokenFilter extends PreConfiguredAnalysisCompone
|
|||
public TokenStream create(TokenStream tokenStream) {
|
||||
return create.apply(tokenStream, version);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenFilterFactory getSynonymFilter() {
|
||||
if (useFilterForParsingSynonyms) {
|
||||
return this;
|
||||
}
|
||||
return IDENTITY_FILTER;
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
|
|
@ -19,16 +19,21 @@
|
|||
|
||||
package org.elasticsearch.index.analysis;
|
||||
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
|
||||
import org.apache.lucene.analysis.shingle.ShingleFilter;
|
||||
import org.elasticsearch.Version;
|
||||
import org.elasticsearch.common.logging.DeprecationLogger;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
|
||||
public class ShingleTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||
|
||||
private static final DeprecationLogger DEPRECATION_LOGGER =
|
||||
new DeprecationLogger(LogManager.getLogger(ShingleTokenFilterFactory.class));
|
||||
|
||||
private final Factory factory;
|
||||
|
||||
public ShingleTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
||||
|
@ -54,8 +59,8 @@ public class ShingleTokenFilterFactory extends AbstractTokenFilterFactory {
|
|||
Boolean outputUnigramsIfNoShingles = settings.getAsBoolean("output_unigrams_if_no_shingles", false);
|
||||
String tokenSeparator = settings.get("token_separator", ShingleFilter.DEFAULT_TOKEN_SEPARATOR);
|
||||
String fillerToken = settings.get("filler_token", ShingleFilter.DEFAULT_FILLER_TOKEN);
|
||||
factory = new Factory("shingle", minShingleSize, maxShingleSize, outputUnigrams, outputUnigramsIfNoShingles,
|
||||
tokenSeparator, fillerToken);
|
||||
factory = new Factory("shingle", minShingleSize, maxShingleSize,
|
||||
outputUnigrams, outputUnigramsIfNoShingles, tokenSeparator, fillerToken);
|
||||
}
|
||||
|
||||
|
||||
|
@ -64,6 +69,19 @@ public class ShingleTokenFilterFactory extends AbstractTokenFilterFactory {
|
|||
return factory.create(tokenStream);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenFilterFactory getSynonymFilter() {
|
||||
if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0)) {
|
||||
throw new IllegalArgumentException("Token filter [" + name() +
|
||||
"] cannot be used to parse synonyms");
|
||||
}
|
||||
else {
|
||||
DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter " + name()
|
||||
+ "] will not be usable to parse synonym after v7.0");
|
||||
}
|
||||
return this;
|
||||
|
||||
}
|
||||
|
||||
public Factory getInnerFactory() {
|
||||
return this.factory;
|
||||
|
|
|
@ -67,7 +67,8 @@ public interface TokenFilterFactory {
|
|||
* Return a version of this TokenFilterFactory appropriate for synonym parsing
|
||||
*
|
||||
* Filters that should not be applied to synonyms (for example, those that produce
|
||||
* multiple tokens) can return {@link #IDENTITY_FILTER} to be skipped, or should throw an exception
|
||||
*
|
||||
*/
|
||||
default TokenFilterFactory getSynonymFilter() {
|
||||
return this;
|
||||
|
|
Loading…
Reference in New Issue