Ensure TokenFilters only produce single tokens when parsing synonyms (#34331)

A number of tokenfilters can produce multiple tokens at the same position.  This
is a problem when using token chains to parse synonym files, as the SynonymMap
requires that there are no stacked tokens in its input.

This commit ensures that when used to parse synonyms, these tokenfilters either produce
a single version of their input token, or that they throw an error when mappings are 
generated.  In indexes created in elasticsearch 6.x deprecation warnings are emitted in place 
of the error. 

* asciifolding and cjk_bigram produce only the folded or bigrammed token
* decompounders, synonyms and keyword_repeat are skipped
* n-grams, word-delimiter-filter, multiplexer, fingerprint and phonetic throw errors

Fixes #34298
This commit is contained in:
Alan Woodward 2018-11-29 10:35:38 +00:00 committed by GitHub
parent c63d0af913
commit a646f85a99
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
21 changed files with 529 additions and 17 deletions

View File

@ -175,3 +175,15 @@ PUT /test_index
Using `synonyms_path` to define WordNet synonyms in a file is supported
as well.
=== Parsing synonym files
Elasticsearch will use the token filters preceding the synonym filter
in a tokenizer chain to parse the entries in a synonym file. So, for example, if a
synonym filter is placed after a stemmer, then the stemmer will also be applied
to the synonym entries. Because entries in the synonym map cannot have stacked
positions, some token filters may cause issues here. Token filters that produce
multiple versions of a token may choose which version of the token to emit when
parsing synonyms, e.g. `asciifolding` will only produce the folded version of the
token. Others, e.g. `multiplexer`, `word_delimiter_graph` or `ngram`, will throw an
error.

View File

@ -163,3 +163,16 @@ PUT /test_index
Using `synonyms_path` to define WordNet synonyms in a file is supported
as well.
=== Parsing synonym files
Elasticsearch will use the token filters preceding the synonym filter
in a tokenizer chain to parse the entries in a synonym file. So, for example, if a
synonym filter is placed after a stemmer, then the stemmer will also be applied
to the synonym entries. Because entries in the synonym map cannot have stacked
positions, some token filters may cause issues here. Token filters that produce
multiple versions of a token may choose which version of the token to emit when
parsing synonyms, e.g. `asciifolding` will only produce the folded version of the
token. Others, e.g. `multiplexer`, `word_delimiter_graph` or `ngram`, will throw an
error.

View File

@ -27,6 +27,7 @@ import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.NormalizingTokenFilterFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;
/**
* Factory for ASCIIFoldingFilter.
@ -51,8 +52,28 @@ public class ASCIIFoldingTokenFilterFactory extends AbstractTokenFilterFactory
}
/**
 * Supplies the factory to use in place of this one while synonym rules are being parsed.
 * <p>
 * When {@code preserveOriginal} is off this factory is returned as-is. Otherwise a
 * wrapper is returned that keeps this factory's name but always builds the folding
 * filter with the preserve-original argument set to {@code false}.
 *
 * @return this factory, or a same-named wrapper that never preserves the original token
 */
@Override
public TokenFilterFactory getSynonymFilter() {
    if (preserveOriginal) {
        // See https://issues.apache.org/jira/browse/LUCENE-7536 for the reasoning
        return new TokenFilterFactory() {
            @Override
            public String name() {
                return ASCIIFoldingTokenFilterFactory.this.name();
            }

            @Override
            public TokenStream create(TokenStream tokenStream) {
                return new ASCIIFoldingFilter(tokenStream, false);
            }
        };
    }
    return this;
}
/**
 * Wraps the given stream in an {@link ASCIIFoldingFilter} for normalization.
 *
 * @param tokenStream the stream to fold
 * @return a folding filter over {@code tokenStream} with preserve-original disabled
 */
@Override
public TokenStream normalize(TokenStream tokenStream) {
    // Normalization should only emit a single token, so always turn off preserveOriginal
    return new ASCIIFoldingFilter(tokenStream, false);
}
}

View File

@ -26,6 +26,7 @@ import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.Analysis;
import org.elasticsearch.index.analysis.TokenFilterFactory;
/**
* Contains the common configuration settings between subclasses of this class.
@ -50,4 +51,9 @@ public abstract class AbstractCompoundWordTokenFilterFactory extends AbstractTok
throw new IllegalArgumentException("word_list must be provided for [" + name + "], either as a path to a file, or directly");
}
}
/**
 * Supplies the factory to use in place of this one while synonym rules are being parsed.
 *
 * @return {@code IDENTITY_FILTER}, so that synonym file entries are not decompounded
 */
@Override
public TokenFilterFactory getSynonymFilter() {
    // Decompounding is skipped for the synonym file.
    return IDENTITY_FILTER;
}
}

View File

@ -19,13 +19,17 @@
package org.elasticsearch.analysis.common;
import org.apache.logging.log4j.LogManager;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
import org.elasticsearch.Version;
import org.elasticsearch.common.logging.DeprecationLogger;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import java.util.Arrays;
import java.util.HashSet;
@ -48,6 +52,9 @@ import java.util.Set;
*/
public final class CJKBigramFilterFactory extends AbstractTokenFilterFactory {
private static final DeprecationLogger DEPRECATION_LOGGER
= new DeprecationLogger(LogManager.getLogger(CJKBigramFilterFactory.class));
private final int flags;
private final boolean outputUnigrams;
@ -89,4 +96,18 @@ public final class CJKBigramFilterFactory extends AbstractTokenFilterFactory {
return filter;
}
/**
 * Supplies the factory to use in place of this one while synonym rules are being parsed.
 * <p>
 * With {@code outputUnigrams} off, this factory is returned without any check.
 * With it on, indices created on or after 7.0.0 are rejected, while older indices
 * only get a deprecation warning and still receive this factory.
 *
 * @return this factory
 * @throws IllegalArgumentException if unigrams are output and the index was created on or after 7.0.0
 */
@Override
public TokenFilterFactory getSynonymFilter() {
    if (outputUnigrams == false) {
        return this;
    }
    final boolean rejected = indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0);
    if (rejected) {
        throw new IllegalArgumentException("Token filter [" + name() +
            "] cannot be used to parse synonyms");
    }
    // Older index: warn, but keep using this factory.
    DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name()
        + "] will not be usable to parse synonyms after v7.0");
    return this;
}
}

View File

@ -426,7 +426,7 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin, Scri
filters.add(PreConfiguredTokenFilter.singleton("german_stem", false, GermanStemFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("hindi_normalization", true, HindiNormalizationFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("indic_normalization", true, IndicNormalizationFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("keyword_repeat", false, KeywordRepeatFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("keyword_repeat", false, false, KeywordRepeatFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("kstem", false, KStemFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("length", false, input ->
new LengthFilter(input, 0, Integer.MAX_VALUE))); // TODO this one seems useless

View File

@ -19,18 +19,25 @@
package org.elasticsearch.analysis.common;
import org.apache.logging.log4j.LogManager;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
import org.apache.lucene.analysis.commongrams.CommonGramsQueryFilter;
import org.elasticsearch.Version;
import org.elasticsearch.common.logging.DeprecationLogger;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.Analysis;
import org.elasticsearch.index.analysis.TokenFilterFactory;
public class CommonGramsTokenFilterFactory extends AbstractTokenFilterFactory {
private static final DeprecationLogger DEPRECATION_LOGGER
= new DeprecationLogger(LogManager.getLogger(CommonGramsTokenFilterFactory.class));
private final CharArraySet words;
private final boolean ignoreCase;
@ -58,5 +65,17 @@ public class CommonGramsTokenFilterFactory extends AbstractTokenFilterFactory {
return filter;
}
}
/**
 * Supplies the factory to use in place of this one while synonym rules are being parsed.
 * <p>
 * Indices created on or after 7.0.0 are rejected; older indices get a deprecation
 * warning and continue to use this factory.
 *
 * @return this factory, for indices created before 7.0.0
 * @throws IllegalArgumentException if the index was created on or after 7.0.0
 */
@Override
public TokenFilterFactory getSynonymFilter() {
    final boolean rejected = indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0);
    if (rejected) {
        throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms");
    }
    // Older index: warn, but keep using this factory.
    DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name()
        + "] will not be usable to parse synonyms after v7.0");
    return this;
}
}

View File

@ -19,17 +19,24 @@
package org.elasticsearch.analysis.common;
import org.apache.logging.log4j.LogManager;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.reverse.ReverseStringFilter;
import org.elasticsearch.Version;
import org.elasticsearch.common.logging.DeprecationLogger;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;
public class EdgeNGramTokenFilterFactory extends AbstractTokenFilterFactory {
private static final DeprecationLogger DEPRECATION_LOGGER
= new DeprecationLogger(LogManager.getLogger(EdgeNGramTokenFilterFactory.class));
private final int minGram;
private final int maxGram;
@ -77,4 +84,16 @@ public class EdgeNGramTokenFilterFactory extends AbstractTokenFilterFactory {
public boolean breaksFastVectorHighlighter() {
return true;
}
/**
 * Supplies the factory to use in place of this one while synonym rules are being parsed.
 * <p>
 * Indices created on or after 7.0.0 are rejected; older indices get a deprecation
 * warning and continue to use this factory.
 *
 * @return this factory, for indices created before 7.0.0
 * @throws IllegalArgumentException if the index was created on or after 7.0.0
 */
@Override
public TokenFilterFactory getSynonymFilter() {
    final boolean rejected = indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0);
    if (rejected) {
        throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms");
    }
    // Older index: warn, but keep using this factory.
    DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name()
        + "] will not be usable to parse synonyms after v7.0");
    return this;
}
}

View File

@ -19,18 +19,25 @@
package org.elasticsearch.analysis.common;
import org.apache.logging.log4j.LogManager;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.FingerprintFilter;
import org.elasticsearch.Version;
import org.elasticsearch.common.logging.DeprecationLogger;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import static org.elasticsearch.analysis.common.FingerprintAnalyzerProvider.DEFAULT_MAX_OUTPUT_SIZE;
import static org.elasticsearch.analysis.common.FingerprintAnalyzerProvider.MAX_OUTPUT_SIZE;
public class FingerprintTokenFilterFactory extends AbstractTokenFilterFactory {
private static final DeprecationLogger DEPRECATION_LOGGER
= new DeprecationLogger(LogManager.getLogger(FingerprintTokenFilterFactory.class));
private final char separator;
private final int maxOutputSize;
@ -47,4 +54,16 @@ public class FingerprintTokenFilterFactory extends AbstractTokenFilterFactory {
return result;
}
/**
 * Supplies the factory to use in place of this one while synonym rules are being parsed.
 * <p>
 * Indices created on or after 7.0.0 are rejected; older indices get a deprecation
 * warning and continue to use this factory.
 *
 * @return this factory, for indices created before 7.0.0
 * @throws IllegalArgumentException if the index was created on or after 7.0.0
 */
@Override
public TokenFilterFactory getSynonymFilter() {
    final boolean rejected = indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0);
    if (rejected) {
        throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms");
    }
    // Older index: warn, but keep using this factory.
    DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name()
        + "] will not be usable to parse synonyms after v7.0");
    return this;
}
}

View File

@ -19,12 +19,15 @@
package org.elasticsearch.analysis.common;
import org.apache.logging.log4j.LogManager;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter;
import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.elasticsearch.Version;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.logging.DeprecationLogger;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
@ -40,6 +43,9 @@ import java.util.function.Function;
public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory {
private static final DeprecationLogger DEPRECATION_LOGGER
= new DeprecationLogger(LogManager.getLogger(MultiplexerTokenFilterFactory.class));
private List<String> filterNames;
private final boolean preserveOriginal;
@ -54,6 +60,22 @@ public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory {
throw new UnsupportedOperationException("TokenFilterFactory.getChainAwareTokenFilterFactory() must be called first");
}
/**
 * Supplies the factory to use in place of this one while synonym rules are being parsed.
 * <p>
 * Indices created on or after 7.0.0 are always rejected. For older indices the
 * filter is replaced by {@code IDENTITY_FILTER} (with a deprecation warning) when
 * {@code preserveOriginal} is set, and rejected otherwise.
 *
 * @return {@code IDENTITY_FILTER}, for pre-7.0.0 indices with {@code preserveOriginal} enabled
 * @throws IllegalArgumentException if the index was created on or after 7.0.0, or
 *                                  if {@code preserveOriginal} is disabled
 */
@Override
public TokenFilterFactory getSynonymFilter() {
    final boolean rejected = indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0);
    if (rejected) {
        throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms");
    }
    if (preserveOriginal == false) {
        throw new IllegalArgumentException("Token filter [" + name()
            + "] cannot be used to parse synonyms unless [preserve_original] is [true]");
    }
    // Older index that keeps the original token: warn and substitute the identity filter.
    DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name()
        + "] will not be usable to parse synonyms after v7.0");
    return IDENTITY_FILTER;
}
@Override
public TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
List<TokenFilterFactory> previousTokenFilters,
@ -98,7 +120,18 @@ public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory {
/**
 * Supplies the factory to use in place of this one while synonym rules are being parsed.
 * <p>
 * Indices created on or after 7.0.0 are always rejected. For older indices the
 * filter is replaced by {@code IDENTITY_FILTER} (with a deprecation warning) when
 * {@code preserveOriginal} is set, and rejected otherwise.
 *
 * @return {@code IDENTITY_FILTER}, for pre-7.0.0 indices with {@code preserveOriginal} enabled
 * @throws IllegalArgumentException if the index was created on or after 7.0.0, or
 *                                  if {@code preserveOriginal} is disabled
 */
@Override
public TokenFilterFactory getSynonymFilter() {
    if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0)) {
        throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms");
    }
    else {
        if (preserveOriginal) {
            // Older index that keeps the original token: warn and substitute the identity filter.
            DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name()
                + "] will not be usable to parse synonyms after v7.0");
            return IDENTITY_FILTER;
        }
        throw new IllegalArgumentException("Token filter [" + name()
            + "] cannot be used to parse synonyms unless [preserve_original] is [true]");
    }
}
};
}

View File

@ -19,23 +19,27 @@
package org.elasticsearch.analysis.common;
import org.apache.logging.log4j.LogManager;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
import org.elasticsearch.common.logging.DeprecationLogger;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.Version;
import org.elasticsearch.index.analysis.TokenFilterFactory;
public class NGramTokenFilterFactory extends AbstractTokenFilterFactory {
private static final DeprecationLogger DEPRECATION_LOGGER
= new DeprecationLogger(LogManager.getLogger(NGramTokenFilterFactory.class));
private final int minGram;
private final int maxGram;
NGramTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings);
int maxAllowedNgramDiff = indexSettings.getMaxNgramDiff();
@ -60,4 +64,16 @@ public class NGramTokenFilterFactory extends AbstractTokenFilterFactory {
// TODO: Expose preserveOriginal
return new NGramTokenFilter(tokenStream, minGram, maxGram, false);
}
/**
 * Supplies the factory to use in place of this one while synonym rules are being parsed.
 * <p>
 * Indices created on or after 7.0.0 are rejected; older indices get a deprecation
 * warning and continue to use this factory.
 *
 * @return this factory, for indices created before 7.0.0
 * @throws IllegalArgumentException if the index was created on or after 7.0.0
 */
@Override
public TokenFilterFactory getSynonymFilter() {
    final boolean rejected = indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0);
    if (rejected) {
        throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms");
    }
    // Older index: warn, but keep using this factory.
    DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name()
        + "] will not be usable to parse synonyms after v7.0");
    return this;
}
}

View File

@ -49,7 +49,7 @@ public class SynonymGraphTokenFilterFactory extends SynonymTokenFilterFactory {
public TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
List<TokenFilterFactory> previousTokenFilters,
Function<String, TokenFilterFactory> allFilters) {
final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters);
final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters, allFilters);
final SynonymMap synonyms = buildSynonyms(analyzer, getRulesFromSettings(environment));
final String name = name();
return new TokenFilterFactory() {

View File

@ -19,10 +19,12 @@
package org.elasticsearch.analysis.common;
import org.apache.logging.log4j.LogManager;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.elasticsearch.common.logging.DeprecationLogger;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
@ -40,6 +42,9 @@ import java.util.function.Function;
public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
private static final DeprecationLogger DEPRECATION_LOGGER
= new DeprecationLogger(LogManager.getLogger(SynonymTokenFilterFactory.class));
private final String format;
private final boolean expand;
private final boolean lenient;
@ -52,7 +57,7 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
this.settings = settings;
if (settings.get("ignore_case") != null) {
deprecationLogger.deprecated(
DEPRECATION_LOGGER.deprecated(
"The ignore_case option on the synonym_graph filter is deprecated. " +
"Instead, insert a lowercase filter in the filter chain before the synonym_graph filter.");
}
@ -72,7 +77,7 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
public TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
List<TokenFilterFactory> previousTokenFilters,
Function<String, TokenFilterFactory> allFilters) {
final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters);
final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters, allFilters);
final SynonymMap synonyms = buildSynonyms(analyzer, getRulesFromSettings(environment));
final String name = name();
return new TokenFilterFactory() {
@ -85,11 +90,19 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
public TokenStream create(TokenStream tokenStream) {
return synonyms.fst == null ? tokenStream : new SynonymFilter(tokenStream, synonyms, false);
}
/**
 * Supplies the factory to use in place of this one while synonym rules are being parsed.
 * <p>
 * The synonym map does not support stacked input tokens, so synonyms must not be
 * applied to the synonym map itself. Returning the identity filter here is what
 * allows synonym filters to be chained.
 *
 * @return {@code IDENTITY_FILTER}
 */
@Override
public TokenFilterFactory getSynonymFilter() {
    return IDENTITY_FILTER;
}
};
}
Analyzer buildSynonymAnalyzer(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
List<TokenFilterFactory> tokenFilters) {
List<TokenFilterFactory> tokenFilters, Function<String, TokenFilterFactory> allFilters) {
return new CustomAnalyzer("synonyms", tokenizer, charFilters.toArray(new CharFilterFactory[0]),
tokenFilters.stream()
.map(TokenFilterFactory::getSynonymFilter)

View File

@ -19,15 +19,19 @@
package org.elasticsearch.analysis.common;
import org.apache.logging.log4j.LogManager;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator;
import org.elasticsearch.Version;
import org.elasticsearch.common.logging.DeprecationLogger;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.Analysis;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import java.util.List;
import java.util.Set;
@ -45,6 +49,9 @@ import static org.elasticsearch.analysis.common.WordDelimiterTokenFilterFactory.
public class WordDelimiterGraphTokenFilterFactory extends AbstractTokenFilterFactory {
private static final DeprecationLogger DEPRECATION_LOGGER =
new DeprecationLogger(LogManager.getLogger(WordDelimiterGraphTokenFilterFactory.class));
private final byte[] charTypeTable;
private final int flags;
private final CharArraySet protoWords;
@ -95,6 +102,18 @@ public class WordDelimiterGraphTokenFilterFactory extends AbstractTokenFilterFac
return new WordDelimiterGraphFilter(tokenStream, charTypeTable, flags, protoWords);
}
/**
 * Supplies the factory to use in place of this one while synonym rules are being parsed.
 * <p>
 * Indices created on or after 7.0.0 are rejected; older indices get a deprecation
 * warning and continue to use this factory.
 *
 * @return this factory, for indices created before 7.0.0
 * @throws IllegalArgumentException if the index was created on or after 7.0.0
 */
@Override
public TokenFilterFactory getSynonymFilter() {
    final boolean rejected = indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0);
    if (rejected) {
        throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms");
    }
    // Older index: warn, but keep using this factory.
    DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name()
        + "] will not be usable to parse synonyms after v7.0");
    return this;
}
private int getFlag(int flag, Settings settings, String key, boolean defaultValue) {
if (settings.getAsBoolean(key, defaultValue)) {
return flag;

View File

@ -19,15 +19,19 @@
package org.elasticsearch.analysis.common;
import org.apache.logging.log4j.LogManager;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator;
import org.elasticsearch.Version;
import org.elasticsearch.common.logging.DeprecationLogger;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.Analysis;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import java.util.Collection;
import java.util.List;
@ -50,6 +54,9 @@ import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.STEM_
public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory {
private static final DeprecationLogger DEPRECATION_LOGGER =
new DeprecationLogger(LogManager.getLogger(WordDelimiterTokenFilterFactory.class));
private final byte[] charTypeTable;
private final int flags;
private final CharArraySet protoWords;
@ -103,6 +110,18 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory
protoWords);
}
/**
 * Supplies the factory to use in place of this one while synonym rules are being parsed.
 * <p>
 * Indices created on or after 7.0.0 are rejected; older indices get a deprecation
 * warning and continue to use this factory.
 *
 * @return this factory, for indices created before 7.0.0
 * @throws IllegalArgumentException if the index was created on or after 7.0.0
 */
@Override
public TokenFilterFactory getSynonymFilter() {
    final boolean rejected = indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0);
    if (rejected) {
        throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms");
    }
    // Older index: warn, but keep using this factory.
    DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name()
        + "] will not be usable to parse synonyms after v7.0");
    return this;
}
public int getFlag(int flag, Settings settings, String key, boolean defaultValue) {
if (settings.getAsBoolean(key, defaultValue)) {
return flag;

View File

@ -22,6 +22,7 @@ package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetaData;
@ -29,14 +30,20 @@ import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.IndexAnalyzers;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.index.analysis.TokenizerFactory;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.IndexSettingsModule;
import org.elasticsearch.test.VersionUtils;
import org.hamcrest.MatcherAssert;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.instanceOf;
@ -118,7 +125,7 @@ public class SynonymsAnalysisTests extends ESTestCase {
}
}
public void testSynonymsWithMultiplexer() throws IOException {
public void testSynonymsWrappedByMultiplexer() throws IOException {
Settings settings = Settings.builder()
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
.put("path.home", createTempDir().toString())
@ -139,6 +146,180 @@ public class SynonymsAnalysisTests extends ESTestCase {
new int[]{ 1, 1, 0, 0, 1, 1 });
}
/**
 * Builds an analyzer whose filter chain is lowercase, asciifolding, synonyms and
 * checks that "høj" analyzes to "hoj" with "height" stacked at the same position.
 */
public void testAsciiFoldingFilterForSynonyms() throws IOException {
    final Settings.Builder builder = Settings.builder();
    builder.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT);
    builder.put("path.home", createTempDir().toString());
    builder.put("index.analysis.filter.synonyms.type", "synonym");
    builder.putList("index.analysis.filter.synonyms.synonyms", "hoj, height");
    builder.put("index.analysis.analyzer.synonymAnalyzer.tokenizer", "standard");
    builder.putList("index.analysis.analyzer.synonymAnalyzer.filter", "lowercase", "asciifolding", "synonyms");
    final Settings settings = builder.build();

    final IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
    indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers;

    final String[] expectedTerms = { "hoj", "height" };
    final int[] expectedPosIncrements = { 1, 0 };
    BaseTokenStreamTestCase.assertAnalyzesTo(indexAnalyzers.get("synonymAnalyzer"), "høj",
        expectedTerms, expectedPosIncrements);
}
/**
 * Builds an analyzer whose filter chain is lowercase, keyword_repeat, a porter2
 * stemmer, synonyms and checks that "programmers" analyzes to "programmers",
 * "programm" and "develop", all at the same position.
 */
public void testKeywordRepeatAndSynonyms() throws IOException {
    final Settings.Builder builder = Settings.builder();
    builder.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT);
    builder.put("path.home", createTempDir().toString());
    builder.put("index.analysis.filter.synonyms.type", "synonym");
    builder.putList("index.analysis.filter.synonyms.synonyms", "programmer, developer");
    builder.put("index.analysis.filter.my_english.type", "stemmer");
    builder.put("index.analysis.filter.my_english.language", "porter2");
    builder.put("index.analysis.analyzer.synonymAnalyzer.tokenizer", "standard");
    builder.putList("index.analysis.analyzer.synonymAnalyzer.filter", "lowercase", "keyword_repeat", "my_english", "synonyms");
    final Settings settings = builder.build();

    final IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
    indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers;

    final String[] expectedTerms = { "programmers", "programm", "develop" };
    final int[] expectedPosIncrements = { 1, 0, 0 };
    BaseTokenStreamTestCase.assertAnalyzesTo(indexAnalyzers.get("synonymAnalyzer"), "programmers",
        expectedTerms, expectedPosIncrements);
}
/**
 * Builds an analyzer with two synonym filters in sequence and checks that "term1"
 * analyzes to "term1", "term3" and "term2", all at the same position.
 */
public void testChainedSynonymFilters() throws IOException {
    final Settings.Builder builder = Settings.builder();
    builder.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT);
    builder.put("path.home", createTempDir().toString());
    builder.put("index.analysis.filter.synonyms1.type", "synonym");
    builder.putList("index.analysis.filter.synonyms1.synonyms", "term1, term2");
    builder.put("index.analysis.filter.synonyms2.type", "synonym");
    builder.putList("index.analysis.filter.synonyms2.synonyms", "term1, term3");
    builder.put("index.analysis.analyzer.syn.tokenizer", "standard");
    builder.putList("index.analysis.analyzer.syn.filter", "lowercase", "synonyms1", "synonyms2");
    final Settings settings = builder.build();

    final IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
    indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers;

    final String[] expectedTerms = { "term1", "term3", "term2" };
    final int[] expectedPosIncrements = { 1, 0, 0 };
    BaseTokenStreamTestCase.assertAnalyzesTo(indexAnalyzers.get("syn"), "term1",
        expectedTerms, expectedPosIncrements);
}
/**
 * Checks that, for an index created on a version between 7.0.0 and the current
 * one, building an analyzer with a shingle filter ahead of a synonym filter
 * fails with an {@link IllegalArgumentException}.
 */
public void testShingleFilters() {
    final Settings.Builder builder = Settings.builder();
    builder.put(IndexMetaData.SETTING_VERSION_CREATED,
        VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, Version.CURRENT));
    builder.put("path.home", createTempDir().toString());
    builder.put("index.analysis.filter.synonyms.type", "synonym");
    builder.putList("index.analysis.filter.synonyms.synonyms", "programmer, developer");
    builder.put("index.analysis.filter.my_shingle.type", "shingle");
    builder.put("index.analysis.analyzer.my_analyzer.tokenizer", "standard");
    builder.putList("index.analysis.analyzer.my_analyzer.filter", "my_shingle", "synonyms");
    final Settings settings = builder.build();

    final IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);

    expectThrows(IllegalArgumentException.class, () -> {
        indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers;
    });
}
/**
 * For each listed factory, builds the synonym-parsing analyzer over a keyword
 * tokenizer with that factory as the only preceding filter, and checks that the
 * resulting token stream is the bare {@link KeywordTokenizer}.
 */
public void testTokenFiltersBypassSynonymAnalysis() throws IOException {
    final Settings.Builder builder = Settings.builder();
    builder.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT);
    builder.put("path.home", createTempDir().toString());
    builder.putList("word_list", "a");
    builder.put("hyphenation_patterns_path", "foo");
    final Settings settings = builder.build();
    final IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);

    final String[] bypassingFactories = { "dictionary_decompounder" };

    final CommonAnalysisPlugin plugin = new CommonAnalysisPlugin();
    for (String factoryName : bypassingFactories) {
        final TokenFilterFactory filterFactory
            = plugin.getTokenFilters().get(factoryName).get(idxSettings, null, factoryName, settings);
        final TokenizerFactory tokenizerFactory = new KeywordTokenizerFactory(idxSettings, null, "keyword", settings);
        final SynonymTokenFilterFactory synonymFactory = new SynonymTokenFilterFactory(idxSettings, null, "synonym", settings);
        final Analyzer analyzer = synonymFactory.buildSynonymAnalyzer(
            tokenizerFactory, Collections.emptyList(), Collections.singletonList(filterFactory), null);

        try (TokenStream stream = analyzer.tokenStream("field", "text")) {
            assertThat(stream, instanceOf(KeywordTokenizer.class));
        }
    }
}
/**
 * Exercises the listed multi-token filter factories as the sole filter ahead of a synonym filter.
 * <ol>
 *   <li>On an index version between 7.0.0 and current, building the synonym
 *       analyzer must throw with the "cannot be used to parse synonyms" message.</li>
 *   <li>On an index version from 6.0.0 up to the one before 7.0.0, building must
 *       succeed and emit one deprecation warning per factory.</li>
 *   <li>On such an older index, a multiplexer with {@code preserve_original}
 *       set to false must still throw, with its dedicated message.</li>
 * </ol>
 */
public void testDisallowedTokenFilters() throws IOException {
    // Phase 1: index versions that reject the filters outright.
    final Settings rejectingSettings = Settings.builder()
        .put(IndexMetaData.SETTING_VERSION_CREATED,
            VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, Version.CURRENT))
        .put("path.home", createTempDir().toString())
        .putList("common_words", "a", "b")
        .put("output_unigrams", "true")
        .build();
    final IndexSettings rejectingIndexSettings = IndexSettingsModule.newIndexSettings("index", rejectingSettings);

    final CommonAnalysisPlugin plugin = new CommonAnalysisPlugin();

    final String[] disallowedFactories = {
        "multiplexer", "cjk_bigram", "common_grams", "ngram", "edge_ngram",
        "word_delimiter", "word_delimiter_graph", "fingerprint"
    };

    for (String factory : disallowedFactories) {
        final TokenFilterFactory filterFactory
            = plugin.getTokenFilters().get(factory).get(rejectingIndexSettings, null, factory, rejectingSettings);
        final TokenizerFactory tokenizerFactory
            = new KeywordTokenizerFactory(rejectingIndexSettings, null, "keyword", rejectingSettings);
        final SynonymTokenFilterFactory synonymFactory
            = new SynonymTokenFilterFactory(rejectingIndexSettings, null, "synonym", rejectingSettings);

        final IllegalArgumentException error = expectThrows(IllegalArgumentException.class,
            "Expected IllegalArgumentException for factory " + factory,
            () -> synonymFactory.buildSynonymAnalyzer(
                tokenizerFactory, Collections.emptyList(), Collections.singletonList(filterFactory), null));

        assertEquals(factory, "Token filter [" + factory
                + "] cannot be used to parse synonyms",
            error.getMessage());
    }

    // Phase 2: older index versions only warn.
    final Settings warningSettings = Settings.builder()
        .put(IndexMetaData.SETTING_VERSION_CREATED,
            VersionUtils.randomVersionBetween(random(), Version.V_6_0_0, VersionUtils.getPreviousVersion(Version.V_7_0_0)))
        .put("path.home", createTempDir().toString())
        .putList("common_words", "a", "b")
        .put("output_unigrams", "true")
        .build();
    final IndexSettings warningIndexSettings = IndexSettingsModule.newIndexSettings("index", warningSettings);

    final List<String> expectedWarnings = new ArrayList<>();
    for (String factory : disallowedFactories) {
        final TokenFilterFactory filterFactory
            = plugin.getTokenFilters().get(factory).get(warningIndexSettings, null, factory, warningSettings);
        final TokenizerFactory tokenizerFactory
            = new KeywordTokenizerFactory(warningIndexSettings, null, "keyword", warningSettings);
        final SynonymTokenFilterFactory synonymFactory
            = new SynonymTokenFilterFactory(warningIndexSettings, null, "synonym", warningSettings);

        synonymFactory.buildSynonymAnalyzer(
            tokenizerFactory, Collections.emptyList(), Collections.singletonList(filterFactory), null);

        expectedWarnings.add("Token filter [" + factory
            + "] will not be usable to parse synonyms after v7.0");
    }

    assertWarnings(expectedWarnings.toArray(new String[0]));

    // Phase 3: a multiplexer that drops the original token is rejected even on older indices.
    final Settings multiplexerSettings = Settings.builder()
        .put(IndexMetaData.SETTING_VERSION_CREATED,
            VersionUtils.randomVersionBetween(random(), Version.V_6_0_0, VersionUtils.getPreviousVersion(Version.V_7_0_0)))
        .put("path.home", createTempDir().toString())
        .put("preserve_original", "false")
        .build();
    final IndexSettings multiplexerIndexSettings = IndexSettingsModule.newIndexSettings("index", multiplexerSettings);

    final TokenFilterFactory multiplexer
        = plugin.getTokenFilters().get("multiplexer").get(multiplexerIndexSettings, null, "multiplexer", multiplexerSettings);
    final TokenizerFactory keywordTokenizer
        = new KeywordTokenizerFactory(multiplexerIndexSettings, null, "keyword", multiplexerSettings);
    final SynonymTokenFilterFactory synonymFilter
        = new SynonymTokenFilterFactory(multiplexerIndexSettings, null, "synonym", multiplexerSettings);

    final IllegalArgumentException multiplexerError = expectThrows(IllegalArgumentException.class,
        () -> synonymFilter.buildSynonymAnalyzer(
            keywordTokenizer, Collections.emptyList(), Collections.singletonList(multiplexer), null));

    assertEquals("Token filter [multiplexer] cannot be used to parse synonyms unless [preserve_original] is [true]",
        multiplexerError.getMessage());
}
private void match(String analyzerName, String source, String target) throws IOException {
Analyzer analyzer = indexAnalyzers.get(analyzerName).analyzer();

View File

@ -30,11 +30,14 @@ import org.apache.commons.codec.language.bm.Languages.LanguageSet;
import org.apache.commons.codec.language.bm.NameType;
import org.apache.commons.codec.language.bm.PhoneticEngine;
import org.apache.commons.codec.language.bm.RuleType;
import org.apache.logging.log4j.LogManager;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.phonetic.BeiderMorseFilter;
import org.apache.lucene.analysis.phonetic.DaitchMokotoffSoundexFilter;
import org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilter;
import org.apache.lucene.analysis.phonetic.PhoneticFilter;
import org.elasticsearch.Version;
import org.elasticsearch.common.logging.DeprecationLogger;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
@ -47,6 +50,10 @@ import java.util.List;
public class PhoneticTokenFilterFactory extends AbstractTokenFilterFactory {
private static final DeprecationLogger DEPRECATION_LOGGER
= new DeprecationLogger(LogManager.getLogger(PhoneticTokenFilterFactory.class));
private final Encoder encoder;
private final boolean replace;
private int maxcodelength;
@ -138,4 +145,16 @@ public class PhoneticTokenFilterFactory extends AbstractTokenFilterFactory {
}
throw new IllegalArgumentException("encoder error");
}
/**
 * Returns the factory to use when this filter precedes a synonym filter and the
 * chain is used to parse synonym rules.
 *
 * Indices created on or after 7.0 refuse outright; older indices only receive a
 * deprecation warning and continue to use this factory unchanged.
 *
 * @return this factory, for indices created before 7.0
 * @throws IllegalArgumentException if the index was created on or after 7.0
 */
@Override
public TokenFilterFactory getSynonymFilter() {
    final boolean rejectsSynonymParsing = indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0);
    if (rejectsSynonymParsing == false) {
        // Legacy index: warn, but keep the existing behaviour.
        DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters",
            "Token filter [" + name() + "] will not be usable to parse synonyms after v7.0");
        return this;
    }
    throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms");
}
}

View File

@ -19,9 +19,16 @@
package org.elasticsearch.index.analysis;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.indices.analysis.AnalysisFactoryTestCase;
import org.elasticsearch.plugin.analysis.AnalysisPhoneticPlugin;
import org.elasticsearch.test.IndexSettingsModule;
import org.elasticsearch.test.VersionUtils;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
@ -38,4 +45,33 @@ public class AnalysisPhoneticFactoryTests extends AnalysisFactoryTestCase {
filters.put("phonetic", PhoneticTokenFilterFactory.class);
return filters;
}
/**
 * Exercises both sides of the version gate on {@code getSynonymFilter()} for the
 * {@code phonetic} token filter: a hard failure for indices created on 7.0 or later,
 * and a deprecation warning for indices created on 6.x.
 */
public void testDisallowedWithSynonyms() throws IOException {
    final AnalysisPhoneticPlugin phoneticPlugin = new AnalysisPhoneticPlugin();

    // Index created on 7.0 or later: requesting the synonym filter must throw.
    final Version v7OrLater = VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, Version.CURRENT);
    final Settings strictSettings = Settings.builder()
        .put(IndexMetaData.SETTING_VERSION_CREATED, v7OrLater)
        .put("path.home", createTempDir().toString())
        .build();
    final IndexSettings strictIndexSettings = IndexSettingsModule.newIndexSettings("index", strictSettings);
    final TokenFilterFactory strictFilter =
        phoneticPlugin.getTokenFilters().get("phonetic").get(strictIndexSettings, null, "phonetic", strictSettings);
    final IllegalArgumentException rejection =
        expectThrows(IllegalArgumentException.class, strictFilter::getSynonymFilter);
    assertEquals("Token filter [phonetic] cannot be used to parse synonyms", rejection.getMessage());

    // Index created on 6.x: the call succeeds but must emit the deprecation warning.
    final Version sixDotX = VersionUtils.randomVersionBetween(random(),
        Version.V_6_0_0, VersionUtils.getPreviousVersion(Version.V_7_0_0));
    final Settings legacySettings = Settings.builder()
        .put(IndexMetaData.SETTING_VERSION_CREATED, sixDotX)
        .put("path.home", createTempDir().toString())
        .build();
    final IndexSettings legacyIndexSettings = IndexSettingsModule.newIndexSettings("index", legacySettings);
    final TokenFilterFactory legacyFilter =
        phoneticPlugin.getTokenFilters().get("phonetic").get(legacyIndexSettings, null, "phonetic", legacySettings);
    legacyFilter.getSynonymFilter();
    assertWarnings("Token filter [phonetic] will not be usable to parse synonyms after v7.0");
}
}

View File

@ -37,16 +37,26 @@ public final class PreConfiguredTokenFilter extends PreConfiguredAnalysisCompone
*/
public static PreConfiguredTokenFilter singleton(String name, boolean useFilterForMultitermQueries,
Function<TokenStream, TokenStream> create) {
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, CachingStrategy.ONE,
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, false, CachingStrategy.ONE,
(tokenStream, version) -> create.apply(tokenStream));
}
/**
 * Create a pre-configured token filter that may not vary at all, stating explicitly
 * whether the filter is kept when the chain is used to parse synonyms.
 *
 * @param name the name of the pre-configured filter
 * @param useFilterForMultitermQueries forwarded unchanged to the constructor
 * @param useFilterForParsingSynonyms when {@code false}, factories built from this
 *        component answer {@code getSynonymFilter()} with {@code IDENTITY_FILTER}
 *        instead of themselves
 * @param create builds the filter around an incoming token stream
 */
public static PreConfiguredTokenFilter singleton(String name, boolean useFilterForMultitermQueries,
                                                 boolean useFilterForParsingSynonyms,
                                                 Function<TokenStream, TokenStream> create) {
    // The constructor takes a version-aware function; this variant ignores the version.
    final BiFunction<TokenStream, Version, TokenStream> versionIgnoringCreate =
        (tokenStream, version) -> create.apply(tokenStream);
    return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, useFilterForParsingSynonyms,
        CachingStrategy.ONE, versionIgnoringCreate);
}
/**
* Create a pre-configured token filter that may not vary at all, but whose
* {@code create} function also receives the {@link Version} argument.
*/
public static PreConfiguredTokenFilter singletonWithVersion(String name, boolean useFilterForMultitermQueries,
BiFunction<TokenStream, Version, TokenStream> create) {
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, CachingStrategy.ONE,
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, false, CachingStrategy.ONE,
(tokenStream, version) -> create.apply(tokenStream, version));
}
@ -55,7 +65,7 @@ public final class PreConfiguredTokenFilter extends PreConfiguredAnalysisCompone
*/
public static PreConfiguredTokenFilter luceneVersion(String name, boolean useFilterForMultitermQueries,
BiFunction<TokenStream, org.apache.lucene.util.Version, TokenStream> create) {
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, CachingStrategy.LUCENE,
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, false, CachingStrategy.LUCENE,
(tokenStream, version) -> create.apply(tokenStream, version.luceneVersion));
}
@ -64,16 +74,18 @@ public final class PreConfiguredTokenFilter extends PreConfiguredAnalysisCompone
*/
public static PreConfiguredTokenFilter elasticsearchVersion(String name, boolean useFilterForMultitermQueries,
BiFunction<TokenStream, org.elasticsearch.Version, TokenStream> create) {
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, CachingStrategy.ELASTICSEARCH, create);
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, false, CachingStrategy.ELASTICSEARCH, create);
}
private final boolean useFilterForMultitermQueries;
private final boolean useFilterForParsingSynonyms;
private final BiFunction<TokenStream, Version, TokenStream> create;
private PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries,
private PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries, boolean useFilterForParsingSynonyms,
PreBuiltCacheFactory.CachingStrategy cache, BiFunction<TokenStream, Version, TokenStream> create) {
super(name, cache);
this.useFilterForMultitermQueries = useFilterForMultitermQueries;
this.useFilterForParsingSynonyms = useFilterForParsingSynonyms;
this.create = create;
}
@ -104,6 +116,13 @@ public final class PreConfiguredTokenFilter extends PreConfiguredAnalysisCompone
return create.apply(tokenStream, version);
}
/**
 * Returns this factory for synonym parsing only when the pre-configured component
 * opted in; otherwise the identity filter is used in its place.
 */
@Override
public TokenFilterFactory getSynonymFilter() {
    return useFilterForParsingSynonyms ? this : IDENTITY_FILTER;
}
};
}
return new TokenFilterFactory() {
@ -116,6 +135,14 @@ public final class PreConfiguredTokenFilter extends PreConfiguredAnalysisCompone
public TokenStream create(TokenStream tokenStream) {
return create.apply(tokenStream, version);
}
/**
 * Selects the factory applied while parsing synonyms: this one if
 * {@code useFilterForParsingSynonyms} is set, the identity filter if not.
 */
@Override
public TokenFilterFactory getSynonymFilter() {
    return useFilterForParsingSynonyms ? this : IDENTITY_FILTER;
}
};
}
}

View File

@ -19,16 +19,21 @@
package org.elasticsearch.index.analysis;
import org.apache.logging.log4j.LogManager;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.elasticsearch.Version;
import org.elasticsearch.common.logging.DeprecationLogger;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
public class ShingleTokenFilterFactory extends AbstractTokenFilterFactory {
private static final DeprecationLogger DEPRECATION_LOGGER =
new DeprecationLogger(LogManager.getLogger(ShingleTokenFilterFactory.class));
private final Factory factory;
public ShingleTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
@ -54,8 +59,8 @@ public class ShingleTokenFilterFactory extends AbstractTokenFilterFactory {
Boolean outputUnigramsIfNoShingles = settings.getAsBoolean("output_unigrams_if_no_shingles", false);
String tokenSeparator = settings.get("token_separator", ShingleFilter.DEFAULT_TOKEN_SEPARATOR);
String fillerToken = settings.get("filler_token", ShingleFilter.DEFAULT_FILLER_TOKEN);
factory = new Factory("shingle", minShingleSize, maxShingleSize, outputUnigrams, outputUnigramsIfNoShingles,
tokenSeparator, fillerToken);
factory = new Factory("shingle", minShingleSize, maxShingleSize,
outputUnigrams, outputUnigramsIfNoShingles, tokenSeparator, fillerToken);
}
@ -64,6 +69,19 @@ public class ShingleTokenFilterFactory extends AbstractTokenFilterFactory {
return factory.create(tokenStream);
}
/**
 * Returns the factory to use when this filter precedes a synonym filter and the
 * chain is used to parse synonym rules.
 *
 * Indices created on or after 7.0 refuse outright; older indices only receive a
 * deprecation warning and continue to use this factory unchanged.
 *
 * @return this factory, for indices created before 7.0
 * @throws IllegalArgumentException if the index was created on or after 7.0
 */
@Override
public TokenFilterFactory getSynonymFilter() {
    if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0)) {
        throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms");
    }
    // Warning text fixed: the opening '[' was missing and it read "synonym" rather than
    // "synonyms", so it did not match the format used by the other token filters.
    DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name()
        + "] will not be usable to parse synonyms after v7.0");
    return this;
}
public Factory getInnerFactory() {
return this.factory;

View File

@ -67,7 +67,8 @@ public interface TokenFilterFactory {
* Return a version of this TokenFilterFactory appropriate for synonym parsing
*
* Filters that should not be applied to synonyms (for example, those that produce
* multiple tokens) can return {@link #IDENTITY_FILTER}
* multiple tokens) should throw an exception
*
*/
default TokenFilterFactory getSynonymFilter() {
return this;