From 62d19695956692a95d2777f87de01b191da30a56 Mon Sep 17 00:00:00 2001 From: Jun Ohtani Date: Tue, 20 Jun 2017 21:50:33 +0900 Subject: [PATCH] Parse synonyms with the same analysis chain (#8049) * [Analysis] Parse synonyms with the same analysis chain Synonym Token Filter / Synonym Graph Filter tokenize synonyms with whatever tokenizer and token filters appear before it in the chain. Close #7199 --- .../analyze/TransportAnalyzeAction.java | 69 ++++---- .../index/analysis/AnalysisRegistry.java | 4 +- .../analysis/CustomAnalyzerProvider.java | 46 +++++- .../SynonymGraphTokenFilterFactory.java | 48 +++++- .../analysis/SynonymTokenFilterFactory.java | 148 +++++++++++++----- .../synonyms/SynonymsAnalysisTests.java | 52 ++++++ .../indices/analyze/AnalyzeActionIT.java | 2 +- .../index/analysis/synonyms/synonyms.json | 45 +++++- .../synonym-graph-tokenfilter.asciidoc | 9 +- .../tokenfilters/synonym-tokenfilter.asciidoc | 12 +- .../migration/migrate_6_0/mappings.asciidoc | 11 ++ .../test/indices.analyze/10_synonyms.yml | 35 +++++ .../test/indices.analyze/10_analyze.yml | 35 ++++- 13 files changed, 424 insertions(+), 92 deletions(-) create mode 100644 modules/analysis-common/src/test/resources/rest-api-spec/test/indices.analyze/10_synonyms.yml diff --git a/core/src/main/java/org/elasticsearch/action/admin/indices/analyze/TransportAnalyzeAction.java b/core/src/main/java/org/elasticsearch/action/admin/indices/analyze/TransportAnalyzeAction.java index 11566378085..b7da50139bb 100644 --- a/core/src/main/java/org/elasticsearch/action/admin/indices/analyze/TransportAnalyzeAction.java +++ b/core/src/main/java/org/elasticsearch/action/admin/indices/analyze/TransportAnalyzeAction.java @@ -49,6 +49,7 @@ import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.analysis.AnalysisRegistry; import org.elasticsearch.index.analysis.CharFilterFactory; import org.elasticsearch.index.analysis.CustomAnalyzer; +import org.elasticsearch.index.analysis.CustomAnalyzerProvider; 
import org.elasticsearch.index.analysis.IndexAnalyzers; import org.elasticsearch.index.analysis.NamedAnalyzer; import org.elasticsearch.index.analysis.TokenFilterFactory; @@ -183,13 +184,14 @@ public class TransportAnalyzeAction extends TransportSingleShardAction tokenizerFactory = parseTokenizerFactory(request, indexAnalyzers, analysisRegistry, environment); - TokenFilterFactory[] tokenFilterFactories = new TokenFilterFactory[0]; - tokenFilterFactories = getTokenFilterFactories(request, indexSettings, analysisRegistry, environment, tokenFilterFactories); + List charFilterFactoryList = parseCharFilterFactories(request, indexSettings, analysisRegistry, environment); - CharFilterFactory[] charFilterFactories = new CharFilterFactory[0]; - charFilterFactories = getCharFilterFactories(request, indexSettings, analysisRegistry, environment, charFilterFactories); + List tokenFilterFactoryList = parseTokenFilterFactories(request, indexSettings, analysisRegistry, + environment, tokenizerFactory, charFilterFactoryList); - analyzer = new CustomAnalyzer(tokenizerFactory.v1(), tokenizerFactory.v2(), charFilterFactories, tokenFilterFactories); + analyzer = new CustomAnalyzer(tokenizerFactory.v1(), tokenizerFactory.v2(), + charFilterFactoryList.toArray(new CharFilterFactory[charFilterFactoryList.size()]), + tokenFilterFactoryList.toArray(new TokenFilterFactory[tokenFilterFactoryList.size()])); closeAnalyzer = true; } else if (analyzer == null) { if (indexAnalyzers == null) { @@ -462,12 +464,13 @@ public class TransportAnalyzeAction extends TransportSingleShardAction parseCharFilterFactories(AnalyzeRequest request, IndexSettings indexSettings, AnalysisRegistry analysisRegistry, + Environment environment) throws IOException { + List charFilterFactoryList = new ArrayList<>(); if (request.charFilters() != null && request.charFilters().size() > 0) { - charFilterFactories = new CharFilterFactory[request.charFilters().size()]; - for (int i = 0; i < request.charFilters().size(); i++) { - 
final AnalyzeRequest.NameOrDefinition charFilter = request.charFilters().get(i); + List charFilters = request.charFilters(); + for (AnalyzeRequest.NameOrDefinition charFilter : charFilters) { + CharFilterFactory charFilterFactory; // parse anonymous settings if (charFilter.definition != null) { Settings settings = getAnonymousSettings(charFilter.definition); @@ -481,7 +484,7 @@ public class TransportAnalyzeAction extends TransportSingleShardAction charFilterFactoryFactory; if (indexSettings == null) { @@ -489,31 +492,34 @@ public class TransportAnalyzeAction extends TransportSingleShardAction parseTokenFilterFactories(AnalyzeRequest request, IndexSettings indexSettings, AnalysisRegistry analysisRegistry, + Environment environment, Tuple tokenizerFactory, + List charFilterFactoryList) throws IOException { + List tokenFilterFactoryList = new ArrayList<>(); if (request.tokenFilters() != null && request.tokenFilters().size() > 0) { - tokenFilterFactories = new TokenFilterFactory[request.tokenFilters().size()]; - for (int i = 0; i < request.tokenFilters().size(); i++) { - final AnalyzeRequest.NameOrDefinition tokenFilter = request.tokenFilters().get(i); + List tokenFilters = request.tokenFilters(); + for (AnalyzeRequest.NameOrDefinition tokenFilter : tokenFilters) { + TokenFilterFactory tokenFilterFactory; // parse anonymous settings if (tokenFilter.definition != null) { Settings settings = getAnonymousSettings(tokenFilter.definition); @@ -527,7 +533,11 @@ public class TransportAnalyzeAction extends TransportSingleShardAction tokenFilterFactoryFactory; if (indexSettings == null) { @@ -535,23 +545,26 @@ public class TransportAnalyzeAction extends TransportSingleShardAction parseTokenizerFactory(AnalyzeRequest request, IndexAnalyzers indexAnalzyers, diff --git a/core/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java b/core/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java index e047e15e448..e8134244f04 100644 --- 
a/core/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java +++ b/core/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java @@ -318,12 +318,12 @@ public final class AnalysisRegistry implements Closeable { T factory = null; if (typeName == null) { if (currentSettings.get("tokenizer") != null) { - factory = (T) new CustomAnalyzerProvider(settings, name, currentSettings); + factory = (T) new CustomAnalyzerProvider(settings, name, currentSettings, environment); } else { throw new IllegalArgumentException(component + " [" + name + "] must specify either an analyzer type, or a tokenizer"); } } else if (typeName.equals("custom")) { - factory = (T) new CustomAnalyzerProvider(settings, name, currentSettings); + factory = (T) new CustomAnalyzerProvider(settings, name, currentSettings, environment); } if (factory != null) { factories.put(name, factory); diff --git a/core/src/main/java/org/elasticsearch/index/analysis/CustomAnalyzerProvider.java b/core/src/main/java/org/elasticsearch/index/analysis/CustomAnalyzerProvider.java index 3bf5d43375c..e9654719bdc 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/CustomAnalyzerProvider.java +++ b/core/src/main/java/org/elasticsearch/index/analysis/CustomAnalyzerProvider.java @@ -20,6 +20,7 @@ package org.elasticsearch.index.analysis; import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.mapper.TextFieldMapper; @@ -34,13 +35,15 @@ import java.util.Map; public class CustomAnalyzerProvider extends AbstractIndexAnalyzerProvider { private final Settings analyzerSettings; + private final Environment environment; private CustomAnalyzer customAnalyzer; public CustomAnalyzerProvider(IndexSettings indexSettings, - String name, Settings settings) { + String name, Settings settings, Environment environment) { super(indexSettings, name, settings); this.analyzerSettings = settings; + 
this.environment = environment; } public void build(final Map tokenizers, final Map charFilters, @@ -65,6 +68,12 @@ public class CustomAnalyzerProvider extends AbstractIndexAnalyzerProvider tokenFilterList = new ArrayList<>(tokenFilterNames.length); for (String tokenFilterName : tokenFilterNames) { @@ -72,14 +81,12 @@ public class CustomAnalyzerProvider extends AbstractIndexAnalyzerProvider tokenFilterList, + List charFiltersList, Environment env) { + if (tokenFilter instanceof SynonymGraphTokenFilterFactory) { + List tokenFiltersListForSynonym = new ArrayList<>(tokenFilterList); + + try (CustomAnalyzer analyzer = new CustomAnalyzer(tokenizerName, tokenizer, + charFiltersList.toArray(new CharFilterFactory[charFiltersList.size()]), + tokenFiltersListForSynonym.toArray(new TokenFilterFactory[tokenFiltersListForSynonym.size()]), + TextFieldMapper.Defaults.POSITION_INCREMENT_GAP, + -1)){ + tokenFilter = ((SynonymGraphTokenFilterFactory) tokenFilter).createPerAnalyzerSynonymGraphFactory(analyzer, env); + } + + } else if (tokenFilter instanceof SynonymTokenFilterFactory) { + List tokenFiltersListForSynonym = new ArrayList<>(tokenFilterList); + try (CustomAnalyzer analyzer = new CustomAnalyzer(tokenizerName, tokenizer, + charFiltersList.toArray(new CharFilterFactory[charFiltersList.size()]), + tokenFiltersListForSynonym.toArray(new TokenFilterFactory[tokenFiltersListForSynonym.size()]), + TextFieldMapper.Defaults.POSITION_INCREMENT_GAP, + -1)) { + tokenFilter = ((SynonymTokenFilterFactory) tokenFilter).createPerAnalyzerSynonymFactory(analyzer, env); + } + } + return tokenFilter; + } + @Override public CustomAnalyzer get() { return this.customAnalyzer; diff --git a/core/src/main/java/org/elasticsearch/index/analysis/SynonymGraphTokenFilterFactory.java b/core/src/main/java/org/elasticsearch/index/analysis/SynonymGraphTokenFilterFactory.java index cfb37f0b075..2da3d8bc07a 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/SynonymGraphTokenFilterFactory.java +++ 
b/core/src/main/java/org/elasticsearch/index/analysis/SynonymGraphTokenFilterFactory.java @@ -19,13 +19,19 @@ package org.elasticsearch.index.analysis; +import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.synonym.SolrSynonymParser; import org.apache.lucene.analysis.synonym.SynonymGraphFilter; +import org.apache.lucene.analysis.synonym.SynonymMap; +import org.apache.lucene.analysis.synonym.WordnetSynonymParser; +import org.elasticsearch.common.io.FastStringReader; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; import java.io.IOException; +import java.io.Reader; public class SynonymGraphTokenFilterFactory extends SynonymTokenFilterFactory { public SynonymGraphTokenFilterFactory(IndexSettings indexSettings, Environment env, AnalysisRegistry analysisRegistry, @@ -35,7 +41,45 @@ public class SynonymGraphTokenFilterFactory extends SynonymTokenFilterFactory { @Override public TokenStream create(TokenStream tokenStream) { - // fst is null means no synonyms - return synonymMap.fst == null ? 
tokenStream : new SynonymGraphFilter(tokenStream, synonymMap, ignoreCase); + throw new IllegalStateException("Call createPerAnalyzerSynonymGraphFactory to specialize this factory for an analysis chain first"); + } + + Factory createPerAnalyzerSynonymGraphFactory(Analyzer analyzerForParseSynonym, Environment env){ + return new Factory("synonymgraph", analyzerForParseSynonym, getRulesFromSettings(env)); + } + + public class Factory implements TokenFilterFactory{ + + private final String name; + private final SynonymMap synonymMap; + + public Factory(String name, final Analyzer analyzerForParseSynonym, Reader rulesReader) { + this.name = name; + + try { + SynonymMap.Builder parser; + if ("wordnet".equalsIgnoreCase(format)) { + parser = new WordnetSynonymParser(true, expand, analyzerForParseSynonym); + ((WordnetSynonymParser) parser).parse(rulesReader); + } else { + parser = new SolrSynonymParser(true, expand, analyzerForParseSynonym); + ((SolrSynonymParser) parser).parse(rulesReader); + } + synonymMap = parser.build(); + } catch (Exception e) { + throw new IllegalArgumentException("failed to build synonyms", e); + } + } + + @Override + public String name() { + return this.name; + } + + @Override + public TokenStream create(TokenStream tokenStream) { + // fst is null means no synonyms + return synonymMap.fst == null ? 
tokenStream : new SynonymGraphFilter(tokenStream, synonymMap, ignoreCase); + } } } diff --git a/core/src/main/java/org/elasticsearch/index/analysis/SynonymTokenFilterFactory.java b/core/src/main/java/org/elasticsearch/index/analysis/SynonymTokenFilterFactory.java index 0e23089827c..0815af44007 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/SynonymTokenFilterFactory.java +++ b/core/src/main/java/org/elasticsearch/index/analysis/SynonymTokenFilterFactory.java @@ -23,35 +23,80 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.LowerCaseFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.apache.lucene.analysis.synonym.SolrSynonymParser; import org.apache.lucene.analysis.synonym.SynonymFilter; import org.apache.lucene.analysis.synonym.SynonymMap; import org.apache.lucene.analysis.synonym.WordnetSynonymParser; +import org.elasticsearch.Version; import org.elasticsearch.common.io.FastStringReader; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.indices.analysis.AnalysisModule; +import java.io.BufferedReader; import java.io.IOException; import java.io.Reader; +import java.nio.file.Files; import java.util.List; public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory { - protected final SynonymMap synonymMap; + /** + * @deprecated this property only works with tokenizer property + */ + @Deprecated protected final boolean ignoreCase; + protected final String format; + protected final boolean expand; + protected final Settings settings; public SynonymTokenFilterFactory(IndexSettings indexSettings, Environment env, AnalysisRegistry analysisRegistry, String name, Settings settings) throws IOException { super(indexSettings, name, settings); + this.settings = settings; - Reader 
rulesReader = null; + this.ignoreCase = + settings.getAsBooleanLenientForPreEs6Indices(indexSettings.getIndexVersionCreated(), "ignore_case", false, deprecationLogger); + if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_6_0_0_alpha3) && settings.get("ignore_case") != null) { + deprecationLogger.deprecated( + "This tokenize synonyms with whatever tokenizer and token filters appear before it in the chain. " + + "If you need ignore case with this filter, you should set lowercase filter before this"); + } + + this.expand = + settings.getAsBooleanLenientForPreEs6Indices(indexSettings.getIndexVersionCreated(), "expand", true, deprecationLogger); + + // for backward compatibility + if (indexSettings.getIndexVersionCreated().before(Version.V_6_0_0_alpha3)) { + String tokenizerName = settings.get("tokenizer", "whitespace"); + AnalysisModule.AnalysisProvider tokenizerFactoryFactory = + analysisRegistry.getTokenizerProvider(tokenizerName, indexSettings); + if (tokenizerFactoryFactory == null) { + throw new IllegalArgumentException("failed to find tokenizer [" + tokenizerName + "] for synonym token filter"); + } + final TokenizerFactory tokenizerFactory = tokenizerFactoryFactory.get(indexSettings, env, tokenizerName, + AnalysisRegistry.getSettingsFromIndexSettings(indexSettings, + AnalysisRegistry.INDEX_ANALYSIS_TOKENIZER + "." 
+ tokenizerName)); + this.tokenizerFactory = tokenizerFactory; + } else { + this.tokenizerFactory = null; + } + + this.format = settings.get("format", ""); + } + + @Override + public TokenStream create(TokenStream tokenStream) { + throw new IllegalStateException("Call createPerAnalyzerSynonymFactory to specialize this factory for an analysis chain first"); + } + + protected Reader getRulesFromSettings(Environment env) { + Reader rulesReader; if (settings.getAsArray("synonyms", null) != null) { - List rules = Analysis.getWordList(env, settings, "synonyms"); + List rulesList = Analysis.getWordList(env, settings, "synonyms"); StringBuilder sb = new StringBuilder(); - for (String line : rules) { + for (String line : rulesList) { sb.append(line).append(System.lineSeparator()); } rulesReader = new FastStringReader(sb.toString()); @@ -60,49 +105,72 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory { } else { throw new IllegalArgumentException("synonym requires either `synonyms` or `synonyms_path` to be configured"); } + return rulesReader; + } - this.ignoreCase = - settings.getAsBooleanLenientForPreEs6Indices(indexSettings.getIndexVersionCreated(), "ignore_case", false, deprecationLogger); - boolean expand = - settings.getAsBooleanLenientForPreEs6Indices(indexSettings.getIndexVersionCreated(), "expand", true, deprecationLogger); + Factory createPerAnalyzerSynonymFactory(Analyzer analyzerForParseSynonym, Environment env){ + return new Factory("synonym", analyzerForParseSynonym, getRulesFromSettings(env)); + } - String tokenizerName = settings.get("tokenizer", "whitespace"); - AnalysisModule.AnalysisProvider tokenizerFactoryFactory = - analysisRegistry.getTokenizerProvider(tokenizerName, indexSettings); - if (tokenizerFactoryFactory == null) { - throw new IllegalArgumentException("failed to find tokenizer [" + tokenizerName + "] for synonym token filter"); - } - final TokenizerFactory tokenizerFactory = tokenizerFactoryFactory.get(indexSettings, 
env, tokenizerName, - AnalysisRegistry.getSettingsFromIndexSettings(indexSettings, AnalysisRegistry.INDEX_ANALYSIS_TOKENIZER + "." + tokenizerName)); - Analyzer analyzer = new Analyzer() { - @Override - protected TokenStreamComponents createComponents(String fieldName) { - Tokenizer tokenizer = tokenizerFactory == null ? new WhitespaceTokenizer() : tokenizerFactory.create(); - TokenStream stream = ignoreCase ? new LowerCaseFilter(tokenizer) : tokenizer; - return new TokenStreamComponents(tokenizer, stream); - } - }; + // for backward compatibility + /** + * @deprecated This filter tokenize synonyms with whatever tokenizer and token filters appear before it in the chain in 6.0. + */ + @Deprecated + protected final TokenizerFactory tokenizerFactory; - try { - SynonymMap.Builder parser = null; + public class Factory implements TokenFilterFactory{ - if ("wordnet".equalsIgnoreCase(settings.get("format"))) { - parser = new WordnetSynonymParser(true, expand, analyzer); - ((WordnetSynonymParser) parser).parse(rulesReader); + private final String name; + private final SynonymMap synonymMap; + + public Factory(String name, Analyzer analyzerForParseSynonym, Reader rulesReader) { + + this.name = name; + + Analyzer analyzer; + if (tokenizerFactory != null) { + analyzer = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = tokenizerFactory.create(); + TokenStream stream = ignoreCase ? 
new LowerCaseFilter(tokenizer) : tokenizer; + return new TokenStreamComponents(tokenizer, stream); + } + }; } else { - parser = new SolrSynonymParser(true, expand, analyzer); - ((SolrSynonymParser) parser).parse(rulesReader); + analyzer = analyzerForParseSynonym; } - synonymMap = parser.build(); - } catch (Exception e) { - throw new IllegalArgumentException("failed to build synonyms", e); + try { + SynonymMap.Builder parser; + if ("wordnet".equalsIgnoreCase(format)) { + parser = new WordnetSynonymParser(true, expand, analyzer); + ((WordnetSynonymParser) parser).parse(rulesReader); + } else { + parser = new SolrSynonymParser(true, expand, analyzer); + ((SolrSynonymParser) parser).parse(rulesReader); + } + synonymMap = parser.build(); + } catch (Exception e) { + throw new IllegalArgumentException("failed to build synonyms", e); + } finally { + if (tokenizerFactory != null) { + analyzer.close(); + } + } + } + + @Override + public String name() { + return this.name; + } + + @Override + public TokenStream create(TokenStream tokenStream) { + // fst is null means no synonyms + return synonymMap.fst == null ? tokenStream : new SynonymFilter(tokenStream, synonymMap, ignoreCase); } } - @Override - public TokenStream create(TokenStream tokenStream) { - // fst is null means no synonyms - return synonymMap.fst == null ? 
tokenStream : new SynonymFilter(tokenStream, synonymMap, ignoreCase); - } } diff --git a/core/src/test/java/org/elasticsearch/index/analysis/synonyms/SynonymsAnalysisTests.java b/core/src/test/java/org/elasticsearch/index/analysis/synonyms/SynonymsAnalysisTests.java index c4842e497ef..b5640cdd120 100644 --- a/core/src/test/java/org/elasticsearch/index/analysis/synonyms/SynonymsAnalysisTests.java +++ b/core/src/test/java/org/elasticsearch/index/analysis/synonyms/SynonymsAnalysisTests.java @@ -23,6 +23,7 @@ import org.apache.logging.log4j.Logger; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.queryparser.classic.ParseException; import org.elasticsearch.Version; import org.elasticsearch.cluster.metadata.IndexMetaData; import org.elasticsearch.common.logging.Loggers; @@ -41,6 +42,8 @@ import java.nio.file.Files; import java.nio.file.Path; import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.instanceOf; +import static org.hamcrest.Matchers.startsWith; public class SynonymsAnalysisTests extends ESTestCase { protected final Logger logger = Loggers.getLogger(getClass()); @@ -69,8 +72,57 @@ public class SynonymsAnalysisTests extends ESTestCase { match("synonymAnalyzerWordnet", "abstain", "abstain refrain desist"); match("synonymAnalyzerWordnet_file", "abstain", "abstain refrain desist"); match("synonymAnalyzerWithsettings", "kimchy", "sha hay"); + match("synonymAnalyzerWithStopAfterSynonym", "kimchy is the dude abides , stop", "shay is the elasticsearch man! ,"); + match("synonymAnalyzerWithStopBeforeSynonym", "kimchy is the dude abides , stop", "shay is the elasticsearch man! 
,"); + match("synonymAnalyzerWithStopSynonymAfterSynonym", "kimchy is the dude abides", "shay is the man!"); + match("synonymAnalyzerExpand", "kimchy is the dude abides", "kimchy shay is the dude elasticsearch abides man!"); + match("synonymAnalyzerExpandWithStopAfterSynonym", "kimchy is the dude abides", "shay is the dude abides man!"); + } + public void testSynonymWordDeleteByAnalyzer() throws IOException { + Settings settings = Settings.builder() + .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) + .put("path.home", createTempDir().toString()) + .put("index.analysis.filter.synonym.type", "synonym") + .putArray("index.analysis.filter.synonym.synonyms", "kimchy => shay", "dude => elasticsearch", "abides => man!") + .put("index.analysis.filter.stop_within_synonym.type", "stop") + .putArray("index.analysis.filter.stop_within_synonym.stopwords", "kimchy", "elasticsearch") + .put("index.analysis.analyzer.synonymAnalyzerWithStopSynonymBeforeSynonym.tokenizer", "whitespace") + .putArray("index.analysis.analyzer.synonymAnalyzerWithStopSynonymBeforeSynonym.filter", "stop_within_synonym","synonym") + .put().build(); + IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings); + try { + indexAnalyzers = createTestAnalysis(idxSettings, settings).indexAnalyzers; + fail("fail! 
due to synonym word deleted by analyzer"); + } catch (Exception e) { + assertThat(e, instanceOf(IllegalArgumentException.class)); + assertThat(e.getMessage(), startsWith("failed to build synonyms")); + } + } + + public void testExpandSynonymWordDeleteByAnalyzer() throws IOException { + Settings settings = Settings.builder() + .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) + .put("path.home", createTempDir().toString()) + .put("index.analysis.filter.synonym_expand.type", "synonym") + .putArray("index.analysis.filter.synonym_expand.synonyms", "kimchy, shay", "dude, elasticsearch", "abides, man!") + .put("index.analysis.filter.stop_within_synonym.type", "stop") + .putArray("index.analysis.filter.stop_within_synonym.stopwords", "kimchy", "elasticsearch") + .put("index.analysis.analyzer.synonymAnalyzerExpandWithStopBeforeSynonym.tokenizer", "whitespace") + .putArray("index.analysis.analyzer.synonymAnalyzerExpandWithStopBeforeSynonym.filter", "stop_within_synonym","synonym_expand") + .put().build(); + IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings); + try { + indexAnalyzers = createTestAnalysis(idxSettings, settings).indexAnalyzers; + fail("fail! 
due to synonym word deleted by analyzer"); + } catch (Exception e) { + assertThat(e, instanceOf(IllegalArgumentException.class)); + assertThat(e.getMessage(), startsWith("failed to build synonyms")); + } + } + + private void match(String analyzerName, String source, String target) throws IOException { Analyzer analyzer = indexAnalyzers.get(analyzerName).analyzer(); diff --git a/core/src/test/java/org/elasticsearch/indices/analyze/AnalyzeActionIT.java b/core/src/test/java/org/elasticsearch/indices/analyze/AnalyzeActionIT.java index 85787c2a3e2..dd556c56e30 100644 --- a/core/src/test/java/org/elasticsearch/indices/analyze/AnalyzeActionIT.java +++ b/core/src/test/java/org/elasticsearch/indices/analyze/AnalyzeActionIT.java @@ -383,7 +383,7 @@ public class AnalyzeActionIT extends ESIntegTestCase { assertThat(analyzeResponse.detail().tokenfilters()[0].getTokens()[2].getPositionLength(), equalTo(1)); // tokenfilter({"type": "stop", "stopwords": ["foo", "buzz"]}) - assertThat(analyzeResponse.detail().tokenfilters()[1].getName(), equalTo("_anonymous_tokenfilter_[1]")); + assertThat(analyzeResponse.detail().tokenfilters()[1].getName(), equalTo("_anonymous_tokenfilter")); assertThat(analyzeResponse.detail().tokenfilters()[1].getTokens().length, equalTo(1)); assertThat(analyzeResponse.detail().tokenfilters()[1].getTokens()[0].getTerm(), equalTo("test")); diff --git a/core/src/test/resources/org/elasticsearch/index/analysis/synonyms/synonyms.json b/core/src/test/resources/org/elasticsearch/index/analysis/synonyms/synonyms.json index fe5f4d4016c..9cb0bdd6ef1 100644 --- a/core/src/test/resources/org/elasticsearch/index/analysis/synonyms/synonyms.json +++ b/core/src/test/resources/org/elasticsearch/index/analysis/synonyms/synonyms.json @@ -3,11 +3,11 @@ "analysis":{ "analyzer":{ "synonymAnalyzer":{ - "tokenizer":"standard", + "tokenizer":"whitespace", "filter":[ "synonym" ] }, "synonymAnalyzer_file":{ - "tokenizer":"standard", + "tokenizer":"whitespace", "filter":[ "synonym_file" 
] }, "synonymAnalyzerWordnet":{ @@ -21,6 +21,26 @@ "synonymAnalyzerWithsettings":{ "tokenizer":"trigram", "filter":["synonymWithTokenizerSettings"] + }, + "synonymAnalyzerWithStopBeforeSynonym": { + "tokenizer":"whitespace", + "filter":["stop","synonym"] + }, + "synonymAnalyzerWithStopAfterSynonym":{ + "tokenizer":"whitespace", + "filter":["synonym","stop"] + }, + "synonymAnalyzerWithStopSynonymAfterSynonym":{ + "tokenizer":"whitespace", + "filter":["synonym","stop_within_synonym"] + }, + "synonymAnalyzerExpand":{ + "tokenizer": "whitespace", + "filter":["synonym_expand"] + }, + "synonymAnalyzerExpandWithStopAfterSynonym":{ + "tokenizer": "whitespace", + "filter":["synonym_expand", "stop_within_synonym"] } }, "tokenizer":{ @@ -61,10 +81,23 @@ "type":"synonym", "synonyms":[ "kimchy => shay" - ], - "tokenizer" : "trigram", - "min_gram" : 3, - "max_gram" : 3 + ] + }, + "stop":{ + "type": "stop", + "stopwords":["stop","synonym"] + }, + "stop_within_synonym":{ + "type": "stop", + "stopwords":["kimchy", "elasticsearch"] + }, + "synonym_expand":{ + "type":"synonym", + "synonyms":[ + "kimchy , shay", + "dude , elasticsearch", + "abides , man!" + ] } } } diff --git a/docs/reference/analysis/tokenfilters/synonym-graph-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/synonym-graph-tokenfilter.asciidoc index d29ec51e3d4..e1f77332fd4 100644 --- a/docs/reference/analysis/tokenfilters/synonym-graph-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/synonym-graph-tokenfilter.asciidoc @@ -50,11 +50,14 @@ PUT /test_index The above configures a `search_synonyms` filter, with a path of `analysis/synonym.txt` (relative to the `config` location). The `search_synonyms` analyzer is then configured with the filter. -Additional settings are: `ignore_case` (defaults to `false`), and -`expand` (defaults to `true`). +Additional settings are: `expand` (defaults to `true`). 
+ +[float] +==== `tokenizer` and `ignore_case` are deprecated The `tokenizer` parameter controls the tokenizers that will be used to -tokenize the synonym, and defaults to the `whitespace` tokenizer. +tokenize the synonym. This parameter is for backwards compatibility with indices that were created before 6.0. +The `ignore_case` parameter works with the `tokenizer` parameter only. Two synonym formats are supported: Solr, WordNet. diff --git a/docs/reference/analysis/tokenfilters/synonym-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/synonym-tokenfilter.asciidoc index 4f69cbf3458..68d3f444b2d 100644 --- a/docs/reference/analysis/tokenfilters/synonym-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/synonym-tokenfilter.asciidoc @@ -34,11 +34,17 @@ PUT /test_index The above configures a `synonym` filter, with a path of `analysis/synonym.txt` (relative to the `config` location). The `synonym` analyzer is then configured with the filter. Additional -settings are: `ignore_case` (defaults to `false`), and `expand` -(defaults to `true`). +setting: `expand` (defaults to `true`). + +This filter tokenizes synonyms with whatever tokenizer and token filters +appear before it in the chain. + +[float] +==== `tokenizer` and `ignore_case` are deprecated The `tokenizer` parameter controls the tokenizers that will be used to -tokenize the synonym, and defaults to the `whitespace` tokenizer. +tokenize the synonym. This parameter is for backwards compatibility with indices that were created before 6.0. +The `ignore_case` parameter works with the `tokenizer` parameter only. Two synonym formats are supported: Solr, WordNet. diff --git a/docs/reference/migration/migrate_6_0/mappings.asciidoc b/docs/reference/migration/migrate_6_0/mappings.asciidoc index 369ba3da162..e47c9562db0 100644 --- a/docs/reference/migration/migrate_6_0/mappings.asciidoc +++ b/docs/reference/migration/migrate_6_0/mappings.asciidoc @@ -29,3 +29,14 @@ now disallowed for these indices' mappings. 
Previously Elasticsearch would silently ignore any dynamic templates that included a `match_mapping_type` type that was unrecognized. An exception is now thrown on an unrecognized type. + +==== Synonym Token Filter + +In 6.0, the Synonym Token Filter tokenizes synonyms with whatever +tokenizer and token filters appear before it in the chain. + +`tokenizer` and `ignore_case` are deprecated. +These parameters are still kept for backwards compatibility +with indices that were created before 6.0. +Elasticsearch ignores these properties for new indices. + diff --git a/modules/analysis-common/src/test/resources/rest-api-spec/test/indices.analyze/10_synonyms.yml b/modules/analysis-common/src/test/resources/rest-api-spec/test/indices.analyze/10_synonyms.yml new file mode 100644 index 00000000000..75dff3c7096 --- /dev/null +++ b/modules/analysis-common/src/test/resources/rest-api-spec/test/indices.analyze/10_synonyms.yml @@ -0,0 +1,35 @@ +"Synonym filter with char_filter": + # Tests analyze with synonym and char_filter. This is in the analysis-common module + # because there are no char filters in core. + - skip: + version: " - 5.99.99" + reason: to support synonym same analysis chain were added in 6.0.0 + - do: + indices.create: + index: test_synonym_with_charfilter + body: + settings: + index: + analysis: + analyzer: + synonymAnalyzerWithCharfilter: + tokenizer: whitespace + char_filter: ["html_strip"] + filter: ["synonym"] + filter: + synonym: + type: synonym + synonyms: ["<p>kimchy</p> => shay", "dude => elasticsearch", "abides => man!"] + + - do: + indices.analyze: + index: test_synonym_with_charfilter + body: + analyzer: "synonymAnalyzerWithCharfilter" + text: "kimchy is the dude abides" + - length: { tokens: 5 } + - match: { tokens.0.token: shay } + - match: { tokens.1.token: is } + - match: { tokens.2.token: the } + - match: { tokens.3.token: elasticsearch } + - match: { tokens.4.token: man! } diff --git a/rest-api-spec/src/main/resources/rest-api-spec/test/indices.analyze/10_analyze.yml b/rest-api-spec/src/main/resources/rest-api-spec/test/indices.analyze/10_analyze.yml index 93ce5c8c807..544c022e2cd 100644 --- a/rest-api-spec/src/main/resources/rest-api-spec/test/indices.analyze/10_analyze.yml +++ b/rest-api-spec/src/main/resources/rest-api-spec/test/indices.analyze/10_analyze.yml @@ -73,5 +73,38 @@ - match: { detail.tokenizer.tokens.0.token: foo } - match: { detail.tokenizer.tokens.1.token: bar } - match: { detail.tokenizer.tokens.2.token: buzz } - - match: { detail.tokenfilters.0.name: "_anonymous_tokenfilter_[0]" } + - match: { detail.tokenfilters.0.name: "_anonymous_tokenfilter" } - match: { detail.tokenfilters.0.tokens.0.token: bar } + +--- +"Synonym filter with tokenizer": + - skip: + version: " - 5.99.99" + reason: to support synonym same analysis chain were added in 6.0.0 + - do: + indices.create: + index: test_synonym + body: + settings: + index: + analysis: + tokenizer: + trigram: + type: nGram + min_gram: 3 + max_gram: 3 + filter: + synonym: + type: synonym + synonyms: ["kimchy => shay"] + + - do: + indices.analyze: + index: test_synonym + body: + tokenizer: trigram + filter: [synonym] + text: kimchy + - length: { tokens: 2 } + - match: { tokens.0.token: sha } + - match: { tokens.1.token: hay }