From 5107949402855d72125d2dbec5a4aaf025201d0f Mon Sep 17 00:00:00 2001
From: Alan Woodward
Date: Wed, 19 Sep 2018 15:52:14 +0100
Subject: [PATCH] Allow TokenFilterFactories to rewrite themselves against their preceding chain (#33702)

We currently special-case SynonymFilterFactory and SynonymGraphFilterFactory, which need to know their predecessors in the analysis chain in order to correctly analyze their synonym lists. This special-casing doesn't work with Referring filter factories, such as the Multiplexer or Conditional filters. We also have a number of filters (e.g. the Multiplexer) that break synonyms when they appear earlier in the chain than the synonym filter, because they produce multiple tokens at the same position.

This commit adds two methods to the TokenFilterFactory interface.

* `getChainAwareTokenFilterFactory()` allows a filter factory to rewrite itself against its preceding filter chain, or to resolve references to other filters. It replaces `ReferringFilterFactory` and `CustomAnalyzerProvider.checkAndApplySynonymFilter`, and by default returns `this`.

* `getSynonymFilter()` defines how a filter should be applied when building a synonym list `Analyzer`. By default it returns `this`; filters that should not be used when parsing synonyms (for example, those that produce multiple tokens at the same position) can return `IDENTITY_FILTER` instead.

Fixes #33609
---
 .../multiplexer-tokenfilter.asciidoc | 10 ++-
 .../common/MultiplexerTokenFilterFactory.java | 71 ++++++++-------
 .../ScriptedConditionTokenFilterFactory.java | 64 +++++++++-----
 .../common/SynonymsAnalysisTests.java | 21 +++++
 .../analyze/TransportAnalyzeAction.java | 74 +++++++++-------
 .../index/analysis/AnalysisRegistry.java | 12 +--
 .../analysis/CustomAnalyzerProvider.java | 31 +------
 .../analysis/ReferringFilterFactory.java | 37 --------
 .../SynonymGraphTokenFilterFactory.java | 56 +++++-------
 .../analysis/SynonymTokenFilterFactory.java | 87 ++++++++++---------
 .../index/analysis/TokenFilterFactory.java | 42 +++++++++
 .../CategorizationAnalyzer.java | 7 +-
 12 files changed, 271 insertions(+), 241 deletions(-)
 delete mode 100644 server/src/main/java/org/elasticsearch/index/analysis/ReferringFilterFactory.java

diff --git a/docs/reference/analysis/tokenfilters/multiplexer-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/multiplexer-tokenfilter.asciidoc
index 51937084e39..e2427071fbb 100644
--- a/docs/reference/analysis/tokenfilters/multiplexer-tokenfilter.asciidoc
+++ b/docs/reference/analysis/tokenfilters/multiplexer-tokenfilter.asciidoc
@@ -113,4 +113,12 @@ And it'd respond:
 // TESTRESPONSE
 <1> The stemmer has also emitted a token `home` at position 1, but because it is a
-duplicate of this token it has been removed from the token stream
\ No newline at end of file
+duplicate of this token it has been removed from the token stream
+
+NOTE: The synonym and synonym_graph filters use their preceding analysis chain to
+parse and analyse their synonym lists, and ignore any token filters in the chain
+that produce multiple tokens at the same position. This means that any filters
+within the multiplexer will be ignored for the purpose of synonyms. If you want to
+use filters contained within the multiplexer for parsing synonyms (for example, to
+apply stemming to the synonym lists), then you should append the synonym filter
+to the relevant multiplexer filter list.
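As a rough illustration of the pattern the note above recommends, index settings along the following lines append the synonym filter to the multiplexer branch that applies stemming, mirroring the new testSynonymsWithMultiplexer test further down in this patch. This snippet is not part of the patch, and the filter and analyzer names ("my_synonyms", "my_stemmer", "stem_repeat", "my_analyzer") are invented.

import org.elasticsearch.common.settings.Settings;

// Sketch of index settings that follow the note above: the synonym filter is
// appended to the multiplexer branch that stems, so the synonym list is parsed
// against stemmed tokens rather than being skipped.
class MultiplexedSynonymSettingsExample {
    static Settings exampleSettings() {
        return Settings.builder()
            .put("index.analysis.filter.my_synonyms.type", "synonym")
            .putList("index.analysis.filter.my_synonyms.synonyms", "programmer, developer")
            .put("index.analysis.filter.my_stemmer.type", "stemmer")
            .put("index.analysis.filter.my_stemmer.language", "porter2")
            .put("index.analysis.filter.stem_repeat.type", "multiplexer")
            // the synonym filter is appended to the stemming branch of the multiplexer
            .putList("index.analysis.filter.stem_repeat.filters", "my_stemmer, my_synonyms")
            .put("index.analysis.analyzer.my_analyzer.tokenizer", "standard")
            .putList("index.analysis.analyzer.my_analyzer.filter", "lowercase", "stem_repeat")
            .build();
    }
}

With settings like these, the synonym list is analyzed by the tokenizer, the lowercase filter and the stemming filter of that branch, so an entry such as "developer" matches the stemmed form of "developers", which is what the new test below asserts.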
\ No newline at end of file diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/MultiplexerTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/MultiplexerTokenFilterFactory.java index 1cf5303a772..c3e0d5133c3 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/MultiplexerTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/MultiplexerTokenFilterFactory.java @@ -29,33 +29,20 @@ import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; -import org.elasticsearch.index.analysis.ReferringFilterFactory; +import org.elasticsearch.index.analysis.CharFilterFactory; import org.elasticsearch.index.analysis.TokenFilterFactory; +import org.elasticsearch.index.analysis.TokenizerFactory; import java.io.IOException; import java.util.ArrayList; import java.util.List; -import java.util.Map; import java.util.function.Function; -public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory implements ReferringFilterFactory { +public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory { - private List filters; private List filterNames; private final boolean preserveOriginal; - private static final TokenFilterFactory IDENTITY_FACTORY = new TokenFilterFactory() { - @Override - public String name() { - return "identity"; - } - - @Override - public TokenStream create(TokenStream tokenStream) { - return tokenStream; - } - }; - public MultiplexerTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) throws IOException { super(indexSettings, name, settings); this.filterNames = settings.getAsList("filters"); @@ -64,31 +51,56 @@ public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory im @Override public TokenStream create(TokenStream tokenStream) { - List> functions = new ArrayList<>(); - for (TokenFilterFactory tff : filters) { - functions.add(tff::create); - } - return new RemoveDuplicatesTokenFilter(new MultiplexTokenFilter(tokenStream, functions)); + throw new UnsupportedOperationException("TokenFilterFactory.getChainAwareTokenFilterFactory() must be called first"); } @Override - public void setReferences(Map factories) { - filters = new ArrayList<>(); + public TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer, List charFilters, + List previousTokenFilters, + Function allFilters) { + List filters = new ArrayList<>(); if (preserveOriginal) { - filters.add(IDENTITY_FACTORY); + filters.add(IDENTITY_FILTER); } for (String filter : filterNames) { String[] parts = Strings.tokenizeToStringArray(filter, ","); if (parts.length == 1) { - filters.add(resolveFilterFactory(factories, parts[0])); + TokenFilterFactory factory = resolveFilterFactory(allFilters, parts[0]); + factory = factory.getChainAwareTokenFilterFactory(tokenizer, charFilters, previousTokenFilters, allFilters); + filters.add(factory); } else { + List existingChain = new ArrayList<>(previousTokenFilters); List chain = new ArrayList<>(); for (String subfilter : parts) { - chain.add(resolveFilterFactory(factories, subfilter)); + TokenFilterFactory factory = resolveFilterFactory(allFilters, subfilter); + factory = factory.getChainAwareTokenFilterFactory(tokenizer, charFilters, existingChain, allFilters); + chain.add(factory); + 
existingChain.add(factory); } filters.add(chainFilters(filter, chain)); } } + + return new TokenFilterFactory() { + @Override + public String name() { + return MultiplexerTokenFilterFactory.this.name(); + } + + @Override + public TokenStream create(TokenStream tokenStream) { + List> functions = new ArrayList<>(); + for (TokenFilterFactory tff : filters) { + functions.add(tff::create); + } + return new RemoveDuplicatesTokenFilter(new MultiplexTokenFilter(tokenStream, functions)); + } + + @Override + public TokenFilterFactory getSynonymFilter() { + return IDENTITY_FILTER; + } + }; } private TokenFilterFactory chainFilters(String name, List filters) { @@ -108,11 +120,12 @@ public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory im }; } - private TokenFilterFactory resolveFilterFactory(Map factories, String name) { - if (factories.containsKey(name) == false) { + private TokenFilterFactory resolveFilterFactory(Function factories, String name) { + TokenFilterFactory factory = factories.apply(name); + if (factory == null) { throw new IllegalArgumentException("Multiplexing filter [" + name() + "] refers to undefined tokenfilter [" + name + "]"); } else { - return factories.get(name); + return factory; } } diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ScriptedConditionTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ScriptedConditionTokenFilterFactory.java index 56f60bb874a..b194a5ac111 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ScriptedConditionTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ScriptedConditionTokenFilterFactory.java @@ -24,26 +24,24 @@ import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; -import org.elasticsearch.index.analysis.ReferringFilterFactory; +import org.elasticsearch.index.analysis.CharFilterFactory; import org.elasticsearch.index.analysis.TokenFilterFactory; +import org.elasticsearch.index.analysis.TokenizerFactory; import org.elasticsearch.script.Script; import org.elasticsearch.script.ScriptService; import org.elasticsearch.script.ScriptType; -import java.io.IOException; import java.util.ArrayList; import java.util.List; -import java.util.Map; import java.util.function.Function; /** * A factory for a conditional token filter that only applies child filters if the underlying token * matches an {@link AnalysisPredicateScript} */ -public class ScriptedConditionTokenFilterFactory extends AbstractTokenFilterFactory implements ReferringFilterFactory { +public class ScriptedConditionTokenFilterFactory extends AbstractTokenFilterFactory { private final AnalysisPredicateScript.Factory factory; - private final List filters = new ArrayList<>(); private final List filterNames; ScriptedConditionTokenFilterFactory(IndexSettings indexSettings, String name, @@ -65,13 +63,43 @@ public class ScriptedConditionTokenFilterFactory extends AbstractTokenFilterFact @Override public TokenStream create(TokenStream tokenStream) { - Function filter = in -> { - for (TokenFilterFactory tff : filters) { - in = tff.create(in); + throw new UnsupportedOperationException("getChainAwareTokenFilterFactory should be called first"); + } + + @Override + public TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory 
tokenizer, List charFilters, + List previousTokenFilters, + Function allFilters) { + List filters = new ArrayList<>(); + List existingChain = new ArrayList<>(previousTokenFilters); + for (String filter : filterNames) { + TokenFilterFactory tff = allFilters.apply(filter); + if (tff == null) { + throw new IllegalArgumentException("ScriptedConditionTokenFilter [" + name() + + "] refers to undefined token filter [" + filter + "]"); + } + tff = tff.getChainAwareTokenFilterFactory(tokenizer, charFilters, existingChain, allFilters); + filters.add(tff); + existingChain.add(tff); + } + + return new TokenFilterFactory() { + @Override + public String name() { + return ScriptedConditionTokenFilterFactory.this.name(); + } + + @Override + public TokenStream create(TokenStream tokenStream) { + Function filter = in -> { + for (TokenFilterFactory tff : filters) { + in = tff.create(in); + } + return in; + }; + return new ScriptedConditionTokenFilter(tokenStream, filter, factory.newInstance()); } - return in; }; - return new ScriptedConditionTokenFilter(tokenStream, filter, factory.newInstance()); } private static class ScriptedConditionTokenFilter extends ConditionalTokenFilter { @@ -80,29 +108,17 @@ public class ScriptedConditionTokenFilterFactory extends AbstractTokenFilterFact private final AnalysisPredicateScript.Token token; ScriptedConditionTokenFilter(TokenStream input, Function inputFactory, - AnalysisPredicateScript script) { + AnalysisPredicateScript script) { super(input, inputFactory); this.script = script; this.token = new AnalysisPredicateScript.Token(this); } @Override - protected boolean shouldFilter() throws IOException { + protected boolean shouldFilter() { token.updatePosition(); return script.execute(token); } } - @Override - public void setReferences(Map factories) { - for (String filter : filterNames) { - TokenFilterFactory tff = factories.get(filter); - if (tff == null) { - throw new IllegalArgumentException("ScriptedConditionTokenFilter [" + name() + - "] refers to undefined token filter [" + filter + "]"); - } - filters.add(tff); - } - } - } diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/SynonymsAnalysisTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/SynonymsAnalysisTests.java index b66f0e1a7f1..942b4876077 100644 --- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/SynonymsAnalysisTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/SynonymsAnalysisTests.java @@ -20,6 +20,7 @@ package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.elasticsearch.Version; @@ -117,6 +118,26 @@ public class SynonymsAnalysisTests extends ESTestCase { } } + public void testSynonymsWithMultiplexer() throws IOException { + Settings settings = Settings.builder() + .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) + .put("path.home", createTempDir().toString()) + .put("index.analysis.filter.synonyms.type", "synonym") + .putList("index.analysis.filter.synonyms.synonyms", "programmer, developer") + .put("index.analysis.filter.my_english.type", "stemmer") + .put("index.analysis.filter.my_english.language", "porter2") + .put("index.analysis.filter.stem_repeat.type", "multiplexer") + .putList("index.analysis.filter.stem_repeat.filters", "my_english, 
synonyms") + .put("index.analysis.analyzer.synonymAnalyzer.tokenizer", "standard") + .putList("index.analysis.analyzer.synonymAnalyzer.filter", "lowercase", "stem_repeat") + .build(); + IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings); + indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers; + + BaseTokenStreamTestCase.assertAnalyzesTo(indexAnalyzers.get("synonymAnalyzer"), "Some developers are odd", + new String[]{ "some", "developers", "develop", "programm", "are", "odd" }, + new int[]{ 1, 1, 0, 0, 1, 1 }); + } private void match(String analyzerName, String source, String target) throws IOException { Analyzer analyzer = indexAnalyzers.get(analyzerName).analyzer(); diff --git a/server/src/main/java/org/elasticsearch/action/admin/indices/analyze/TransportAnalyzeAction.java b/server/src/main/java/org/elasticsearch/action/admin/indices/analyze/TransportAnalyzeAction.java index 5c5da62571f..f09af47ccfa 100644 --- a/server/src/main/java/org/elasticsearch/action/admin/indices/analyze/TransportAnalyzeAction.java +++ b/server/src/main/java/org/elasticsearch/action/admin/indices/analyze/TransportAnalyzeAction.java @@ -48,11 +48,9 @@ import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.analysis.AnalysisRegistry; import org.elasticsearch.index.analysis.CharFilterFactory; import org.elasticsearch.index.analysis.CustomAnalyzer; -import org.elasticsearch.index.analysis.CustomAnalyzerProvider; import org.elasticsearch.index.analysis.IndexAnalyzers; import org.elasticsearch.index.analysis.MultiTermAwareComponent; import org.elasticsearch.index.analysis.NamedAnalyzer; -import org.elasticsearch.index.analysis.ReferringFilterFactory; import org.elasticsearch.index.analysis.TokenFilterFactory; import org.elasticsearch.index.analysis.TokenizerFactory; import org.elasticsearch.index.mapper.KeywordFieldMapper; @@ -66,6 +64,7 @@ import org.elasticsearch.transport.TransportService; import java.io.IOException; import java.io.Reader; import java.io.StringReader; +import java.io.UncheckedIOException; import java.util.ArrayList; import java.util.HashSet; import java.util.List; @@ -73,6 +72,7 @@ import java.util.Locale; import java.util.Map; import java.util.Set; import java.util.TreeMap; +import java.util.function.Function; /** * Transport action used to execute analyze requests @@ -571,11 +571,48 @@ public class TransportAnalyzeAction extends TransportSingleShardAction { + + private final AnalysisRegistry analysisRegistry; + private final IndexSettings indexSettings; + Map prebuiltFilters; + + public DeferredTokenFilterRegistry(AnalysisRegistry analysisRegistry, IndexSettings indexSettings) { + this.analysisRegistry = analysisRegistry; + if (indexSettings == null) { + // Settings are null when _analyze is called with no index name, so + // we create dummy settings which will make prebuilt analysis components + // available + Settings settings = Settings.builder() + .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) + .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0) + .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1) + .put(IndexMetaData.SETTING_INDEX_UUID, UUIDs.randomBase64UUID()) + .build(); + IndexMetaData metaData = IndexMetaData.builder(IndexMetaData.INDEX_UUID_NA_VALUE).settings(settings).build(); + indexSettings = new IndexSettings(metaData, Settings.EMPTY); + } + this.indexSettings = indexSettings; + } + + @Override + public TokenFilterFactory apply(String s) { + if (prebuiltFilters == null) { 
+ try { + prebuiltFilters = analysisRegistry.buildTokenFilterFactories(indexSettings); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } + return prebuiltFilters.get(s); + } + } + private static List parseTokenFilterFactories(AnalyzeRequest request, IndexSettings indexSettings, AnalysisRegistry analysisRegistry, Environment environment, Tuple tokenizerFactory, List charFilterFactoryList, boolean normalizer) throws IOException { List tokenFilterFactoryList = new ArrayList<>(); - List referringFilters = new ArrayList<>(); + DeferredTokenFilterRegistry deferredRegistry = new DeferredTokenFilterRegistry(analysisRegistry, indexSettings); if (request.tokenFilters() != null && request.tokenFilters().size() > 0) { List tokenFilters = request.tokenFilters(); for (AnalyzeRequest.NameOrDefinition tokenFilter : tokenFilters) { @@ -594,11 +631,8 @@ public class TransportAnalyzeAction extends TransportSingleShardAction tokenFilterFactoryFactory; @@ -616,8 +650,8 @@ public class TransportAnalyzeAction extends TransportSingleShardAction prebuiltFilters = analysisRegistry.buildTokenFilterFactories(indexSettings); - for (ReferringFilterFactory rff : referringFilters) { - rff.setReferences(prebuiltFilters); - } - - } return tokenFilterFactoryList; } diff --git a/server/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java b/server/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java index c61a7cf0706..d7a283f3158 100644 --- a/server/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java +++ b/server/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java @@ -167,17 +167,7 @@ public final class AnalysisRegistry implements Closeable { tokenFilters.put("synonym", requiresAnalysisSettings((is, env, name, settings) -> new SynonymTokenFilterFactory(is, env, this, name, settings))); tokenFilters.put("synonym_graph", requiresAnalysisSettings((is, env, name, settings) -> new SynonymGraphTokenFilterFactory(is, env, this, name, settings))); - Map mappings - = buildMapping(Component.FILTER, indexSettings, tokenFiltersSettings, Collections.unmodifiableMap(tokenFilters), prebuiltAnalysis.preConfiguredTokenFilters); - - // ReferringTokenFilters require references to other tokenfilters, so we pass these in - // after all factories have been registered - for (TokenFilterFactory tff : mappings.values()) { - if (tff instanceof ReferringFilterFactory) { - ((ReferringFilterFactory)tff).setReferences(mappings); - } - } - return mappings; + return buildMapping(Component.FILTER, indexSettings, tokenFiltersSettings, Collections.unmodifiableMap(tokenFilters), prebuiltAnalysis.preConfiguredTokenFilters); } public Map buildTokenizerFactories(IndexSettings indexSettings) throws IOException { diff --git a/server/src/main/java/org/elasticsearch/index/analysis/CustomAnalyzerProvider.java b/server/src/main/java/org/elasticsearch/index/analysis/CustomAnalyzerProvider.java index 4ba07805164..a24c9aef790 100644 --- a/server/src/main/java/org/elasticsearch/index/analysis/CustomAnalyzerProvider.java +++ b/server/src/main/java/org/elasticsearch/index/analysis/CustomAnalyzerProvider.java @@ -81,9 +81,7 @@ public class CustomAnalyzerProvider extends AbstractIndexAnalyzerProvider tokenFilterList, - List charFiltersList, Environment env) { - if (tokenFilter instanceof SynonymGraphTokenFilterFactory) { - List tokenFiltersListForSynonym = new ArrayList<>(tokenFilterList); - - try (CustomAnalyzer analyzer = new CustomAnalyzer(tokenizerName, tokenizer, - 
charFiltersList.toArray(new CharFilterFactory[charFiltersList.size()]), - tokenFiltersListForSynonym.toArray(new TokenFilterFactory[tokenFiltersListForSynonym.size()]), - TextFieldMapper.Defaults.POSITION_INCREMENT_GAP, - -1)){ - tokenFilter = ((SynonymGraphTokenFilterFactory) tokenFilter).createPerAnalyzerSynonymGraphFactory(analyzer, env); - } - - } else if (tokenFilter instanceof SynonymTokenFilterFactory) { - List tokenFiltersListForSynonym = new ArrayList<>(tokenFilterList); - try (CustomAnalyzer analyzer = new CustomAnalyzer(tokenizerName, tokenizer, - charFiltersList.toArray(new CharFilterFactory[charFiltersList.size()]), - tokenFiltersListForSynonym.toArray(new TokenFilterFactory[tokenFiltersListForSynonym.size()]), - TextFieldMapper.Defaults.POSITION_INCREMENT_GAP, - -1)) { - tokenFilter = ((SynonymTokenFilterFactory) tokenFilter).createPerAnalyzerSynonymFactory(analyzer, env); - } - } - return tokenFilter; - } - @Override public CustomAnalyzer get() { return this.customAnalyzer; diff --git a/server/src/main/java/org/elasticsearch/index/analysis/ReferringFilterFactory.java b/server/src/main/java/org/elasticsearch/index/analysis/ReferringFilterFactory.java deleted file mode 100644 index 9eb9bc2dbd6..00000000000 --- a/server/src/main/java/org/elasticsearch/index/analysis/ReferringFilterFactory.java +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Licensed to Elasticsearch under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. Elasticsearch licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.elasticsearch.index.analysis; - -import java.util.Map; - -/** - * Marks a {@link TokenFilterFactory} that refers to other filter factories. 
- * - * The analysis registry will call {@link #setReferences(Map)} with a map of all - * available TokenFilterFactories after all factories have been registered - */ -public interface ReferringFilterFactory { - - /** - * Called with a map of all registered filter factories - */ - void setReferences(Map factories); - -} diff --git a/server/src/main/java/org/elasticsearch/index/analysis/SynonymGraphTokenFilterFactory.java b/server/src/main/java/org/elasticsearch/index/analysis/SynonymGraphTokenFilterFactory.java index 24dcb6d33fe..200e426fbd4 100644 --- a/server/src/main/java/org/elasticsearch/index/analysis/SynonymGraphTokenFilterFactory.java +++ b/server/src/main/java/org/elasticsearch/index/analysis/SynonymGraphTokenFilterFactory.java @@ -28,9 +28,11 @@ import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; import java.io.IOException; -import java.io.Reader; +import java.util.List; +import java.util.function.Function; public class SynonymGraphTokenFilterFactory extends SynonymTokenFilterFactory { + public SynonymGraphTokenFilterFactory(IndexSettings indexSettings, Environment env, AnalysisRegistry analysisRegistry, String name, Settings settings) throws IOException { super(indexSettings, env, analysisRegistry, name, settings); @@ -41,42 +43,24 @@ public class SynonymGraphTokenFilterFactory extends SynonymTokenFilterFactory { throw new IllegalStateException("Call createPerAnalyzerSynonymGraphFactory to specialize this factory for an analysis chain first"); } - Factory createPerAnalyzerSynonymGraphFactory(Analyzer analyzerForParseSynonym, Environment env){ - return new Factory("synonymgraph", analyzerForParseSynonym, getRulesFromSettings(env)); - } - - public class Factory implements TokenFilterFactory{ - - private final String name; - private final SynonymMap synonymMap; - - public Factory(String name, final Analyzer analyzerForParseSynonym, Reader rulesReader) { - this.name = name; - - try { - SynonymMap.Builder parser; - if ("wordnet".equalsIgnoreCase(format)) { - parser = new ESWordnetSynonymParser(true, expand, lenient, analyzerForParseSynonym); - ((ESWordnetSynonymParser) parser).parse(rulesReader); - } else { - parser = new ESSolrSynonymParser(true, expand, lenient, analyzerForParseSynonym); - ((ESSolrSynonymParser) parser).parse(rulesReader); - } - synonymMap = parser.build(); - } catch (Exception e) { - throw new IllegalArgumentException("failed to build synonyms", e); + @Override + public TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer, List charFilters, + List previousTokenFilters, + Function allFilters) { + final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters); + final SynonymMap synonyms = buildSynonyms(analyzer, getRulesFromSettings(environment)); + final String name = name(); + return new TokenFilterFactory() { + @Override + public String name() { + return name; } - } - @Override - public String name() { - return this.name; - } - - @Override - public TokenStream create(TokenStream tokenStream) { - // fst is null means no synonyms - return synonymMap.fst == null ? tokenStream : new SynonymGraphFilter(tokenStream, synonymMap, false); - } + @Override + public TokenStream create(TokenStream tokenStream) { + return synonyms.fst == null ? 
tokenStream : new SynonymGraphFilter(tokenStream, synonyms, false); + } + }; } + } diff --git a/server/src/main/java/org/elasticsearch/index/analysis/SynonymTokenFilterFactory.java b/server/src/main/java/org/elasticsearch/index/analysis/SynonymTokenFilterFactory.java index 61c9aba7a3e..c18e8c94310 100644 --- a/server/src/main/java/org/elasticsearch/index/analysis/SynonymTokenFilterFactory.java +++ b/server/src/main/java/org/elasticsearch/index/analysis/SynonymTokenFilterFactory.java @@ -31,6 +31,7 @@ import java.io.IOException; import java.io.Reader; import java.io.StringReader; import java.util.List; +import java.util.function.Function; public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory { @@ -38,6 +39,7 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory { protected final boolean expand; protected final boolean lenient; protected final Settings settings; + protected final Environment environment; public SynonymTokenFilterFactory(IndexSettings indexSettings, Environment env, AnalysisRegistry analysisRegistry, String name, Settings settings) throws IOException { @@ -53,6 +55,7 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory { this.expand = settings.getAsBoolean("expand", true); this.lenient = settings.getAsBoolean("lenient", false); this.format = settings.get("format", ""); + this.environment = env; } @Override @@ -60,6 +63,50 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory { throw new IllegalStateException("Call createPerAnalyzerSynonymFactory to specialize this factory for an analysis chain first"); } + @Override + public TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer, List charFilters, + List previousTokenFilters, + Function allFilters) { + final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters); + final SynonymMap synonyms = buildSynonyms(analyzer, getRulesFromSettings(environment)); + final String name = name(); + return new TokenFilterFactory() { + @Override + public String name() { + return name; + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return synonyms.fst == null ? 
tokenStream : new SynonymFilter(tokenStream, synonyms, false); + } + }; + } + + protected Analyzer buildSynonymAnalyzer(TokenizerFactory tokenizer, List charFilters, + List tokenFilters) { + return new CustomAnalyzer("synonyms", tokenizer, charFilters.toArray(new CharFilterFactory[0]), + tokenFilters.stream() + .map(TokenFilterFactory::getSynonymFilter) + .toArray(TokenFilterFactory[]::new)); + } + + protected SynonymMap buildSynonyms(Analyzer analyzer, Reader rules) { + try { + SynonymMap.Builder parser; + if ("wordnet".equalsIgnoreCase(format)) { + parser = new ESWordnetSynonymParser(true, expand, lenient, analyzer); + ((ESWordnetSynonymParser) parser).parse(rules); + } else { + parser = new ESSolrSynonymParser(true, expand, lenient, analyzer); + ((ESSolrSynonymParser) parser).parse(rules); + } + return parser.build(); + } catch (Exception e) { + throw new IllegalArgumentException("failed to build synonyms", e); + } + } + protected Reader getRulesFromSettings(Environment env) { Reader rulesReader; if (settings.getAsList("synonyms", null) != null) { @@ -77,44 +124,4 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory { return rulesReader; } - Factory createPerAnalyzerSynonymFactory(Analyzer analyzerForParseSynonym, Environment env){ - return new Factory("synonym", analyzerForParseSynonym, getRulesFromSettings(env)); - } - - public class Factory implements TokenFilterFactory{ - - private final String name; - private final SynonymMap synonymMap; - - public Factory(String name, Analyzer analyzerForParseSynonym, Reader rulesReader) { - - this.name = name; - - try { - SynonymMap.Builder parser; - if ("wordnet".equalsIgnoreCase(format)) { - parser = new ESWordnetSynonymParser(true, expand, lenient, analyzerForParseSynonym); - ((ESWordnetSynonymParser) parser).parse(rulesReader); - } else { - parser = new ESSolrSynonymParser(true, expand, lenient, analyzerForParseSynonym); - ((ESSolrSynonymParser) parser).parse(rulesReader); - } - synonymMap = parser.build(); - } catch (Exception e) { - throw new IllegalArgumentException("failed to build synonyms", e); - } - } - - @Override - public String name() { - return this.name; - } - - @Override - public TokenStream create(TokenStream tokenStream) { - // fst is null means no synonyms - return synonymMap.fst == null ? 
tokenStream : new SynonymFilter(tokenStream, synonymMap, false); - } - } - } diff --git a/server/src/main/java/org/elasticsearch/index/analysis/TokenFilterFactory.java b/server/src/main/java/org/elasticsearch/index/analysis/TokenFilterFactory.java index c90138d7a23..9d9a48c3a33 100644 --- a/server/src/main/java/org/elasticsearch/index/analysis/TokenFilterFactory.java +++ b/server/src/main/java/org/elasticsearch/index/analysis/TokenFilterFactory.java @@ -23,6 +23,9 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.elasticsearch.search.fetch.subphase.highlight.FastVectorHighlighter; +import java.util.List; +import java.util.function.Function; + public interface TokenFilterFactory { String name(); @@ -36,4 +39,43 @@ public interface TokenFilterFactory { default boolean breaksFastVectorHighlighter() { return false; } + + /** + * Rewrite the TokenFilterFactory to take into account the preceding analysis chain, or refer + * to other TokenFilterFactories + * @param tokenizer the TokenizerFactory for the preceding chain + * @param charFilters any CharFilterFactories for the preceding chain + * @param previousTokenFilters a list of TokenFilterFactories in the preceding chain + * @param allFilters access to previously defined TokenFilterFactories + */ + default TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer, List charFilters, + List previousTokenFilters, + Function allFilters) { + return this; + } + + /** + * Return a version of this TokenFilterFactory appropriate for synonym parsing + * + * Filters that should not be applied to synonyms (for example, those that produce + * multiple tokens) can return {@link #IDENTITY_FILTER} + */ + default TokenFilterFactory getSynonymFilter() { + return this; + } + + /** + * A TokenFilterFactory that does no filtering to its TokenStream + */ + TokenFilterFactory IDENTITY_FILTER = new TokenFilterFactory() { + @Override + public String name() { + return "identity"; + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return tokenStream; + } + }; } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/CategorizationAnalyzer.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/CategorizationAnalyzer.java index 6111fa139f9..d9af90e00f6 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/CategorizationAnalyzer.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/CategorizationAnalyzer.java @@ -20,7 +20,6 @@ import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.analysis.AnalysisRegistry; import org.elasticsearch.index.analysis.CharFilterFactory; import org.elasticsearch.index.analysis.CustomAnalyzer; -import org.elasticsearch.index.analysis.CustomAnalyzerProvider; import org.elasticsearch.index.analysis.TokenFilterFactory; import org.elasticsearch.index.analysis.TokenizerFactory; import org.elasticsearch.indices.analysis.AnalysisModule; @@ -217,6 +216,8 @@ public class CategorizationAnalyzer implements Closeable { Tuple tokenizerFactory, List charFilterFactoryList) throws IOException { List tokenFilters = config.getTokenFilters(); + TransportAnalyzeAction.DeferredTokenFilterRegistry deferredRegistry + = new TransportAnalyzeAction.DeferredTokenFilterRegistry(analysisRegistry, null); final List tokenFilterFactoryList = new ArrayList<>(); for 
(CategorizationAnalyzerConfig.NameOrDefinition tokenFilter : tokenFilters) { TokenFilterFactory tokenFilterFactory; @@ -241,8 +242,8 @@ public class CategorizationAnalyzer implements Closeable { // Need to set anonymous "name" of token_filter tokenFilterFactory = tokenFilterFactoryFactory.get(buildDummyIndexSettings(settings), environment, "_anonymous_tokenfilter", settings); - tokenFilterFactory = CustomAnalyzerProvider.checkAndApplySynonymFilter(tokenFilterFactory, tokenizerFactory.v1(), - tokenizerFactory.v2(), tokenFilterFactoryList, charFilterFactoryList, environment); + tokenFilterFactory = tokenFilterFactory.getChainAwareTokenFilterFactory(tokenizerFactory.v2(), + charFilterFactoryList, tokenFilterFactoryList, deferredRegistry); } if (tokenFilterFactory == null) { throw new IllegalArgumentException("Failed to find or create token filter [" + tokenFilter + "]");
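To illustrate the two new extension points, the following is a rough sketch of a hypothetical referring filter factory. It is not part of this patch: the class, field and message names are invented, and only the TokenFilterFactory methods shown in the diff above are assumed. It mirrors the approach taken by MultiplexerTokenFilterFactory and ScriptedConditionTokenFilterFactory: resolve the referenced filter once the chain is known, and opt out of synonym parsing by returning IDENTITY_FILTER.

import java.util.List;
import java.util.function.Function;

import org.apache.lucene.analysis.TokenStream;
import org.elasticsearch.index.analysis.CharFilterFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.index.analysis.TokenizerFactory;

// Hypothetical factory that refers to another filter by name and would emit
// stacked tokens, showing how the new methods might be used.
class ExampleReferringTokenFilterFactory implements TokenFilterFactory {

    private final String name;
    private final String delegateName;

    ExampleReferringTokenFilterFactory(String name, String delegateName) {
        this.name = name;
        this.delegateName = delegateName;
    }

    @Override
    public String name() {
        return name;
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        // References are only resolved once the preceding chain is known
        throw new UnsupportedOperationException("getChainAwareTokenFilterFactory() must be called first");
    }

    @Override
    public TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer,
                                                              List<CharFilterFactory> charFilters,
                                                              List<TokenFilterFactory> previousTokenFilters,
                                                              Function<String, TokenFilterFactory> allFilters) {
        TokenFilterFactory delegate = allFilters.apply(delegateName);
        if (delegate == null) {
            throw new IllegalArgumentException("Filter [" + name + "] refers to undefined filter [" + delegateName + "]");
        }
        // Let the referenced filter rewrite itself against the same preceding chain
        TokenFilterFactory resolved
            = delegate.getChainAwareTokenFilterFactory(tokenizer, charFilters, previousTokenFilters, allFilters);
        return new TokenFilterFactory() {
            @Override
            public String name() {
                return name;
            }

            @Override
            public TokenStream create(TokenStream tokenStream) {
                return resolved.create(tokenStream);
            }

            @Override
            public TokenFilterFactory getSynonymFilter() {
                // This filter would produce multiple tokens at the same position,
                // so keep it out of the chain used to parse synonym lists
                return TokenFilterFactory.IDENTITY_FILTER;
            }
        };
    }
}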