Allow TokenFilterFactories to rewrite themselves against their preceding chain (#33702)

We currently special-case SynonymFilterFactory and SynonymGraphFilterFactory, which need to 
know their predecessors in the analysis chain in order to correctly analyze their synonym lists. This
special-casing doesn't work with Referring filter factories, such as the Multiplexer or Conditional
filters. We also have a number of filters (eg the Multiplexer) that will break synonyms when they
appear before them in a chain, because they produce multiple tokens at the same position.

This commit adds two methods to the TokenFilterFactory interface.

* `getChainAwareTokenFilterFactory()` allows a filter factory to rewrite itself against its preceding
  filter chain, or to resolve references to other filters. It replaces `ReferringFilterFactory` and
  `CustomAnalyzerProvider.checkAndApplySynonymFilter`, and by default returns `this`.
* `getSynonymFilter()` returns the version of a filter factory that should be applied when building
  a synonym list `Analyzer`. By default it returns `this`; filters that should be skipped when
  parsing synonyms (for example, those that emit multiple tokens at a position) can return the
  no-op `IDENTITY_FILTER` instead.

Fixes #33609
This commit is contained in:
Alan Woodward 2018-09-19 15:52:14 +01:00 committed by GitHub
parent 4190a9f1e9
commit 5107949402
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 271 additions and 241 deletions

View File

@ -114,3 +114,11 @@ And it'd respond:
<1> The stemmer has also emitted a token `home` at position 1, but because it is a
duplicate of this token it has been removed from the token stream
NOTE: The synonym and synonym_graph filters use their preceding analysis chain to
parse and analyze their synonym lists, and ignore any token filters in the chain
that produce multiple tokens at the same position. This means that any filters
within the multiplexer will be ignored for the purpose of synonyms. If you want to
use filters contained within the multiplexer for parsing synonyms (for example, to
apply stemming to the synonym lists), then you should append the synonym filter
to the relevant multiplexer filter list.

View File

@ -29,33 +29,20 @@ import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.ReferringFilterFactory;
import org.elasticsearch.index.analysis.CharFilterFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.index.analysis.TokenizerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory implements ReferringFilterFactory {
public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory {
private List<TokenFilterFactory> filters;
private List<String> filterNames;
private final boolean preserveOriginal;
private static final TokenFilterFactory IDENTITY_FACTORY = new TokenFilterFactory() {
@Override
public String name() {
return "identity";
}
@Override
public TokenStream create(TokenStream tokenStream) {
return tokenStream;
}
};
public MultiplexerTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) throws IOException {
super(indexSettings, name, settings);
this.filterNames = settings.getAsList("filters");
@ -64,31 +51,56 @@ public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory im
@Override
public TokenStream create(TokenStream tokenStream) {
List<Function<TokenStream, TokenStream>> functions = new ArrayList<>();
for (TokenFilterFactory tff : filters) {
functions.add(tff::create);
}
return new RemoveDuplicatesTokenFilter(new MultiplexTokenFilter(tokenStream, functions));
throw new UnsupportedOperationException("TokenFilterFactory.getChainAwareTokenFilterFactory() must be called first");
}
@Override
public void setReferences(Map<String, TokenFilterFactory> factories) {
filters = new ArrayList<>();
public TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
List<TokenFilterFactory> previousTokenFilters,
Function<String, TokenFilterFactory> allFilters) {
List<TokenFilterFactory> filters = new ArrayList<>();
if (preserveOriginal) {
filters.add(IDENTITY_FACTORY);
filters.add(IDENTITY_FILTER);
}
for (String filter : filterNames) {
String[] parts = Strings.tokenizeToStringArray(filter, ",");
if (parts.length == 1) {
filters.add(resolveFilterFactory(factories, parts[0]));
TokenFilterFactory factory = resolveFilterFactory(allFilters, parts[0]);
factory = factory.getChainAwareTokenFilterFactory(tokenizer, charFilters, previousTokenFilters, allFilters);
filters.add(factory);
} else {
List<TokenFilterFactory> existingChain = new ArrayList<>(previousTokenFilters);
List<TokenFilterFactory> chain = new ArrayList<>();
for (String subfilter : parts) {
chain.add(resolveFilterFactory(factories, subfilter));
TokenFilterFactory factory = resolveFilterFactory(allFilters, subfilter);
factory = factory.getChainAwareTokenFilterFactory(tokenizer, charFilters, existingChain, allFilters);
chain.add(factory);
existingChain.add(factory);
}
filters.add(chainFilters(filter, chain));
}
}
return new TokenFilterFactory() {
@Override
public String name() {
return MultiplexerTokenFilterFactory.this.name();
}
@Override
public TokenStream create(TokenStream tokenStream) {
List<Function<TokenStream, TokenStream>> functions = new ArrayList<>();
for (TokenFilterFactory tff : filters) {
functions.add(tff::create);
}
return new RemoveDuplicatesTokenFilter(new MultiplexTokenFilter(tokenStream, functions));
}
@Override
public TokenFilterFactory getSynonymFilter() {
return IDENTITY_FILTER;
}
};
}
private TokenFilterFactory chainFilters(String name, List<TokenFilterFactory> filters) {
@ -108,11 +120,12 @@ public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory im
};
}
private TokenFilterFactory resolveFilterFactory(Map<String, TokenFilterFactory> factories, String name) {
if (factories.containsKey(name) == false) {
private TokenFilterFactory resolveFilterFactory(Function<String, TokenFilterFactory> factories, String name) {
TokenFilterFactory factory = factories.apply(name);
if (factory == null) {
throw new IllegalArgumentException("Multiplexing filter [" + name() + "] refers to undefined tokenfilter [" + name + "]");
} else {
return factories.get(name);
return factory;
}
}

View File

@ -24,26 +24,24 @@ import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.ReferringFilterFactory;
import org.elasticsearch.index.analysis.CharFilterFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.index.analysis.TokenizerFactory;
import org.elasticsearch.script.Script;
import org.elasticsearch.script.ScriptService;
import org.elasticsearch.script.ScriptType;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
/**
* A factory for a conditional token filter that only applies child filters if the underlying token
* matches an {@link AnalysisPredicateScript}
*/
public class ScriptedConditionTokenFilterFactory extends AbstractTokenFilterFactory implements ReferringFilterFactory {
public class ScriptedConditionTokenFilterFactory extends AbstractTokenFilterFactory {
private final AnalysisPredicateScript.Factory factory;
private final List<TokenFilterFactory> filters = new ArrayList<>();
private final List<String> filterNames;
ScriptedConditionTokenFilterFactory(IndexSettings indexSettings, String name,
@ -65,13 +63,43 @@ public class ScriptedConditionTokenFilterFactory extends AbstractTokenFilterFact
@Override
public TokenStream create(TokenStream tokenStream) {
Function<TokenStream, TokenStream> filter = in -> {
for (TokenFilterFactory tff : filters) {
in = tff.create(in);
throw new UnsupportedOperationException("getChainAwareTokenFilterFactory should be called first");
}
@Override
public TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
List<TokenFilterFactory> previousTokenFilters,
Function<String, TokenFilterFactory> allFilters) {
List<TokenFilterFactory> filters = new ArrayList<>();
List<TokenFilterFactory> existingChain = new ArrayList<>(previousTokenFilters);
for (String filter : filterNames) {
TokenFilterFactory tff = allFilters.apply(filter);
if (tff == null) {
throw new IllegalArgumentException("ScriptedConditionTokenFilter [" + name() +
"] refers to undefined token filter [" + filter + "]");
}
tff = tff.getChainAwareTokenFilterFactory(tokenizer, charFilters, existingChain, allFilters);
filters.add(tff);
existingChain.add(tff);
}
return new TokenFilterFactory() {
@Override
public String name() {
return ScriptedConditionTokenFilterFactory.this.name();
}
@Override
public TokenStream create(TokenStream tokenStream) {
Function<TokenStream, TokenStream> filter = in -> {
for (TokenFilterFactory tff : filters) {
in = tff.create(in);
}
return in;
};
return new ScriptedConditionTokenFilter(tokenStream, filter, factory.newInstance());
}
return in;
};
return new ScriptedConditionTokenFilter(tokenStream, filter, factory.newInstance());
}
private static class ScriptedConditionTokenFilter extends ConditionalTokenFilter {
@ -80,29 +108,17 @@ public class ScriptedConditionTokenFilterFactory extends AbstractTokenFilterFact
private final AnalysisPredicateScript.Token token;
ScriptedConditionTokenFilter(TokenStream input, Function<TokenStream, TokenStream> inputFactory,
AnalysisPredicateScript script) {
AnalysisPredicateScript script) {
super(input, inputFactory);
this.script = script;
this.token = new AnalysisPredicateScript.Token(this);
}
@Override
protected boolean shouldFilter() throws IOException {
protected boolean shouldFilter() {
token.updatePosition();
return script.execute(token);
}
}
@Override
public void setReferences(Map<String, TokenFilterFactory> factories) {
for (String filter : filterNames) {
TokenFilterFactory tff = factories.get(filter);
if (tff == null) {
throw new IllegalArgumentException("ScriptedConditionTokenFilter [" + name() +
"] refers to undefined token filter [" + filter + "]");
}
filters.add(tff);
}
}
}

View File

@ -20,6 +20,7 @@
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.elasticsearch.Version;
@ -117,6 +118,26 @@ public class SynonymsAnalysisTests extends ESTestCase {
}
}
// Regression test for #33609: a synonym filter referenced from inside a
// multiplexer branch must see the preceding analysis chain. The "stem_repeat"
// multiplexer has one comma-separated branch chaining a porter2 stemmer into
// the synonym filter, so "developers" is expected to emit the original token
// plus the stemmed and synonym-derived variants at the same position
// (position increments of 0).
public void testSynonymsWithMultiplexer() throws IOException {
Settings settings = Settings.builder()
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
.put("path.home", createTempDir().toString())
.put("index.analysis.filter.synonyms.type", "synonym")
.putList("index.analysis.filter.synonyms.synonyms", "programmer, developer")
.put("index.analysis.filter.my_english.type", "stemmer")
.put("index.analysis.filter.my_english.language", "porter2")
.put("index.analysis.filter.stem_repeat.type", "multiplexer")
.putList("index.analysis.filter.stem_repeat.filters", "my_english, synonyms")
.put("index.analysis.analyzer.synonymAnalyzer.tokenizer", "standard")
.putList("index.analysis.analyzer.synonymAnalyzer.filter", "lowercase", "stem_repeat")
.build();
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers;
// "developers" -> lowercase original, its porter2 stem, and the stem of the
// synonym "programmer", all stacked at the same position.
BaseTokenStreamTestCase.assertAnalyzesTo(indexAnalyzers.get("synonymAnalyzer"), "Some developers are odd",
new String[]{ "some", "developers", "develop", "programm", "are", "odd" },
new int[]{ 1, 1, 0, 0, 1, 1 });
}
private void match(String analyzerName, String source, String target) throws IOException {
Analyzer analyzer = indexAnalyzers.get(analyzerName).analyzer();

View File

@ -48,11 +48,9 @@ import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AnalysisRegistry;
import org.elasticsearch.index.analysis.CharFilterFactory;
import org.elasticsearch.index.analysis.CustomAnalyzer;
import org.elasticsearch.index.analysis.CustomAnalyzerProvider;
import org.elasticsearch.index.analysis.IndexAnalyzers;
import org.elasticsearch.index.analysis.MultiTermAwareComponent;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.index.analysis.ReferringFilterFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.index.analysis.TokenizerFactory;
import org.elasticsearch.index.mapper.KeywordFieldMapper;
@ -66,6 +64,7 @@ import org.elasticsearch.transport.TransportService;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.io.UncheckedIOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
@ -73,6 +72,7 @@ import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.function.Function;
/**
* Transport action used to execute analyze requests
@ -571,11 +571,48 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
return charFilterFactoryList;
}
/**
 * Lazily resolves token filter names to {@link TokenFilterFactory} instances
 * via the {@link AnalysisRegistry}.
 *
 * The full map of token filters is only built on the first call to
 * {@link #apply(String)}, so this can be handed to
 * {@code getChainAwareTokenFilterFactory} up-front without paying the cost of
 * building every filter unless a cross-filter reference is actually followed.
 */
public static class DeferredTokenFilterRegistry implements Function<String, TokenFilterFactory> {
private final AnalysisRegistry analysisRegistry;
private final IndexSettings indexSettings;
// Lazily initialised by apply(); intentionally not built in the constructor.
Map<String, TokenFilterFactory> prebuiltFilters;
public DeferredTokenFilterRegistry(AnalysisRegistry analysisRegistry, IndexSettings indexSettings) {
this.analysisRegistry = analysisRegistry;
if (indexSettings == null) {
// Settings are null when _analyze is called with no index name, so
// we create dummy settings which will make prebuilt analysis components
// available
Settings settings = Settings.builder()
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
.put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0)
.put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
.put(IndexMetaData.SETTING_INDEX_UUID, UUIDs.randomBase64UUID())
.build();
IndexMetaData metaData = IndexMetaData.builder(IndexMetaData.INDEX_UUID_NA_VALUE).settings(settings).build();
indexSettings = new IndexSettings(metaData, Settings.EMPTY);
}
this.indexSettings = indexSettings;
}
/**
 * Returns the registered filter factory for {@code s}, or {@code null} if
 * no filter with that name exists. Builds the filter map on first use;
 * an {@link IOException} from the registry is rethrown unchecked because
 * {@link Function#apply} cannot declare it.
 */
@Override
public TokenFilterFactory apply(String s) {
if (prebuiltFilters == null) {
try {
prebuiltFilters = analysisRegistry.buildTokenFilterFactories(indexSettings);
} catch (IOException e) {
throw new UncheckedIOException(e);
}
}
return prebuiltFilters.get(s);
}
}
private static List<TokenFilterFactory> parseTokenFilterFactories(AnalyzeRequest request, IndexSettings indexSettings, AnalysisRegistry analysisRegistry,
Environment environment, Tuple<String, TokenizerFactory> tokenizerFactory,
List<CharFilterFactory> charFilterFactoryList, boolean normalizer) throws IOException {
List<TokenFilterFactory> tokenFilterFactoryList = new ArrayList<>();
List<ReferringFilterFactory> referringFilters = new ArrayList<>();
DeferredTokenFilterRegistry deferredRegistry = new DeferredTokenFilterRegistry(analysisRegistry, indexSettings);
if (request.tokenFilters() != null && request.tokenFilters().size() > 0) {
List<AnalyzeRequest.NameOrDefinition> tokenFilters = request.tokenFilters();
for (AnalyzeRequest.NameOrDefinition tokenFilter : tokenFilters) {
@ -594,11 +631,8 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
}
// Need to set anonymous "name" of tokenfilter
tokenFilterFactory = tokenFilterFactoryFactory.get(getNaIndexSettings(settings), environment, "_anonymous_tokenfilter", settings);
tokenFilterFactory = CustomAnalyzerProvider.checkAndApplySynonymFilter(tokenFilterFactory, tokenizerFactory.v1(), tokenizerFactory.v2(), tokenFilterFactoryList,
charFilterFactoryList, environment);
if (tokenFilterFactory instanceof ReferringFilterFactory) {
referringFilters.add((ReferringFilterFactory)tokenFilterFactory);
}
tokenFilterFactory = tokenFilterFactory.getChainAwareTokenFilterFactory(tokenizerFactory.v2(), charFilterFactoryList,
tokenFilterFactoryList, deferredRegistry);
} else {
AnalysisModule.AnalysisProvider<TokenFilterFactory> tokenFilterFactoryFactory;
@ -616,8 +650,8 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
Settings settings = AnalysisRegistry.getSettingsFromIndexSettings(indexSettings,
AnalysisRegistry.INDEX_ANALYSIS_FILTER + "." + tokenFilter.name);
tokenFilterFactory = tokenFilterFactoryFactory.get(indexSettings, environment, tokenFilter.name, settings);
tokenFilterFactory = CustomAnalyzerProvider.checkAndApplySynonymFilter(tokenFilterFactory, tokenizerFactory.v1(), tokenizerFactory.v2(), tokenFilterFactoryList,
charFilterFactoryList, environment);
tokenFilterFactory = tokenFilterFactory.getChainAwareTokenFilterFactory(tokenizerFactory.v2(), charFilterFactoryList,
tokenFilterFactoryList, deferredRegistry);
}
}
if (tokenFilterFactory == null) {
@ -633,26 +667,6 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
tokenFilterFactoryList.add(tokenFilterFactory);
}
}
if (referringFilters.isEmpty() == false) {
// The request included at least one custom referring tokenfilter that has not already been built by the
// analysis registry, so we need to set its references. Note that this will only apply pre-built
// tokenfilters
if (indexSettings == null) {
Settings settings = Settings.builder()
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
.put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0)
.put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
.put(IndexMetaData.SETTING_INDEX_UUID, UUIDs.randomBase64UUID())
.build();
IndexMetaData metaData = IndexMetaData.builder(IndexMetaData.INDEX_UUID_NA_VALUE).settings(settings).build();
indexSettings = new IndexSettings(metaData, Settings.EMPTY);
}
Map<String, TokenFilterFactory> prebuiltFilters = analysisRegistry.buildTokenFilterFactories(indexSettings);
for (ReferringFilterFactory rff : referringFilters) {
rff.setReferences(prebuiltFilters);
}
}
return tokenFilterFactoryList;
}

View File

@ -167,17 +167,7 @@ public final class AnalysisRegistry implements Closeable {
tokenFilters.put("synonym", requiresAnalysisSettings((is, env, name, settings) -> new SynonymTokenFilterFactory(is, env, this, name, settings)));
tokenFilters.put("synonym_graph", requiresAnalysisSettings((is, env, name, settings) -> new SynonymGraphTokenFilterFactory(is, env, this, name, settings)));
Map<String, TokenFilterFactory> mappings
= buildMapping(Component.FILTER, indexSettings, tokenFiltersSettings, Collections.unmodifiableMap(tokenFilters), prebuiltAnalysis.preConfiguredTokenFilters);
// ReferringTokenFilters require references to other tokenfilters, so we pass these in
// after all factories have been registered
for (TokenFilterFactory tff : mappings.values()) {
if (tff instanceof ReferringFilterFactory) {
((ReferringFilterFactory)tff).setReferences(mappings);
}
}
return mappings;
return buildMapping(Component.FILTER, indexSettings, tokenFiltersSettings, Collections.unmodifiableMap(tokenFilters), prebuiltAnalysis.preConfiguredTokenFilters);
}
public Map<String, TokenizerFactory> buildTokenizerFactories(IndexSettings indexSettings) throws IOException {

View File

@ -81,9 +81,7 @@ public class CustomAnalyzerProvider extends AbstractIndexAnalyzerProvider<Custom
if (tokenFilter == null) {
throw new IllegalArgumentException("Custom Analyzer [" + name() + "] failed to find filter under name [" + tokenFilterName + "]");
}
// no need offsetGap for tokenize synonyms
tokenFilter = checkAndApplySynonymFilter(tokenFilter, tokenizerName, tokenizer, tokenFilterList, charFiltersList,
this.environment);
tokenFilter = tokenFilter.getChainAwareTokenFilterFactory(tokenizer, charFiltersList, tokenFilterList, tokenFilters::get);
tokenFilterList.add(tokenFilter);
}
@ -95,33 +93,6 @@ public class CustomAnalyzerProvider extends AbstractIndexAnalyzerProvider<Custom
);
}
public static TokenFilterFactory checkAndApplySynonymFilter(TokenFilterFactory tokenFilter, String tokenizerName, TokenizerFactory tokenizer,
List<TokenFilterFactory> tokenFilterList,
List<CharFilterFactory> charFiltersList, Environment env) {
if (tokenFilter instanceof SynonymGraphTokenFilterFactory) {
List<TokenFilterFactory> tokenFiltersListForSynonym = new ArrayList<>(tokenFilterList);
try (CustomAnalyzer analyzer = new CustomAnalyzer(tokenizerName, tokenizer,
charFiltersList.toArray(new CharFilterFactory[charFiltersList.size()]),
tokenFiltersListForSynonym.toArray(new TokenFilterFactory[tokenFiltersListForSynonym.size()]),
TextFieldMapper.Defaults.POSITION_INCREMENT_GAP,
-1)){
tokenFilter = ((SynonymGraphTokenFilterFactory) tokenFilter).createPerAnalyzerSynonymGraphFactory(analyzer, env);
}
} else if (tokenFilter instanceof SynonymTokenFilterFactory) {
List<TokenFilterFactory> tokenFiltersListForSynonym = new ArrayList<>(tokenFilterList);
try (CustomAnalyzer analyzer = new CustomAnalyzer(tokenizerName, tokenizer,
charFiltersList.toArray(new CharFilterFactory[charFiltersList.size()]),
tokenFiltersListForSynonym.toArray(new TokenFilterFactory[tokenFiltersListForSynonym.size()]),
TextFieldMapper.Defaults.POSITION_INCREMENT_GAP,
-1)) {
tokenFilter = ((SynonymTokenFilterFactory) tokenFilter).createPerAnalyzerSynonymFactory(analyzer, env);
}
}
return tokenFilter;
}
@Override
public CustomAnalyzer get() {
return this.customAnalyzer;

View File

@ -1,37 +0,0 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import java.util.Map;
/**
* Marks a {@link TokenFilterFactory} that refers to other filter factories.
*
* The analysis registry will call {@link #setReferences(Map)} with a map of all
* available TokenFilterFactories after all factories have been registered
*/
public interface ReferringFilterFactory {
/**
* Called with a map of all registered filter factories
*/
void setReferences(Map<String, TokenFilterFactory> factories);
}

View File

@ -28,9 +28,11 @@ import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import java.io.IOException;
import java.io.Reader;
import java.util.List;
import java.util.function.Function;
public class SynonymGraphTokenFilterFactory extends SynonymTokenFilterFactory {
public SynonymGraphTokenFilterFactory(IndexSettings indexSettings, Environment env, AnalysisRegistry analysisRegistry,
String name, Settings settings) throws IOException {
super(indexSettings, env, analysisRegistry, name, settings);
@ -41,42 +43,24 @@ public class SynonymGraphTokenFilterFactory extends SynonymTokenFilterFactory {
throw new IllegalStateException("Call createPerAnalyzerSynonymGraphFactory to specialize this factory for an analysis chain first");
}
Factory createPerAnalyzerSynonymGraphFactory(Analyzer analyzerForParseSynonym, Environment env){
return new Factory("synonymgraph", analyzerForParseSynonym, getRulesFromSettings(env));
}
public class Factory implements TokenFilterFactory{
private final String name;
private final SynonymMap synonymMap;
public Factory(String name, final Analyzer analyzerForParseSynonym, Reader rulesReader) {
this.name = name;
try {
SynonymMap.Builder parser;
if ("wordnet".equalsIgnoreCase(format)) {
parser = new ESWordnetSynonymParser(true, expand, lenient, analyzerForParseSynonym);
((ESWordnetSynonymParser) parser).parse(rulesReader);
} else {
parser = new ESSolrSynonymParser(true, expand, lenient, analyzerForParseSynonym);
((ESSolrSynonymParser) parser).parse(rulesReader);
}
synonymMap = parser.build();
} catch (Exception e) {
throw new IllegalArgumentException("failed to build synonyms", e);
@Override
public TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
List<TokenFilterFactory> previousTokenFilters,
Function<String, TokenFilterFactory> allFilters) {
final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters);
final SynonymMap synonyms = buildSynonyms(analyzer, getRulesFromSettings(environment));
final String name = name();
return new TokenFilterFactory() {
@Override
public String name() {
return name;
}
}
@Override
public String name() {
return this.name;
}
@Override
public TokenStream create(TokenStream tokenStream) {
// fst is null means no synonyms
return synonymMap.fst == null ? tokenStream : new SynonymGraphFilter(tokenStream, synonymMap, false);
}
@Override
public TokenStream create(TokenStream tokenStream) {
return synonyms.fst == null ? tokenStream : new SynonymGraphFilter(tokenStream, synonyms, false);
}
};
}
}

View File

@ -31,6 +31,7 @@ import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.List;
import java.util.function.Function;
public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
@ -38,6 +39,7 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
protected final boolean expand;
protected final boolean lenient;
protected final Settings settings;
protected final Environment environment;
public SynonymTokenFilterFactory(IndexSettings indexSettings, Environment env, AnalysisRegistry analysisRegistry,
String name, Settings settings) throws IOException {
@ -53,6 +55,7 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
this.expand = settings.getAsBoolean("expand", true);
this.lenient = settings.getAsBoolean("lenient", false);
this.format = settings.get("format", "");
this.environment = env;
}
@Override
@ -60,6 +63,50 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
throw new IllegalStateException("Call createPerAnalyzerSynonymFactory to specialize this factory for an analysis chain first");
}
/**
 * Specialises this synonym filter against its preceding analysis chain:
 * the tokenizer, char filters and previous token filters are used to build
 * the analyzer that parses the synonym rules, and the resulting
 * {@link SynonymMap} is captured in the returned factory.
 */
@Override
public TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
List<TokenFilterFactory> previousTokenFilters,
Function<String, TokenFilterFactory> allFilters) {
final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters);
final SynonymMap synonyms = buildSynonyms(analyzer, getRulesFromSettings(environment));
// Capture the outer factory's name for the anonymous wrapper below.
final String name = name();
return new TokenFilterFactory() {
@Override
public String name() {
return name;
}
@Override
public TokenStream create(TokenStream tokenStream) {
// A null FST means the rules produced no synonyms, so the stream
// can be returned unwrapped.
return synonyms.fst == null ? tokenStream : new SynonymFilter(tokenStream, synonyms, false);
}
};
}
/**
 * Builds the analyzer used to parse the synonym rules from the preceding
 * chain. Each preceding token filter is mapped through
 * {@link TokenFilterFactory#getSynonymFilter()}, so filters that should not
 * run during synonym parsing can substitute a no-op identity filter.
 */
protected Analyzer buildSynonymAnalyzer(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
List<TokenFilterFactory> tokenFilters) {
return new CustomAnalyzer("synonyms", tokenizer, charFilters.toArray(new CharFilterFactory[0]),
tokenFilters.stream()
.map(TokenFilterFactory::getSynonymFilter)
.toArray(TokenFilterFactory[]::new));
}
/**
 * Parses the synonym rules with the given analyzer and builds the
 * {@link SynonymMap}. The "wordnet" format selects the WordNet parser;
 * any other value falls through to the Solr-style parser. Parse failures
 * are wrapped in an {@link IllegalArgumentException}.
 */
protected SynonymMap buildSynonyms(Analyzer analyzer, Reader rules) {
try {
SynonymMap.Builder parser;
if ("wordnet".equalsIgnoreCase(format)) {
parser = new ESWordnetSynonymParser(true, expand, lenient, analyzer);
((ESWordnetSynonymParser) parser).parse(rules);
} else {
parser = new ESSolrSynonymParser(true, expand, lenient, analyzer);
((ESSolrSynonymParser) parser).parse(rules);
}
return parser.build();
} catch (Exception e) {
throw new IllegalArgumentException("failed to build synonyms", e);
}
}
protected Reader getRulesFromSettings(Environment env) {
Reader rulesReader;
if (settings.getAsList("synonyms", null) != null) {
@ -77,44 +124,4 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
return rulesReader;
}
Factory createPerAnalyzerSynonymFactory(Analyzer analyzerForParseSynonym, Environment env){
return new Factory("synonym", analyzerForParseSynonym, getRulesFromSettings(env));
}
public class Factory implements TokenFilterFactory{
private final String name;
private final SynonymMap synonymMap;
public Factory(String name, Analyzer analyzerForParseSynonym, Reader rulesReader) {
this.name = name;
try {
SynonymMap.Builder parser;
if ("wordnet".equalsIgnoreCase(format)) {
parser = new ESWordnetSynonymParser(true, expand, lenient, analyzerForParseSynonym);
((ESWordnetSynonymParser) parser).parse(rulesReader);
} else {
parser = new ESSolrSynonymParser(true, expand, lenient, analyzerForParseSynonym);
((ESSolrSynonymParser) parser).parse(rulesReader);
}
synonymMap = parser.build();
} catch (Exception e) {
throw new IllegalArgumentException("failed to build synonyms", e);
}
}
@Override
public String name() {
return this.name;
}
@Override
public TokenStream create(TokenStream tokenStream) {
// fst is null means no synonyms
return synonymMap.fst == null ? tokenStream : new SynonymFilter(tokenStream, synonymMap, false);
}
}
}

View File

@ -23,6 +23,9 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.elasticsearch.search.fetch.subphase.highlight.FastVectorHighlighter;
import java.util.List;
import java.util.function.Function;
public interface TokenFilterFactory {
String name();
@ -36,4 +39,43 @@ public interface TokenFilterFactory {
default boolean breaksFastVectorHighlighter() {
return false;
}
/**
* Rewrite the TokenFilterFactory to take into account the preceding analysis chain, or refer
* to other TokenFilterFactories
* @param tokenizer the TokenizerFactory for the preceding chain
* @param charFilters any CharFilterFactories for the preceding chain
* @param previousTokenFilters a list of TokenFilterFactories in the preceding chain
* @param allFilters access to previously defined TokenFilterFactories
*/
default TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
List<TokenFilterFactory> previousTokenFilters,
Function<String, TokenFilterFactory> allFilters) {
return this;
}
/**
* Return a version of this TokenFilterFactory appropriate for synonym parsing
*
* Filters that should not be applied to synonyms (for example, those that produce
* multiple tokens) can return {@link #IDENTITY_FILTER}
*/
default TokenFilterFactory getSynonymFilter() {
return this;
}
/**
* A TokenFilterFactory that does no filtering to its TokenStream
*/
TokenFilterFactory IDENTITY_FILTER = new TokenFilterFactory() {
@Override
public String name() {
return "identity";
}
@Override
public TokenStream create(TokenStream tokenStream) {
return tokenStream;
}
};
}

View File

@ -20,7 +20,6 @@ import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AnalysisRegistry;
import org.elasticsearch.index.analysis.CharFilterFactory;
import org.elasticsearch.index.analysis.CustomAnalyzer;
import org.elasticsearch.index.analysis.CustomAnalyzerProvider;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.index.analysis.TokenizerFactory;
import org.elasticsearch.indices.analysis.AnalysisModule;
@ -217,6 +216,8 @@ public class CategorizationAnalyzer implements Closeable {
Tuple<String, TokenizerFactory> tokenizerFactory,
List<CharFilterFactory> charFilterFactoryList) throws IOException {
List<CategorizationAnalyzerConfig.NameOrDefinition> tokenFilters = config.getTokenFilters();
TransportAnalyzeAction.DeferredTokenFilterRegistry deferredRegistry
= new TransportAnalyzeAction.DeferredTokenFilterRegistry(analysisRegistry, null);
final List<TokenFilterFactory> tokenFilterFactoryList = new ArrayList<>();
for (CategorizationAnalyzerConfig.NameOrDefinition tokenFilter : tokenFilters) {
TokenFilterFactory tokenFilterFactory;
@ -241,8 +242,8 @@ public class CategorizationAnalyzer implements Closeable {
// Need to set anonymous "name" of token_filter
tokenFilterFactory = tokenFilterFactoryFactory.get(buildDummyIndexSettings(settings), environment, "_anonymous_tokenfilter",
settings);
tokenFilterFactory = CustomAnalyzerProvider.checkAndApplySynonymFilter(tokenFilterFactory, tokenizerFactory.v1(),
tokenizerFactory.v2(), tokenFilterFactoryList, charFilterFactoryList, environment);
tokenFilterFactory = tokenFilterFactory.getChainAwareTokenFilterFactory(tokenizerFactory.v2(),
charFilterFactoryList, tokenFilterFactoryList, deferredRegistry);
}
if (tokenFilterFactory == null) {
throw new IllegalArgumentException("Failed to find or create token filter [" + tokenFilter + "]");