Multiplexing token filter (#31208)
The `multiplexer` filter emits multiple tokens at the same position, each version of the token having been passed through a different filter chain. Identical tokens at the same position are removed. This allows users to, for example, index lowercase and original-case tokens, or stemmed and unstemmed versions, in the same field, so that they can search for a stemmed term within x positions of an unstemmed term.
This commit is contained in:
parent
df10704ffc
commit
5683bc60a6
|
@ -35,6 +35,8 @@ include::tokenfilters/word-delimiter-tokenfilter.asciidoc[]
|
|||
|
||||
include::tokenfilters/word-delimiter-graph-tokenfilter.asciidoc[]
|
||||
|
||||
include::tokenfilters/multiplexer-tokenfilter.asciidoc[]
|
||||
|
||||
include::tokenfilters/stemmer-tokenfilter.asciidoc[]
|
||||
|
||||
include::tokenfilters/stemmer-override-tokenfilter.asciidoc[]
|
||||
|
|
|
@ -0,0 +1,116 @@
|
|||
[[analysis-multiplexer-tokenfilter]]
|
||||
=== Multiplexer Token Filter
|
||||
|
||||
A token filter of type `multiplexer` will emit multiple tokens at the same position,
|
||||
each version of the token having been run through a different filter. Identical
|
||||
output tokens at the same position will be removed.
|
||||
|
||||
WARNING: If the incoming token stream has duplicate tokens, then these will also be
|
||||
removed by the multiplexer
|
||||
|
||||
[float]
|
||||
=== Options
|
||||
[horizontal]
|
||||
filters:: a list of token filters to apply to incoming tokens. These can be any
|
||||
token filters defined elsewhere in the index mappings. Filters can be chained
|
||||
using a comma-delimited string, so for example `"lowercase, porter_stem"` would
|
||||
apply the `lowercase` filter and then the `porter_stem` filter to a single token.
|
||||
|
||||
WARNING: Shingle or multi-word synonym token filters will not function normally
|
||||
when they are declared in the filters array because they read ahead internally
|
||||
which is unsupported by the multiplexer
|
||||
|
||||
preserve_original:: if `true` (the default) then emit the original token in
|
||||
addition to the filtered tokens
|
||||
|
||||
|
||||
[float]
|
||||
=== Settings example
|
||||
|
||||
You can set it up like:
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
PUT /multiplexer_example
|
||||
{
|
||||
"settings" : {
|
||||
"analysis" : {
|
||||
"analyzer" : {
|
||||
"my_analyzer" : {
|
||||
"tokenizer" : "standard",
|
||||
"filter" : [ "my_multiplexer" ]
|
||||
}
|
||||
},
|
||||
"filter" : {
|
||||
"my_multiplexer" : {
|
||||
"type" : "multiplexer",
|
||||
"filters" : [ "lowercase", "lowercase, porter_stem" ]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
--------------------------------------------------
|
||||
// CONSOLE
|
||||
|
||||
And test it like:
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
POST /multiplexer_example/_analyze
|
||||
{
|
||||
"analyzer" : "my_analyzer",
|
||||
"text" : "Going HOME"
|
||||
}
|
||||
--------------------------------------------------
|
||||
// CONSOLE
|
||||
// TEST[continued]
|
||||
|
||||
And it'd respond:
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
{
|
||||
"tokens": [
|
||||
{
|
||||
"token": "Going",
|
||||
"start_offset": 0,
|
||||
"end_offset": 5,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 0
|
||||
},
|
||||
{
|
||||
"token": "going",
|
||||
"start_offset": 0,
|
||||
"end_offset": 5,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 0
|
||||
},
|
||||
{
|
||||
"token": "go",
|
||||
"start_offset": 0,
|
||||
"end_offset": 5,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 0
|
||||
},
|
||||
{
|
||||
"token": "HOME",
|
||||
"start_offset": 6,
|
||||
"end_offset": 10,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 1
|
||||
},
|
||||
{
|
||||
"token": "home", <1>
|
||||
"start_offset": 6,
|
||||
"end_offset": 10,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
--------------------------------------------------
|
||||
// TESTRESPONSE
|
||||
|
||||
<1> The stemmer has also emitted a token `home` at position 1, but because it is a
|
||||
duplicate of this token it has been removed from the token stream
|
|
@ -226,6 +226,7 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
|
|||
filters.put("limit", LimitTokenCountFilterFactory::new);
|
||||
filters.put("lowercase", LowerCaseTokenFilterFactory::new);
|
||||
filters.put("min_hash", MinHashTokenFilterFactory::new);
|
||||
filters.put("multiplexer", MultiplexerTokenFilterFactory::new);
|
||||
filters.put("ngram", NGramTokenFilterFactory::new);
|
||||
filters.put("nGram", NGramTokenFilterFactory::new);
|
||||
filters.put("pattern_capture", requriesAnalysisSettings(PatternCaptureGroupTokenFilterFactory::new));
|
||||
|
|
|
@ -0,0 +1,195 @@
|
|||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.analysis.common;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter;
|
||||
import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.elasticsearch.common.Strings;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.ReferringFilterFactory;
|
||||
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.function.Function;
|
||||
|
||||
/**
 * Factory for the {@code multiplexer} token filter. For each incoming token it emits one
 * output token per configured filter chain, all at the same position, and wraps the result
 * in a {@link RemoveDuplicatesTokenFilter} so identical tokens at the same position are
 * dropped. Filter chains are resolved by name from the other filters registered in the
 * index, via {@link ReferringFilterFactory#setReferences(Map)}.
 */
public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory implements ReferringFilterFactory {

    // Resolved filter chains; populated in setReferences(), after all factories are registered.
    private List<TokenFilterFactory> filters;
    // Raw "filters" setting values; each entry may be a comma-delimited chain, e.g. "lowercase, porter_stem".
    private List<String> filterNames;
    // When true (the default), the unmodified token is emitted in addition to the filtered versions.
    private final boolean preserveOriginal;

    // Pass-through factory used as the first slot when preserve_original is enabled.
    private static final TokenFilterFactory IDENTITY_FACTORY = new TokenFilterFactory() {
        @Override
        public String name() {
            return "identity";
        }

        @Override
        public TokenStream create(TokenStream tokenStream) {
            return tokenStream;
        }
    };

    public MultiplexerTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) throws IOException {
        super(indexSettings, name, settings);
        this.filterNames = settings.getAsList("filters");
        this.preserveOriginal = settings.getAsBoolean("preserve_original", true);
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        // Convert each resolved factory into a TokenStream-wrapping function, preserving order
        // (the identity/original slot, if present, is first — see setReferences()).
        List<Function<TokenStream, TokenStream>> functions = new ArrayList<>();
        for (TokenFilterFactory tff : filters) {
            functions.add(tff::create);
        }
        return new RemoveDuplicatesTokenFilter(new MultiplexTokenFilter(tokenStream, functions));
    }

    @Override
    public void setReferences(Map<String, TokenFilterFactory> factories) {
        filters = new ArrayList<>();
        if (preserveOriginal) {
            filters.add(IDENTITY_FACTORY);
        }
        for (String filter : filterNames) {
            // A single setting entry such as "lowercase, porter_stem" describes a chain of filters.
            String[] parts = Strings.tokenizeToStringArray(filter, ",");
            if (parts.length == 1) {
                filters.add(resolveFilterFactory(factories, parts[0]));
            } else {
                List<TokenFilterFactory> chain = new ArrayList<>();
                for (String subfilter : parts) {
                    chain.add(resolveFilterFactory(factories, subfilter));
                }
                filters.add(chainFilters(filter, chain));
            }
        }
    }

    /** Wraps an ordered list of filter factories into a single factory that applies them in sequence. */
    private TokenFilterFactory chainFilters(String name, List<TokenFilterFactory> filters) {
        return new TokenFilterFactory() {
            @Override
            public String name() {
                return name;
            }

            @Override
            public TokenStream create(TokenStream tokenStream) {
                for (TokenFilterFactory tff : filters) {
                    tokenStream = tff.create(tokenStream);
                }
                return tokenStream;
            }
        };
    }

    /** Looks up a referenced filter by name, failing fast if it is not defined in the index. */
    private TokenFilterFactory resolveFilterFactory(Map<String, TokenFilterFactory> factories, String name) {
        if (factories.containsKey(name) == false) {
            throw new IllegalArgumentException("Multiplexing filter [" + name() + "] refers to undefined tokenfilter [" + name + "]");
        } else {
            return factories.get(name);
        }
    }

    /**
     * TokenFilter that replays each source token once per configured filter chain.
     * Each chain is attached as a {@link ConditionalTokenFilter} that is only active
     * when {@code selector} matches its slot, so exactly one chain processes each
     * replayed copy of the token.
     */
    private final class MultiplexTokenFilter extends TokenFilter {

        private final TokenStream source;
        private final int filterCount;

        // Index of the filter chain currently active; shared with the inner MultiplexerFilter,
        // which drives it through 0..filterCount-1 for every source token.
        private int selector;

        /**
         * Creates a MultiplexTokenFilter on the given input with a set of filters
         */
        MultiplexTokenFilter(TokenStream input, List<Function<TokenStream, TokenStream>> filters) {
            super(input);
            TokenStream source = new MultiplexerFilter(input);
            for (int i = 0; i < filters.size(); i++) {
                final int slot = i;
                source = new ConditionalTokenFilter(source, filters.get(i)) {
                    @Override
                    protected boolean shouldFilter() {
                        return slot == selector;
                    }
                };
            }
            this.source = source;
            this.filterCount = filters.size();
            // Start "exhausted" so the first incrementToken() pulls a fresh token from the input.
            this.selector = filterCount - 1;
        }

        @Override
        public boolean incrementToken() throws IOException {
            return source.incrementToken();
        }

        @Override
        public void end() throws IOException {
            source.end();
        }

        @Override
        public void reset() throws IOException {
            source.reset();
        }

        /**
         * Inner filter that captures each input token and re-emits it filterCount times,
         * advancing {@code selector} on each replay. Replays are emitted with a position
         * increment of 0 so all versions share the original token's position.
         */
        private final class MultiplexerFilter extends TokenFilter {

            // Captured attribute state of the current source token, replayed for each chain.
            State state;
            PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);

            private MultiplexerFilter(TokenStream input) {
                super(input);
            }

            @Override
            public boolean incrementToken() throws IOException {
                if (selector >= filterCount - 1) {
                    // All chains have seen the previous token; pull the next one and capture it.
                    selector = 0;
                    if (input.incrementToken() == false) {
                        return false;
                    }
                    state = captureState();
                    return true;
                }
                // Replay the captured token for the next chain, stacked at the same position.
                restoreState(state);
                posIncAtt.setPositionIncrement(0);
                selector++;
                return true;
            }

            @Override
            public void reset() throws IOException {
                super.reset();
                selector = filterCount - 1;
                this.state = null;
            }
        }

    }
}
|
|
@ -0,0 +1,106 @@
|
|||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.analysis.common;
|
||||
|
||||
import org.elasticsearch.Version;
|
||||
import org.elasticsearch.cluster.metadata.IndexMetaData;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.env.TestEnvironment;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
import org.elasticsearch.index.analysis.IndexAnalyzers;
|
||||
import org.elasticsearch.index.analysis.NamedAnalyzer;
|
||||
import org.elasticsearch.indices.analysis.AnalysisModule;
|
||||
import org.elasticsearch.test.ESTokenStreamTestCase;
|
||||
import org.elasticsearch.test.IndexSettingsModule;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
|
||||
public class MultiplexerTokenFilterTests extends ESTokenStreamTestCase {
|
||||
|
||||
public void testMultiplexingFilter() throws IOException {
|
||||
Settings settings = Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.build();
|
||||
Settings indexSettings = Settings.builder()
|
||||
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
|
||||
.put("index.analysis.filter.t.type", "truncate")
|
||||
.put("index.analysis.filter.t.length", "2")
|
||||
.put("index.analysis.filter.multiplexFilter.type", "multiplexer")
|
||||
.putList("index.analysis.filter.multiplexFilter.filters", "lowercase, t", "uppercase")
|
||||
.put("index.analysis.analyzer.myAnalyzer.type", "custom")
|
||||
.put("index.analysis.analyzer.myAnalyzer.tokenizer", "standard")
|
||||
.putList("index.analysis.analyzer.myAnalyzer.filter", "multiplexFilter")
|
||||
.build();
|
||||
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
|
||||
|
||||
IndexAnalyzers indexAnalyzers = new AnalysisModule(TestEnvironment.newEnvironment(settings),
|
||||
Collections.singletonList(new CommonAnalysisPlugin())).getAnalysisRegistry().build(idxSettings);
|
||||
|
||||
try (NamedAnalyzer analyzer = indexAnalyzers.get("myAnalyzer")) {
|
||||
assertNotNull(analyzer);
|
||||
assertAnalyzesTo(analyzer, "ONe tHree", new String[]{
|
||||
"ONe", "on", "ONE", "tHree", "th", "THREE"
|
||||
}, new int[]{
|
||||
1, 0, 0, 1, 0, 0
|
||||
});
|
||||
// Duplicates are removed
|
||||
assertAnalyzesTo(analyzer, "ONe THREE", new String[]{
|
||||
"ONe", "on", "ONE", "THREE", "th"
|
||||
}, new int[]{
|
||||
1, 0, 0, 1, 0, 0
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
public void testMultiplexingNoOriginal() throws IOException {
|
||||
|
||||
Settings settings = Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.build();
|
||||
Settings indexSettings = Settings.builder()
|
||||
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
|
||||
.put("index.analysis.filter.t.type", "truncate")
|
||||
.put("index.analysis.filter.t.length", "2")
|
||||
.put("index.analysis.filter.multiplexFilter.type", "multiplexer")
|
||||
.put("index.analysis.filter.multiplexFilter.preserve_original", "false")
|
||||
.putList("index.analysis.filter.multiplexFilter.filters", "lowercase, t", "uppercase")
|
||||
.put("index.analysis.analyzer.myAnalyzer.type", "custom")
|
||||
.put("index.analysis.analyzer.myAnalyzer.tokenizer", "standard")
|
||||
.putList("index.analysis.analyzer.myAnalyzer.filter", "multiplexFilter")
|
||||
.build();
|
||||
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
|
||||
|
||||
IndexAnalyzers indexAnalyzers = new AnalysisModule(TestEnvironment.newEnvironment(settings),
|
||||
Collections.singletonList(new CommonAnalysisPlugin())).getAnalysisRegistry().build(idxSettings);
|
||||
|
||||
try (NamedAnalyzer analyzer = indexAnalyzers.get("myAnalyzer")) {
|
||||
assertNotNull(analyzer);
|
||||
assertAnalyzesTo(analyzer, "ONe tHree", new String[]{
|
||||
"on", "ONE", "th", "THREE"
|
||||
}, new int[]{
|
||||
1, 0, 1, 0,
|
||||
});
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -166,7 +166,18 @@ public final class AnalysisRegistry implements Closeable {
|
|||
*/
|
||||
tokenFilters.put("synonym", requiresAnalysisSettings((is, env, name, settings) -> new SynonymTokenFilterFactory(is, env, this, name, settings)));
|
||||
tokenFilters.put("synonym_graph", requiresAnalysisSettings((is, env, name, settings) -> new SynonymGraphTokenFilterFactory(is, env, this, name, settings)));
|
||||
return buildMapping(Component.FILTER, indexSettings, tokenFiltersSettings, Collections.unmodifiableMap(tokenFilters), prebuiltAnalysis.preConfiguredTokenFilters);
|
||||
|
||||
Map<String, TokenFilterFactory> mappings
|
||||
= buildMapping(Component.FILTER, indexSettings, tokenFiltersSettings, Collections.unmodifiableMap(tokenFilters), prebuiltAnalysis.preConfiguredTokenFilters);
|
||||
|
||||
// ReferringTokenFilters require references to other tokenfilters, so we pass these in
|
||||
// after all factories have been registered
|
||||
for (TokenFilterFactory tff : mappings.values()) {
|
||||
if (tff instanceof ReferringFilterFactory) {
|
||||
((ReferringFilterFactory)tff).setReferences(mappings);
|
||||
}
|
||||
}
|
||||
return mappings;
|
||||
}
|
||||
|
||||
public Map<String, TokenizerFactory> buildTokenizerFactories(IndexSettings indexSettings) throws IOException {
|
||||
|
|
|
@ -0,0 +1,37 @@
|
|||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.index.analysis;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
 * Marks a {@link TokenFilterFactory} that refers to other filter factories by name.
 *
 * The analysis registry will call {@link #setReferences(Map)} with a map of all
 * available TokenFilterFactories after all factories have been registered, so that
 * implementations can resolve their named references; the referenced factories do
 * not need to exist yet at construction time.
 */
public interface ReferringFilterFactory {

    /**
     * Called with a map of all registered filter factories, keyed by filter name.
     */
    void setReferences(Map<String, TokenFilterFactory> factories);

}
|
|
@ -20,7 +20,6 @@
|
|||
package org.elasticsearch.index.analysis;
|
||||
|
||||
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
|
||||
|
||||
import org.apache.lucene.analysis.MockTokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.en.EnglishAnalyzer;
|
||||
|
|
Loading…
Reference in New Issue