mirror of
synced 2025-03-25 09:28:27 +00:00
Multiplexing token filter (#31208)
The `multiplexer` filter emits multiple tokens at the same position, each version of the token having been passed through a different filter chain. Identical tokens at the same position are removed. This allows users to, for example, index lowercase and original-case tokens, or stemmed and unstemmed versions, in the same field, so that they can search for a stemmed term within x positions of an unstemmed term.
This commit is contained in:
@ -35,6 +35,8 @@ include::tokenfilters/word-delimiter-tokenfilter.asciidoc[]
@ -0,0 +1,116 @@
=== Multiplexer Token Filter
A token filter of type `multiplexer` will emit multiple tokens at the same position,
each version of the token having been run through a different filter. Identical
output tokens at the same position will be removed.
WARNING: If the incoming token stream has duplicate tokens, then these will also be
removed by the multiplexer.
=== Options
filters:: a list of token filters to apply to incoming tokens. These can be any
token filters defined elsewhere in the index settings. Filters can be chained
using a comma-delimited string, so for example `"lowercase, porter_stem"` would
apply the `lowercase` filter and then the `porter_stem` filter to a single token.
WARNING: Shingle or multi-word synonym token filters will not function normally
when they are declared in the filters array, because they read ahead internally,
which is unsupported by the multiplexer.
preserve_original:: if `true` (the default) then emit the original token in
addition to the filtered tokens.
=== Settings example
You can set it up like:
PUT /multiplexer_example
"settings" : {
"analysis" : {
"analyzer" : {
"my_analyzer" : {
"tokenizer" : "standard",
"filter" : [ "my_multiplexer" ]
"filter" : {
"my_multiplexer" : {
"type" : "multiplexer",
"filters" : [ "lowercase", "lowercase, porter_stem" ]
And test it like:
POST /multiplexer_example/_analyze
"analyzer" : "my_analyzer",
"text" : "Going HOME"
// TEST[continued]
And it'd respond:
"tokens": [
"token": "Going",
"start_offset": 0,
"end_offset": 5,
"type": "<ALPHANUM>",
"position": 0
"token": "going",
"start_offset": 0,
"end_offset": 5,
"type": "<ALPHANUM>",
"position": 0
"token": "go",
"start_offset": 0,
"end_offset": 5,
"type": "<ALPHANUM>",
"position": 0
"token": "HOME",
"start_offset": 6,
"end_offset": 10,
"type": "<ALPHANUM>",
"position": 1
"token": "home", <1>
"start_offset": 6,
"end_offset": 10,
"type": "<ALPHANUM>",
"position": 1
<1> The stemmer has also emitted a token `home` at position 1, but because it is a
duplicate of this token it has been removed from the token stream
@ -226,6 +226,7 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
filters.put("limit", LimitTokenCountFilterFactory::new);
filters.put("lowercase", LowerCaseTokenFilterFactory::new);
filters.put("min_hash", MinHashTokenFilterFactory::new);
filters.put("multiplexer", MultiplexerTokenFilterFactory::new);
filters.put("ngram", NGramTokenFilterFactory::new);
filters.put("nGram", NGramTokenFilterFactory::new);
filters.put("pattern_capture", requriesAnalysisSettings(PatternCaptureGroupTokenFilterFactory::new));
@ -0,0 +1,195 @@
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter;
import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.ReferringFilterFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
/**
 * Factory for the {@code multiplexer} token filter. Each incoming token is run through
 * every configured filter chain, and the result is wrapped in a
 * {@link RemoveDuplicatesTokenFilter}.
 *
 * The chains are named in the {@code filters} setting and are resolved to factories
 * later, in {@link #setReferences(Map)}.
 */
public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory implements ReferringFilterFactory {
// Resolved filter chains; (re)built by setReferences(), read by create().
private List<TokenFilterFactory> filters;
// Raw entries of the "filters" setting; one entry may name several filters separated by commas.
private List<String> filterNames;
// Value of the "preserve_original" setting (defaults to true).
private final boolean preserveOriginal;
// Pass-through factory: create() hands back the stream it was given.
// NOTE(review): no use of this constant is visible in this view; presumably it is the
// chain added when preserveOriginal is set - confirm against the full file.
private static final TokenFilterFactory IDENTITY_FACTORY = new TokenFilterFactory() {
public String name() {
return "identity";
public TokenStream create(TokenStream tokenStream) {
return tokenStream;
/**
 * Reads the "filters" list and the "preserve_original" flag from the filter settings.
 * The named filters are not resolved here; see setReferences().
 */
public MultiplexerTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) throws IOException {
super(indexSettings, name, settings);
this.filterNames = settings.getAsList("filters");
this.preserveOriginal = settings.getAsBoolean("preserve_original", true);
/**
 * Wraps the given stream in a MultiplexTokenFilter and then in a
 * RemoveDuplicatesTokenFilter.
 */
public TokenStream create(TokenStream tokenStream) {
List<Function<TokenStream, TokenStream>> functions = new ArrayList<>();
// NOTE(review): the body of this loop is not visible in this view; presumably it adds
// one function per resolved factory to `functions` - confirm against the full file.
for (TokenFilterFactory tff : filters) {
return new RemoveDuplicatesTokenFilter(new MultiplexTokenFilter(tokenStream, functions));
/**
 * Resolves every entry of the "filters" setting against the supplied factories.
 * An entry naming a single filter is used directly; an entry naming several is
 * combined into one chained factory.
 *
 * @param factories available token filter factories, keyed by name
 * @throws IllegalArgumentException if an entry names a filter absent from the map
 */
public void setReferences(Map<String, TokenFilterFactory> factories) {
filters = new ArrayList<>();
// NOTE(review): the body of this branch is not visible in this view; presumably it
// registers a pass-through chain so the original token is kept - confirm.
if (preserveOriginal) {
for (String filter : filterNames) {
// Split the entry on commas, e.g. "lowercase, porter_stem".
// NOTE(review): assumes tokenizeToStringArray trims whitespace around each part - confirm.
String[] parts = Strings.tokenizeToStringArray(filter, ",");
if (parts.length == 1) {
filters.add(resolveFilterFactory(factories, parts[0]));
} else {
List<TokenFilterFactory> chain = new ArrayList<>();
for (String subfilter : parts) {
chain.add(resolveFilterFactory(factories, subfilter));
filters.add(chainFilters(filter, chain));
/**
 * Builds a factory whose create() applies the given factories one after another,
 * in list order, and whose name() is the supplied name.
 */
private TokenFilterFactory chainFilters(String name, List<TokenFilterFactory> filters) {
return new TokenFilterFactory() {
public String name() {
return name;
public TokenStream create(TokenStream tokenStream) {
for (TokenFilterFactory tff : filters) {
tokenStream = tff.create(tokenStream);
return tokenStream;
/**
 * Looks up a filter factory by name.
 *
 * @throws IllegalArgumentException if the map has no entry for the name
 */
private TokenFilterFactory resolveFilterFactory(Map<String, TokenFilterFactory> factories, String name) {
if (factories.containsKey(name) == false) {
// name() is this multiplexer's own name; the bare `name` is the missing filter.
throw new IllegalArgumentException("Multiplexing filter [" + name() + "] refers to undefined tokenfilter [" + name + "]");
} else {
return factories.get(name);
// Token filter that routes tokens through the configured chains, one chain per slot.
private final class MultiplexTokenFilter extends TokenFilter {
// Head of the wrapped stream: a MultiplexerFilter with one ConditionalTokenFilter layered on per chain.
private final TokenStream source;
// Number of chains.
private final int filterCount;
// Slot index compared against by each conditional wrapper's shouldFilter().
private int selector;
* Creates a MultiplexTokenFilter on the given input with a set of filters
MultiplexTokenFilter(TokenStream input, List<Function<TokenStream, TokenStream>> filters) {
TokenStream source = new MultiplexerFilter(input);
for (int i = 0; i < filters.size(); i++) {
final int slot = i;
// Each wrapper reports shouldFilter() == true only while `selector` equals its own slot.
source = new ConditionalTokenFilter(source, filters.get(i)) {
protected boolean shouldFilter() {
return slot == selector;
this.source = source;
this.filterCount = filters.size();
// Start at the last slot so that MultiplexerFilter's first incrementToken() call
// takes the branch that advances the underlying input.
this.selector = filterCount - 1;
// Delegates to the head of the wrapped stream.
public boolean incrementToken() throws IOException {
return source.incrementToken();
// NOTE(review): the bodies of end() and reset() are not visible in this view.
public void end() throws IOException {
public void reset() throws IOException {
// Innermost filter of the wrapped stream; reads and writes the `selector` field declared above.
private final class MultiplexerFilter extends TokenFilter {
// State captured right after the underlying input was last advanced.
State state;
PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
// NOTE(review): the constructor body is not visible in this view.
private MultiplexerFilter(TokenStream input) {
public boolean incrementToken() throws IOException {
// Once the last slot has been reached, wrap to slot 0, advance the input and capture its state.
if (selector >= filterCount - 1) {
selector = 0;
if (input.incrementToken() == false) {
return false;
state = captureState();
return true;
// NOTE(review): the statements for the remaining slots are not visible in this view.
// Presumably the captured state is restored, the position increment is set via
// posIncAtt, and selector is advanced - confirm against the full file.
return true;
// Returns to the initial slot and discards any captured state.
public void reset() throws IOException {
selector = filterCount - 1;
this.state = null;
@ -0,0 +1,106 @@
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
package org.elasticsearch.analysis.common;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.env.TestEnvironment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.IndexAnalyzers;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.indices.analysis.AnalysisModule;
import org.elasticsearch.test.ESTokenStreamTestCase;
import org.elasticsearch.test.IndexSettingsModule;
import java.io.IOException;
import java.util.Collections;
/**
 * Analysis tests for the {@code multiplexer} token filter. Each test builds a custom
 * analyzer (standard tokenizer plus one multiplexer filter) from index settings and
 * checks the emitted tokens and their position increments.
 */
public class MultiplexerTokenFilterTests extends ESTokenStreamTestCase {
// With preserve_original left unset, each input word is expected to yield the original
// token followed by the output of the "lowercase, t" chain and of the "uppercase" chain.
public void testMultiplexingFilter() throws IOException {
Settings settings = Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
Settings indexSettings = Settings.builder()
// "t" is a truncate filter with length 2; it is the second step of the first chain below.
.put("index.analysis.filter.t.type", "truncate")
.put("index.analysis.filter.t.length", "2")
.put("index.analysis.filter.multiplexFilter.type", "multiplexer")
// Two chains: lowercase followed by t, and uppercase on its own.
.putList("index.analysis.filter.multiplexFilter.filters", "lowercase, t", "uppercase")
.put("index.analysis.analyzer.myAnalyzer.type", "custom")
.put("index.analysis.analyzer.myAnalyzer.tokenizer", "standard")
.putList("index.analysis.analyzer.myAnalyzer.filter", "multiplexFilter")
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
IndexAnalyzers indexAnalyzers = new AnalysisModule(TestEnvironment.newEnvironment(settings),
Collections.singletonList(new CommonAnalysisPlugin())).getAnalysisRegistry().build(idxSettings);
try (NamedAnalyzer analyzer = indexAnalyzers.get("myAnalyzer")) {
// The second array holds position increments: 1 for the first token of each word, 0 for the rest.
assertAnalyzesTo(analyzer, "ONe tHree", new String[]{
"ONe", "on", "ONE", "tHree", "th", "THREE"
}, new int[]{
1, 0, 0, 1, 0, 0
// Duplicates are removed
// Here the uppercase chain's output for "THREE" equals the original token, so only five tokens are expected.
// NOTE(review): the increments array below lists six values for five expected tokens.
assertAnalyzesTo(analyzer, "ONe THREE", new String[]{
"ONe", "on", "ONE", "THREE", "th"
}, new int[]{
1, 0, 0, 1, 0, 0
// With preserve_original set to false, only the two chain outputs are expected per word.
public void testMultiplexingNoOriginal() throws IOException {
Settings settings = Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
Settings indexSettings = Settings.builder()
.put("index.analysis.filter.t.type", "truncate")
.put("index.analysis.filter.t.length", "2")
.put("index.analysis.filter.multiplexFilter.type", "multiplexer")
.put("index.analysis.filter.multiplexFilter.preserve_original", "false")
.putList("index.analysis.filter.multiplexFilter.filters", "lowercase, t", "uppercase")
.put("index.analysis.analyzer.myAnalyzer.type", "custom")
.put("index.analysis.analyzer.myAnalyzer.tokenizer", "standard")
.putList("index.analysis.analyzer.myAnalyzer.filter", "multiplexFilter")
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
IndexAnalyzers indexAnalyzers = new AnalysisModule(TestEnvironment.newEnvironment(settings),
Collections.singletonList(new CommonAnalysisPlugin())).getAnalysisRegistry().build(idxSettings);
try (NamedAnalyzer analyzer = indexAnalyzers.get("myAnalyzer")) {
// The original-case tokens "ONe" and "tHree" are absent from the expected output.
assertAnalyzesTo(analyzer, "ONe tHree", new String[]{
"on", "ONE", "th", "THREE"
}, new int[]{
1, 0, 1, 0,
@ -166,7 +166,18 @@ public final class AnalysisRegistry implements Closeable {
tokenFilters.put("synonym", requiresAnalysisSettings((is, env, name, settings) -> new SynonymTokenFilterFactory(is, env, this, name, settings)));
tokenFilters.put("synonym_graph", requiresAnalysisSettings((is, env, name, settings) -> new SynonymGraphTokenFilterFactory(is, env, this, name, settings)));
return buildMapping(Component.FILTER, indexSettings, tokenFiltersSettings, Collections.unmodifiableMap(tokenFilters), prebuiltAnalysis.preConfiguredTokenFilters);
Map<String, TokenFilterFactory> mappings
= buildMapping(Component.FILTER, indexSettings, tokenFiltersSettings, Collections.unmodifiableMap(tokenFilters), prebuiltAnalysis.preConfiguredTokenFilters);
// ReferringFilterFactory instances require references to other token filter factories,
// so we pass these in after all factories have been registered
for (TokenFilterFactory tff : mappings.values()) {
if (tff instanceof ReferringFilterFactory) {
return mappings;
public Map<String, TokenizerFactory> buildTokenizerFactories(IndexSettings indexSettings) throws IOException {
@ -0,0 +1,37 @@
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
package org.elasticsearch.index.analysis;
import java.util.Map;
* Marks a {@link TokenFilterFactory} that refers to other filter factories.
* The analysis registry will call {@link #setReferences(Map)} with a map of all
* available TokenFilterFactories after all factories have been registered
public interface ReferringFilterFactory {
* Called with a map of all registered filter factories
// The map is keyed by filter name; each value is the factory registered under that name.
void setReferences(Map<String, TokenFilterFactory> factories);
@ -20,7 +20,6 @@
package org.elasticsearch.index.analysis;
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
import org.apache.lucene.analysis.MockTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
Reference in New Issue
Block a user