Multiplexing token filter (#31208)

The `multiplexer` filter emits multiple tokens at the same position, each 
version of the token haivng been passed through a different filter chain.
Identical tokens at the same position are removed.

This allows users to, for example, index lowercase and original-case tokens,
or stemmed and unstemmed versions, in the same field, so that they can search
for a stemmed term within x positions of an unstemmed term.
This commit is contained in:
Alan Woodward 2018-06-20 10:16:26 +01:00 committed by GitHub
parent df10704ffc
commit 5683bc60a6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 469 additions and 2 deletions

View File

@ -35,6 +35,8 @@ include::tokenfilters/word-delimiter-tokenfilter.asciidoc[]
include::tokenfilters/word-delimiter-graph-tokenfilter.asciidoc[]
include::tokenfilters/multiplexer-tokenfilter.asciidoc[]
include::tokenfilters/stemmer-tokenfilter.asciidoc[]
include::tokenfilters/stemmer-override-tokenfilter.asciidoc[]

View File

@ -0,0 +1,116 @@
[[analysis-multiplexer-tokenfilter]]
=== Multiplexer Token Filter
A token filter of type `multiplexer` will emit multiple tokens at the same position,
each version of the token having been run through a different filter. Identical
output tokens at the same position will be removed.
WARNING: If the incoming token stream has duplicate tokens, then these will also be
removed by the multiplexer
[float]
=== Options
[horizontal]
filters:: a list of token filters to apply to incoming tokens. These can be any
token filters defined elsewhere in the index mappings. Filters can be chained
using a comma-delimited string, so for example `"lowercase, porter_stem"` would
apply the `lowercase` filter and then the `porter_stem` filter to a single token.
WARNING: Shingle or multi-word synonym token filters will not function normally
when they are declared in the filters array because they read ahead internally
which is unsupported by the multiplexer
preserve_original:: if `true` (the default) then emit the original token in
addition to the filtered tokens
[float]
=== Settings example
You can set it up like:
[source,js]
--------------------------------------------------
PUT /multiplexer_example
{
"settings" : {
"analysis" : {
"analyzer" : {
"my_analyzer" : {
"tokenizer" : "standard",
"filter" : [ "my_multiplexer" ]
}
},
"filter" : {
"my_multiplexer" : {
"type" : "multiplexer",
"filters" : [ "lowercase", "lowercase, porter_stem" ]
}
}
}
}
}
--------------------------------------------------
// CONSOLE
And test it like:
[source,js]
--------------------------------------------------
POST /multiplexer_example/_analyze
{
"analyzer" : "my_analyzer",
"text" : "Going HOME"
}
--------------------------------------------------
// CONSOLE
// TEST[continued]
And it'd respond:
[source,js]
--------------------------------------------------
{
"tokens": [
{
"token": "Going",
"start_offset": 0,
"end_offset": 5,
"type": "<ALPHANUM>",
"position": 0
},
{
"token": "going",
"start_offset": 0,
"end_offset": 5,
"type": "<ALPHANUM>",
"position": 0
},
{
"token": "go",
"start_offset": 0,
"end_offset": 5,
"type": "<ALPHANUM>",
"position": 0
},
{
"token": "HOME",
"start_offset": 6,
"end_offset": 10,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "home", <1>
"start_offset": 6,
"end_offset": 10,
"type": "<ALPHANUM>",
"position": 1
}
]
}
--------------------------------------------------
// TESTRESPONSE
<1> The stemmer has also emitted a token `home` at position 1, but because it is a
duplicate of this token it has been removed from the token stream

View File

@ -226,6 +226,7 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
filters.put("limit", LimitTokenCountFilterFactory::new);
filters.put("lowercase", LowerCaseTokenFilterFactory::new);
filters.put("min_hash", MinHashTokenFilterFactory::new);
filters.put("multiplexer", MultiplexerTokenFilterFactory::new);
filters.put("ngram", NGramTokenFilterFactory::new);
filters.put("nGram", NGramTokenFilterFactory::new);
filters.put("pattern_capture", requriesAnalysisSettings(PatternCaptureGroupTokenFilterFactory::new));

View File

@ -0,0 +1,195 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter;
import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.ReferringFilterFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory implements ReferringFilterFactory {
private List<TokenFilterFactory> filters;
private List<String> filterNames;
private final boolean preserveOriginal;
private static final TokenFilterFactory IDENTITY_FACTORY = new TokenFilterFactory() {
@Override
public String name() {
return "identity";
}
@Override
public TokenStream create(TokenStream tokenStream) {
return tokenStream;
}
};
public MultiplexerTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) throws IOException {
super(indexSettings, name, settings);
this.filterNames = settings.getAsList("filters");
this.preserveOriginal = settings.getAsBoolean("preserve_original", true);
}
@Override
public TokenStream create(TokenStream tokenStream) {
List<Function<TokenStream, TokenStream>> functions = new ArrayList<>();
for (TokenFilterFactory tff : filters) {
functions.add(tff::create);
}
return new RemoveDuplicatesTokenFilter(new MultiplexTokenFilter(tokenStream, functions));
}
@Override
public void setReferences(Map<String, TokenFilterFactory> factories) {
filters = new ArrayList<>();
if (preserveOriginal) {
filters.add(IDENTITY_FACTORY);
}
for (String filter : filterNames) {
String[] parts = Strings.tokenizeToStringArray(filter, ",");
if (parts.length == 1) {
filters.add(resolveFilterFactory(factories, parts[0]));
} else {
List<TokenFilterFactory> chain = new ArrayList<>();
for (String subfilter : parts) {
chain.add(resolveFilterFactory(factories, subfilter));
}
filters.add(chainFilters(filter, chain));
}
}
}
private TokenFilterFactory chainFilters(String name, List<TokenFilterFactory> filters) {
return new TokenFilterFactory() {
@Override
public String name() {
return name;
}
@Override
public TokenStream create(TokenStream tokenStream) {
for (TokenFilterFactory tff : filters) {
tokenStream = tff.create(tokenStream);
}
return tokenStream;
}
};
}
private TokenFilterFactory resolveFilterFactory(Map<String, TokenFilterFactory> factories, String name) {
if (factories.containsKey(name) == false) {
throw new IllegalArgumentException("Multiplexing filter [" + name() + "] refers to undefined tokenfilter [" + name + "]");
} else {
return factories.get(name);
}
}
private final class MultiplexTokenFilter extends TokenFilter {
private final TokenStream source;
private final int filterCount;
private int selector;
/**
* Creates a MultiplexTokenFilter on the given input with a set of filters
*/
MultiplexTokenFilter(TokenStream input, List<Function<TokenStream, TokenStream>> filters) {
super(input);
TokenStream source = new MultiplexerFilter(input);
for (int i = 0; i < filters.size(); i++) {
final int slot = i;
source = new ConditionalTokenFilter(source, filters.get(i)) {
@Override
protected boolean shouldFilter() {
return slot == selector;
}
};
}
this.source = source;
this.filterCount = filters.size();
this.selector = filterCount - 1;
}
@Override
public boolean incrementToken() throws IOException {
return source.incrementToken();
}
@Override
public void end() throws IOException {
source.end();
}
@Override
public void reset() throws IOException {
source.reset();
}
private final class MultiplexerFilter extends TokenFilter {
State state;
PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
private MultiplexerFilter(TokenStream input) {
super(input);
}
@Override
public boolean incrementToken() throws IOException {
if (selector >= filterCount - 1) {
selector = 0;
if (input.incrementToken() == false) {
return false;
}
state = captureState();
return true;
}
restoreState(state);
posIncAtt.setPositionIncrement(0);
selector++;
return true;
}
@Override
public void reset() throws IOException {
super.reset();
selector = filterCount - 1;
this.state = null;
}
}
}
}

View File

@ -0,0 +1,106 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.analysis.common;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.env.TestEnvironment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.IndexAnalyzers;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.indices.analysis.AnalysisModule;
import org.elasticsearch.test.ESTokenStreamTestCase;
import org.elasticsearch.test.IndexSettingsModule;
import java.io.IOException;
import java.util.Collections;
public class MultiplexerTokenFilterTests extends ESTokenStreamTestCase {
public void testMultiplexingFilter() throws IOException {
Settings settings = Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.build();
Settings indexSettings = Settings.builder()
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
.put("index.analysis.filter.t.type", "truncate")
.put("index.analysis.filter.t.length", "2")
.put("index.analysis.filter.multiplexFilter.type", "multiplexer")
.putList("index.analysis.filter.multiplexFilter.filters", "lowercase, t", "uppercase")
.put("index.analysis.analyzer.myAnalyzer.type", "custom")
.put("index.analysis.analyzer.myAnalyzer.tokenizer", "standard")
.putList("index.analysis.analyzer.myAnalyzer.filter", "multiplexFilter")
.build();
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
IndexAnalyzers indexAnalyzers = new AnalysisModule(TestEnvironment.newEnvironment(settings),
Collections.singletonList(new CommonAnalysisPlugin())).getAnalysisRegistry().build(idxSettings);
try (NamedAnalyzer analyzer = indexAnalyzers.get("myAnalyzer")) {
assertNotNull(analyzer);
assertAnalyzesTo(analyzer, "ONe tHree", new String[]{
"ONe", "on", "ONE", "tHree", "th", "THREE"
}, new int[]{
1, 0, 0, 1, 0, 0
});
// Duplicates are removed
assertAnalyzesTo(analyzer, "ONe THREE", new String[]{
"ONe", "on", "ONE", "THREE", "th"
}, new int[]{
1, 0, 0, 1, 0, 0
});
}
}
public void testMultiplexingNoOriginal() throws IOException {
Settings settings = Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.build();
Settings indexSettings = Settings.builder()
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
.put("index.analysis.filter.t.type", "truncate")
.put("index.analysis.filter.t.length", "2")
.put("index.analysis.filter.multiplexFilter.type", "multiplexer")
.put("index.analysis.filter.multiplexFilter.preserve_original", "false")
.putList("index.analysis.filter.multiplexFilter.filters", "lowercase, t", "uppercase")
.put("index.analysis.analyzer.myAnalyzer.type", "custom")
.put("index.analysis.analyzer.myAnalyzer.tokenizer", "standard")
.putList("index.analysis.analyzer.myAnalyzer.filter", "multiplexFilter")
.build();
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
IndexAnalyzers indexAnalyzers = new AnalysisModule(TestEnvironment.newEnvironment(settings),
Collections.singletonList(new CommonAnalysisPlugin())).getAnalysisRegistry().build(idxSettings);
try (NamedAnalyzer analyzer = indexAnalyzers.get("myAnalyzer")) {
assertNotNull(analyzer);
assertAnalyzesTo(analyzer, "ONe tHree", new String[]{
"on", "ONE", "th", "THREE"
}, new int[]{
1, 0, 1, 0,
});
}
}
}

View File

@ -166,7 +166,18 @@ public final class AnalysisRegistry implements Closeable {
*/
tokenFilters.put("synonym", requiresAnalysisSettings((is, env, name, settings) -> new SynonymTokenFilterFactory(is, env, this, name, settings)));
tokenFilters.put("synonym_graph", requiresAnalysisSettings((is, env, name, settings) -> new SynonymGraphTokenFilterFactory(is, env, this, name, settings)));
return buildMapping(Component.FILTER, indexSettings, tokenFiltersSettings, Collections.unmodifiableMap(tokenFilters), prebuiltAnalysis.preConfiguredTokenFilters);
Map<String, TokenFilterFactory> mappings
= buildMapping(Component.FILTER, indexSettings, tokenFiltersSettings, Collections.unmodifiableMap(tokenFilters), prebuiltAnalysis.preConfiguredTokenFilters);
// ReferringTokenFilters require references to other tokenfilters, so we pass these in
// after all factories have been registered
for (TokenFilterFactory tff : mappings.values()) {
if (tff instanceof ReferringFilterFactory) {
((ReferringFilterFactory)tff).setReferences(mappings);
}
}
return mappings;
}
public Map<String, TokenizerFactory> buildTokenizerFactories(IndexSettings indexSettings) throws IOException {

View File

@ -0,0 +1,37 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import java.util.Map;
/**
* Marks a {@link TokenFilterFactory} that refers to other filter factories.
*
* The analysis registry will call {@link #setReferences(Map)} with a map of all
* available TokenFilterFactories after all factories have been registered
*/
public interface ReferringFilterFactory {
/**
* Called with a map of all registered filter factories
*/
void setReferences(Map<String, TokenFilterFactory> factories);
}

View File

@ -20,7 +20,6 @@
package org.elasticsearch.index.analysis;
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
import org.apache.lucene.analysis.MockTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.en.EnglishAnalyzer;