Multiplexing token filter (#31208)
The `multiplexer` filter emits multiple tokens at the same position, each version of the token having been passed through a different filter chain. Identical tokens at the same position are removed. This allows users to, for example, index lowercase and original-case tokens, or stemmed and unstemmed versions, in the same field, so that they can search for a stemmed term within x positions of an unstemmed term.
This commit is contained in:
parent
df10704ffc
commit
5683bc60a6
|
@ -35,6 +35,8 @@ include::tokenfilters/word-delimiter-tokenfilter.asciidoc[]
|
|||
|
||||
include::tokenfilters/word-delimiter-graph-tokenfilter.asciidoc[]
|
||||
|
||||
include::tokenfilters/multiplexer-tokenfilter.asciidoc[]
|
||||
|
||||
include::tokenfilters/stemmer-tokenfilter.asciidoc[]
|
||||
|
||||
include::tokenfilters/stemmer-override-tokenfilter.asciidoc[]
|
||||
|
|
|
@ -0,0 +1,116 @@
|
|||
[[analysis-multiplexer-tokenfilter]]
|
||||
=== Multiplexer Token Filter
|
||||
|
||||
A token filter of type `multiplexer` will emit multiple tokens at the same position,
|
||||
each version of the token having been run through a different filter. Identical
|
||||
output tokens at the same position will be removed.
|
||||
|
||||
WARNING: If the incoming token stream has duplicate tokens, then these will also be
|
||||
removed by the multiplexer
|
||||
|
||||
[float]
|
||||
=== Options
|
||||
[horizontal]
|
||||
filters:: a list of token filters to apply to incoming tokens. These can be any
|
||||
token filters defined elsewhere in the index mappings. Filters can be chained
|
||||
using a comma-delimited string, so for example `"lowercase, porter_stem"` would
|
||||
apply the `lowercase` filter and then the `porter_stem` filter to a single token.
|
||||
|
||||
WARNING: Shingle or multi-word synonym token filters will not function normally
|
||||
when they are declared in the filters array because they read ahead internally
|
||||
which is unsupported by the multiplexer
|
||||
|
||||
preserve_original:: if `true` (the default) then emit the original token in
|
||||
addition to the filtered tokens
|
||||
|
||||
|
||||
[float]
|
||||
=== Settings example
|
||||
|
||||
You can set it up like:
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
PUT /multiplexer_example
|
||||
{
|
||||
"settings" : {
|
||||
"analysis" : {
|
||||
"analyzer" : {
|
||||
"my_analyzer" : {
|
||||
"tokenizer" : "standard",
|
||||
"filter" : [ "my_multiplexer" ]
|
||||
}
|
||||
},
|
||||
"filter" : {
|
||||
"my_multiplexer" : {
|
||||
"type" : "multiplexer",
|
||||
"filters" : [ "lowercase", "lowercase, porter_stem" ]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
--------------------------------------------------
|
||||
// CONSOLE
|
||||
|
||||
And test it like:
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
POST /multiplexer_example/_analyze
|
||||
{
|
||||
"analyzer" : "my_analyzer",
|
||||
"text" : "Going HOME"
|
||||
}
|
||||
--------------------------------------------------
|
||||
// CONSOLE
|
||||
// TEST[continued]
|
||||
|
||||
And it'd respond:
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
{
|
||||
"tokens": [
|
||||
{
|
||||
"token": "Going",
|
||||
"start_offset": 0,
|
||||
"end_offset": 5,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 0
|
||||
},
|
||||
{
|
||||
"token": "going",
|
||||
"start_offset": 0,
|
||||
"end_offset": 5,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 0
|
||||
},
|
||||
{
|
||||
"token": "go",
|
||||
"start_offset": 0,
|
||||
"end_offset": 5,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 0
|
||||
},
|
||||
{
|
||||
"token": "HOME",
|
||||
"start_offset": 6,
|
||||
"end_offset": 10,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 1
|
||||
},
|
||||
{
|
||||
"token": "home", <1>
|
||||
"start_offset": 6,
|
||||
"end_offset": 10,
|
||||
"type": "<ALPHANUM>",
|
||||
"position": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
--------------------------------------------------
|
||||
// TESTRESPONSE
|
||||
|
||||
<1> The stemmer has also emitted a token `home` at position 1, but because it is a
|
||||
duplicate of this token it has been removed from the token stream
|
|
@ -226,6 +226,7 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
|
|||
filters.put("limit", LimitTokenCountFilterFactory::new);
|
||||
filters.put("lowercase", LowerCaseTokenFilterFactory::new);
|
||||
filters.put("min_hash", MinHashTokenFilterFactory::new);
|
||||
filters.put("multiplexer", MultiplexerTokenFilterFactory::new);
|
||||
filters.put("ngram", NGramTokenFilterFactory::new);
|
||||
filters.put("nGram", NGramTokenFilterFactory::new);
|
||||
filters.put("pattern_capture", requriesAnalysisSettings(PatternCaptureGroupTokenFilterFactory::new));
|
||||
|
|
|
@ -0,0 +1,195 @@
|
|||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.analysis.common;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter;
|
||||
import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.elasticsearch.common.Strings;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.ReferringFilterFactory;
|
||||
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.function.Function;
|
||||
|
||||
/**
 * Factory for the {@code multiplexer} token filter. For each incoming token it emits one
 * output token per configured filter chain, all at the same position, and wraps the result
 * in a {@link RemoveDuplicatesTokenFilter} so identical tokens at the same position are
 * dropped. Filter chains are resolved by name from the other filters registered in the
 * index, via {@link ReferringFilterFactory#setReferences(Map)}.
 */
public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory implements ReferringFilterFactory {

    // Resolved filter chains; populated in setReferences(), after all factories are registered.
    private List<TokenFilterFactory> filters;
    // Raw "filters" setting values; each entry may be a comma-delimited chain, e.g. "lowercase, porter_stem".
    private List<String> filterNames;
    // When true (the default), the unmodified token is emitted in addition to the filtered versions.
    private final boolean preserveOriginal;

    // Pass-through factory used as the first slot when preserve_original is enabled.
    private static final TokenFilterFactory IDENTITY_FACTORY = new TokenFilterFactory() {
        @Override
        public String name() {
            return "identity";
        }

        @Override
        public TokenStream create(TokenStream tokenStream) {
            return tokenStream;
        }
    };

    public MultiplexerTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) throws IOException {
        super(indexSettings, name, settings);
        this.filterNames = settings.getAsList("filters");
        this.preserveOriginal = settings.getAsBoolean("preserve_original", true);
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        // Convert each resolved factory into a TokenStream-wrapping function, preserving order
        // (the identity/original slot, if present, is first — see setReferences()).
        List<Function<TokenStream, TokenStream>> functions = new ArrayList<>();
        for (TokenFilterFactory tff : filters) {
            functions.add(tff::create);
        }
        return new RemoveDuplicatesTokenFilter(new MultiplexTokenFilter(tokenStream, functions));
    }

    @Override
    public void setReferences(Map<String, TokenFilterFactory> factories) {
        filters = new ArrayList<>();
        if (preserveOriginal) {
            filters.add(IDENTITY_FACTORY);
        }
        for (String filter : filterNames) {
            // A single setting entry such as "lowercase, porter_stem" describes a chain of filters.
            String[] parts = Strings.tokenizeToStringArray(filter, ",");
            if (parts.length == 1) {
                filters.add(resolveFilterFactory(factories, parts[0]));
            } else {
                List<TokenFilterFactory> chain = new ArrayList<>();
                for (String subfilter : parts) {
                    chain.add(resolveFilterFactory(factories, subfilter));
                }
                filters.add(chainFilters(filter, chain));
            }
        }
    }

    /** Wraps an ordered list of filter factories into a single factory that applies them in sequence. */
    private TokenFilterFactory chainFilters(String name, List<TokenFilterFactory> filters) {
        return new TokenFilterFactory() {
            @Override
            public String name() {
                return name;
            }

            @Override
            public TokenStream create(TokenStream tokenStream) {
                for (TokenFilterFactory tff : filters) {
                    tokenStream = tff.create(tokenStream);
                }
                return tokenStream;
            }
        };
    }

    /** Looks up a referenced filter by name, failing fast if it is not defined in the index. */
    private TokenFilterFactory resolveFilterFactory(Map<String, TokenFilterFactory> factories, String name) {
        if (factories.containsKey(name) == false) {
            throw new IllegalArgumentException("Multiplexing filter [" + name() + "] refers to undefined tokenfilter [" + name + "]");
        } else {
            return factories.get(name);
        }
    }

    /**
     * TokenFilter that replays each source token once per configured filter chain.
     * Each chain is attached as a {@link ConditionalTokenFilter} that is only active
     * when {@code selector} matches its slot, so exactly one chain processes each
     * replayed copy of the token.
     */
    private final class MultiplexTokenFilter extends TokenFilter {

        private final TokenStream source;
        private final int filterCount;

        // Index of the filter chain currently active; shared with the inner MultiplexerFilter,
        // which drives it through 0..filterCount-1 for every source token.
        private int selector;

        /**
         * Creates a MultiplexTokenFilter on the given input with a set of filters
         */
        MultiplexTokenFilter(TokenStream input, List<Function<TokenStream, TokenStream>> filters) {
            super(input);
            TokenStream source = new MultiplexerFilter(input);
            for (int i = 0; i < filters.size(); i++) {
                final int slot = i;
                source = new ConditionalTokenFilter(source, filters.get(i)) {
                    @Override
                    protected boolean shouldFilter() {
                        return slot == selector;
                    }
                };
            }
            this.source = source;
            this.filterCount = filters.size();
            // Start "exhausted" so the first incrementToken() pulls a fresh token from the input.
            this.selector = filterCount - 1;
        }

        @Override
        public boolean incrementToken() throws IOException {
            return source.incrementToken();
        }

        @Override
        public void end() throws IOException {
            source.end();
        }

        @Override
        public void reset() throws IOException {
            source.reset();
        }

        /**
         * Inner filter that captures each input token and re-emits it filterCount times,
         * advancing {@code selector} on each replay. Replays are emitted with a position
         * increment of 0 so all versions share the original token's position.
         */
        private final class MultiplexerFilter extends TokenFilter {

            // Captured attribute state of the current source token, replayed for each chain.
            State state;
            PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);

            private MultiplexerFilter(TokenStream input) {
                super(input);
            }

            @Override
            public boolean incrementToken() throws IOException {
                if (selector >= filterCount - 1) {
                    // All chains have seen the previous token; pull the next one and capture it.
                    selector = 0;
                    if (input.incrementToken() == false) {
                        return false;
                    }
                    state = captureState();
                    return true;
                }
                // Replay the captured token for the next chain, stacked at the same position.
                restoreState(state);
                posIncAtt.setPositionIncrement(0);
                selector++;
                return true;
            }

            @Override
            public void reset() throws IOException {
                super.reset();
                selector = filterCount - 1;
                this.state = null;
            }
        }

    }
}
|
|
@ -0,0 +1,106 @@
|
|||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.analysis.common;
|
||||
|
||||
import org.elasticsearch.Version;
|
||||
import org.elasticsearch.cluster.metadata.IndexMetaData;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.env.TestEnvironment;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
import org.elasticsearch.index.analysis.IndexAnalyzers;
|
||||
import org.elasticsearch.index.analysis.NamedAnalyzer;
|
||||
import org.elasticsearch.indices.analysis.AnalysisModule;
|
||||
import org.elasticsearch.test.ESTokenStreamTestCase;
|
||||
import org.elasticsearch.test.IndexSettingsModule;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
|
||||
public class MultiplexerTokenFilterTests extends ESTokenStreamTestCase {
|
||||
|
||||
public void testMultiplexingFilter() throws IOException {
|
||||
Settings settings = Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.build();
|
||||
Settings indexSettings = Settings.builder()
|
||||
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
|
||||
.put("index.analysis.filter.t.type", "truncate")
|
||||
.put("index.analysis.filter.t.length", "2")
|
||||
.put("index.analysis.filter.multiplexFilter.type", "multiplexer")
|
||||
.putList("index.analysis.filter.multiplexFilter.filters", "lowercase, t", "uppercase")
|
||||
.put("index.analysis.analyzer.myAnalyzer.type", "custom")
|
||||
.put("index.analysis.analyzer.myAnalyzer.tokenizer", "standard")
|
||||
.putList("index.analysis.analyzer.myAnalyzer.filter", "multiplexFilter")
|
||||
.build();
|
||||
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
|
||||
|
||||
IndexAnalyzers indexAnalyzers = new AnalysisModule(TestEnvironment.newEnvironment(settings),
|
||||
Collections.singletonList(new CommonAnalysisPlugin())).getAnalysisRegistry().build(idxSettings);
|
||||
|
||||
try (NamedAnalyzer analyzer = indexAnalyzers.get("myAnalyzer")) {
|
||||
assertNotNull(analyzer);
|
||||
assertAnalyzesTo(analyzer, "ONe tHree", new String[]{
|
||||
"ONe", "on", "ONE", "tHree", "th", "THREE"
|
||||
}, new int[]{
|
||||
1, 0, 0, 1, 0, 0
|
||||
});
|
||||
// Duplicates are removed
|
||||
assertAnalyzesTo(analyzer, "ONe THREE", new String[]{
|
||||
"ONe", "on", "ONE", "THREE", "th"
|
||||
}, new int[]{
|
||||
1, 0, 0, 1, 0, 0
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
public void testMultiplexingNoOriginal() throws IOException {
|
||||
|
||||
Settings settings = Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.build();
|
||||
Settings indexSettings = Settings.builder()
|
||||
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
|
||||
.put("index.analysis.filter.t.type", "truncate")
|
||||
.put("index.analysis.filter.t.length", "2")
|
||||
.put("index.analysis.filter.multiplexFilter.type", "multiplexer")
|
||||
.put("index.analysis.filter.multiplexFilter.preserve_original", "false")
|
||||
.putList("index.analysis.filter.multiplexFilter.filters", "lowercase, t", "uppercase")
|
||||
.put("index.analysis.analyzer.myAnalyzer.type", "custom")
|
||||
.put("index.analysis.analyzer.myAnalyzer.tokenizer", "standard")
|
||||
.putList("index.analysis.analyzer.myAnalyzer.filter", "multiplexFilter")
|
||||
.build();
|
||||
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
|
||||
|
||||
IndexAnalyzers indexAnalyzers = new AnalysisModule(TestEnvironment.newEnvironment(settings),
|
||||
Collections.singletonList(new CommonAnalysisPlugin())).getAnalysisRegistry().build(idxSettings);
|
||||
|
||||
try (NamedAnalyzer analyzer = indexAnalyzers.get("myAnalyzer")) {
|
||||
assertNotNull(analyzer);
|
||||
assertAnalyzesTo(analyzer, "ONe tHree", new String[]{
|
||||
"on", "ONE", "th", "THREE"
|
||||
}, new int[]{
|
||||
1, 0, 1, 0,
|
||||
});
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -166,7 +166,18 @@ public final class AnalysisRegistry implements Closeable {
|
|||
*/
|
||||
tokenFilters.put("synonym", requiresAnalysisSettings((is, env, name, settings) -> new SynonymTokenFilterFactory(is, env, this, name, settings)));
|
||||
tokenFilters.put("synonym_graph", requiresAnalysisSettings((is, env, name, settings) -> new SynonymGraphTokenFilterFactory(is, env, this, name, settings)));
|
||||
return buildMapping(Component.FILTER, indexSettings, tokenFiltersSettings, Collections.unmodifiableMap(tokenFilters), prebuiltAnalysis.preConfiguredTokenFilters);
|
||||
|
||||
Map<String, TokenFilterFactory> mappings
|
||||
= buildMapping(Component.FILTER, indexSettings, tokenFiltersSettings, Collections.unmodifiableMap(tokenFilters), prebuiltAnalysis.preConfiguredTokenFilters);
|
||||
|
||||
// ReferringTokenFilters require references to other tokenfilters, so we pass these in
|
||||
// after all factories have been registered
|
||||
for (TokenFilterFactory tff : mappings.values()) {
|
||||
if (tff instanceof ReferringFilterFactory) {
|
||||
((ReferringFilterFactory)tff).setReferences(mappings);
|
||||
}
|
||||
}
|
||||
return mappings;
|
||||
}
|
||||
|
||||
public Map<String, TokenizerFactory> buildTokenizerFactories(IndexSettings indexSettings) throws IOException {
|
||||
|
|
|
@ -0,0 +1,37 @@
|
|||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.index.analysis;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
 * Marks a {@link TokenFilterFactory} that refers to other filter factories by name.
 *
 * The analysis registry will call {@link #setReferences(Map)} with a map of all
 * available TokenFilterFactories after all factories have been registered, so that
 * implementations can resolve their named references; the referenced factories do
 * not need to exist yet at construction time.
 */
public interface ReferringFilterFactory {

    /**
     * Called with a map of all registered filter factories, keyed by filter name.
     */
    void setReferences(Map<String, TokenFilterFactory> factories);

}
|
|
@ -20,7 +20,6 @@
|
|||
package org.elasticsearch.index.analysis;
|
||||
|
||||
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
|
||||
|
||||
import org.apache.lucene.analysis.MockTokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.en.EnglishAnalyzer;
|
||||
|
|
Loading…
Reference in New Issue