Allow TokenFilterFactories to rewrite themselves against their preceding chain (#33702)
We currently special-case SynonymFilterFactory and SynonymGraphFilterFactory, which need to know their predecessors in the analysis chain in order to correctly analyze their synonym lists. This special-casing doesn't work with referring filter factories, such as the Multiplexer or Conditional filters. We also have a number of filters (e.g. the Multiplexer) that will break synonyms when they appear before them in a chain, because they produce multiple tokens at the same position.

This commit adds two methods to the TokenFilterFactory interface.

* `getChainAwareTokenFilterFactory()` allows a filter factory to rewrite itself against its preceding filter chain, or to resolve references to other filters. It replaces `ReferringFilterFactory` and `CustomAnalyzerProvider.checkAndApplySynonymFilter`, and by default returns `this`.
* `getSynonymFilter()` defines which filter should be applied in place of this one when building a synonym list `Analyzer`. By default it returns the filter itself; filters that must not be applied to synonyms (such as the Multiplexer) return an identity filter instead.

Fixes #33609
This commit is contained in:
parent
4190a9f1e9
commit
5107949402
|
@ -114,3 +114,11 @@ And it'd respond:
|
|||
|
||||
<1> The stemmer has also emitted a token `home` at position 1, but because it is a
|
||||
duplicate of this token it has been removed from the token stream.
|
||||
|
||||
NOTE: The synonym and synonym_graph filters use their preceding analysis chain to
|
||||
parse and analyse their synonym lists, and ignore any token filters in the chain
|
||||
that produce multiple tokens at the same position. This means that any filters
|
||||
within the multiplexer will be ignored for the purpose of synonyms. If you want to
|
||||
use filters contained within the multiplexer for parsing synonyms (for example, to
|
||||
apply stemming to the synonym lists), then you should append the synonym filter
|
||||
to the relevant multiplexer filter list.
|
|
@ -29,39 +29,64 @@ import org.elasticsearch.common.settings.Settings;
|
|||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.ReferringFilterFactory;
|
||||
import org.elasticsearch.index.analysis.CharFilterFactory;
|
||||
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.TokenizerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.function.Function;
|
||||
|
||||
public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory implements ReferringFilterFactory {
|
||||
public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||
|
||||
private List<TokenFilterFactory> filters;
|
||||
private List<String> filterNames;
|
||||
private final boolean preserveOriginal;
|
||||
|
||||
private static final TokenFilterFactory IDENTITY_FACTORY = new TokenFilterFactory() {
|
||||
@Override
|
||||
public String name() {
|
||||
return "identity";
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream) {
|
||||
return tokenStream;
|
||||
}
|
||||
};
|
||||
|
||||
public MultiplexerTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) throws IOException {
|
||||
super(indexSettings, name, settings);
|
||||
this.filterNames = settings.getAsList("filters");
|
||||
this.preserveOriginal = settings.getAsBoolean("preserve_original", true);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream) {
|
||||
throw new UnsupportedOperationException("TokenFilterFactory.getChainAwareTokenFilterFactory() must be called first");
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
|
||||
List<TokenFilterFactory> previousTokenFilters,
|
||||
Function<String, TokenFilterFactory> allFilters) {
|
||||
List<TokenFilterFactory> filters = new ArrayList<>();
|
||||
if (preserveOriginal) {
|
||||
filters.add(IDENTITY_FILTER);
|
||||
}
|
||||
for (String filter : filterNames) {
|
||||
String[] parts = Strings.tokenizeToStringArray(filter, ",");
|
||||
if (parts.length == 1) {
|
||||
TokenFilterFactory factory = resolveFilterFactory(allFilters, parts[0]);
|
||||
factory = factory.getChainAwareTokenFilterFactory(tokenizer, charFilters, previousTokenFilters, allFilters);
|
||||
filters.add(factory);
|
||||
} else {
|
||||
List<TokenFilterFactory> existingChain = new ArrayList<>(previousTokenFilters);
|
||||
List<TokenFilterFactory> chain = new ArrayList<>();
|
||||
for (String subfilter : parts) {
|
||||
TokenFilterFactory factory = resolveFilterFactory(allFilters, subfilter);
|
||||
factory = factory.getChainAwareTokenFilterFactory(tokenizer, charFilters, existingChain, allFilters);
|
||||
chain.add(factory);
|
||||
existingChain.add(factory);
|
||||
}
|
||||
filters.add(chainFilters(filter, chain));
|
||||
}
|
||||
}
|
||||
|
||||
return new TokenFilterFactory() {
|
||||
@Override
|
||||
public String name() {
|
||||
return MultiplexerTokenFilterFactory.this.name();
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream) {
|
||||
List<Function<TokenStream, TokenStream>> functions = new ArrayList<>();
|
||||
|
@ -72,23 +97,10 @@ public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory im
|
|||
}
|
||||
|
||||
@Override
|
||||
public void setReferences(Map<String, TokenFilterFactory> factories) {
|
||||
filters = new ArrayList<>();
|
||||
if (preserveOriginal) {
|
||||
filters.add(IDENTITY_FACTORY);
|
||||
}
|
||||
for (String filter : filterNames) {
|
||||
String[] parts = Strings.tokenizeToStringArray(filter, ",");
|
||||
if (parts.length == 1) {
|
||||
filters.add(resolveFilterFactory(factories, parts[0]));
|
||||
} else {
|
||||
List<TokenFilterFactory> chain = new ArrayList<>();
|
||||
for (String subfilter : parts) {
|
||||
chain.add(resolveFilterFactory(factories, subfilter));
|
||||
}
|
||||
filters.add(chainFilters(filter, chain));
|
||||
}
|
||||
public TokenFilterFactory getSynonymFilter() {
|
||||
return IDENTITY_FILTER;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
private TokenFilterFactory chainFilters(String name, List<TokenFilterFactory> filters) {
|
||||
|
@ -108,11 +120,12 @@ public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory im
|
|||
};
|
||||
}
|
||||
|
||||
private TokenFilterFactory resolveFilterFactory(Map<String, TokenFilterFactory> factories, String name) {
|
||||
if (factories.containsKey(name) == false) {
|
||||
private TokenFilterFactory resolveFilterFactory(Function<String, TokenFilterFactory> factories, String name) {
|
||||
TokenFilterFactory factory = factories.apply(name);
|
||||
if (factory == null) {
|
||||
throw new IllegalArgumentException("Multiplexing filter [" + name() + "] refers to undefined tokenfilter [" + name + "]");
|
||||
} else {
|
||||
return factories.get(name);
|
||||
return factory;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -24,26 +24,24 @@ import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter;
|
|||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.ReferringFilterFactory;
|
||||
import org.elasticsearch.index.analysis.CharFilterFactory;
|
||||
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.TokenizerFactory;
|
||||
import org.elasticsearch.script.Script;
|
||||
import org.elasticsearch.script.ScriptService;
|
||||
import org.elasticsearch.script.ScriptType;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.function.Function;
|
||||
|
||||
/**
|
||||
* A factory for a conditional token filter that only applies child filters if the underlying token
|
||||
* matches an {@link AnalysisPredicateScript}
|
||||
*/
|
||||
public class ScriptedConditionTokenFilterFactory extends AbstractTokenFilterFactory implements ReferringFilterFactory {
|
||||
public class ScriptedConditionTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||
|
||||
private final AnalysisPredicateScript.Factory factory;
|
||||
private final List<TokenFilterFactory> filters = new ArrayList<>();
|
||||
private final List<String> filterNames;
|
||||
|
||||
ScriptedConditionTokenFilterFactory(IndexSettings indexSettings, String name,
|
||||
|
@ -63,6 +61,34 @@ public class ScriptedConditionTokenFilterFactory extends AbstractTokenFilterFact
|
|||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream) {
|
||||
throw new UnsupportedOperationException("getChainAwareTokenFilterFactory should be called first");
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
|
||||
List<TokenFilterFactory> previousTokenFilters,
|
||||
Function<String, TokenFilterFactory> allFilters) {
|
||||
List<TokenFilterFactory> filters = new ArrayList<>();
|
||||
List<TokenFilterFactory> existingChain = new ArrayList<>(previousTokenFilters);
|
||||
for (String filter : filterNames) {
|
||||
TokenFilterFactory tff = allFilters.apply(filter);
|
||||
if (tff == null) {
|
||||
throw new IllegalArgumentException("ScriptedConditionTokenFilter [" + name() +
|
||||
"] refers to undefined token filter [" + filter + "]");
|
||||
}
|
||||
tff = tff.getChainAwareTokenFilterFactory(tokenizer, charFilters, existingChain, allFilters);
|
||||
filters.add(tff);
|
||||
existingChain.add(tff);
|
||||
}
|
||||
|
||||
return new TokenFilterFactory() {
|
||||
@Override
|
||||
public String name() {
|
||||
return ScriptedConditionTokenFilterFactory.this.name();
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream) {
|
||||
Function<TokenStream, TokenStream> filter = in -> {
|
||||
|
@ -73,6 +99,8 @@ public class ScriptedConditionTokenFilterFactory extends AbstractTokenFilterFact
|
|||
};
|
||||
return new ScriptedConditionTokenFilter(tokenStream, filter, factory.newInstance());
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
private static class ScriptedConditionTokenFilter extends ConditionalTokenFilter {
|
||||
|
||||
|
@ -87,22 +115,10 @@ public class ScriptedConditionTokenFilterFactory extends AbstractTokenFilterFact
|
|||
}
|
||||
|
||||
@Override
|
||||
protected boolean shouldFilter() throws IOException {
|
||||
protected boolean shouldFilter() {
|
||||
token.updatePosition();
|
||||
return script.execute(token);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setReferences(Map<String, TokenFilterFactory> factories) {
|
||||
for (String filter : filterNames) {
|
||||
TokenFilterFactory tff = factories.get(filter);
|
||||
if (tff == null) {
|
||||
throw new IllegalArgumentException("ScriptedConditionTokenFilter [" + name() +
|
||||
"] refers to undefined token filter [" + filter + "]");
|
||||
}
|
||||
filters.add(tff);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -20,6 +20,7 @@
|
|||
package org.elasticsearch.analysis.common;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.elasticsearch.Version;
|
||||
|
@ -117,6 +118,26 @@ public class SynonymsAnalysisTests extends ESTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
public void testSynonymsWithMultiplexer() throws IOException {
|
||||
Settings settings = Settings.builder()
|
||||
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
|
||||
.put("path.home", createTempDir().toString())
|
||||
.put("index.analysis.filter.synonyms.type", "synonym")
|
||||
.putList("index.analysis.filter.synonyms.synonyms", "programmer, developer")
|
||||
.put("index.analysis.filter.my_english.type", "stemmer")
|
||||
.put("index.analysis.filter.my_english.language", "porter2")
|
||||
.put("index.analysis.filter.stem_repeat.type", "multiplexer")
|
||||
.putList("index.analysis.filter.stem_repeat.filters", "my_english, synonyms")
|
||||
.put("index.analysis.analyzer.synonymAnalyzer.tokenizer", "standard")
|
||||
.putList("index.analysis.analyzer.synonymAnalyzer.filter", "lowercase", "stem_repeat")
|
||||
.build();
|
||||
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
|
||||
indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers;
|
||||
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(indexAnalyzers.get("synonymAnalyzer"), "Some developers are odd",
|
||||
new String[]{ "some", "developers", "develop", "programm", "are", "odd" },
|
||||
new int[]{ 1, 1, 0, 0, 1, 1 });
|
||||
}
|
||||
|
||||
private void match(String analyzerName, String source, String target) throws IOException {
|
||||
Analyzer analyzer = indexAnalyzers.get(analyzerName).analyzer();
|
||||
|
|
|
@ -48,11 +48,9 @@ import org.elasticsearch.index.IndexSettings;
|
|||
import org.elasticsearch.index.analysis.AnalysisRegistry;
|
||||
import org.elasticsearch.index.analysis.CharFilterFactory;
|
||||
import org.elasticsearch.index.analysis.CustomAnalyzer;
|
||||
import org.elasticsearch.index.analysis.CustomAnalyzerProvider;
|
||||
import org.elasticsearch.index.analysis.IndexAnalyzers;
|
||||
import org.elasticsearch.index.analysis.MultiTermAwareComponent;
|
||||
import org.elasticsearch.index.analysis.NamedAnalyzer;
|
||||
import org.elasticsearch.index.analysis.ReferringFilterFactory;
|
||||
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.TokenizerFactory;
|
||||
import org.elasticsearch.index.mapper.KeywordFieldMapper;
|
||||
|
@ -66,6 +64,7 @@ import org.elasticsearch.transport.TransportService;
|
|||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.io.UncheckedIOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
|
@ -73,6 +72,7 @@ import java.util.Locale;
|
|||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeMap;
|
||||
import java.util.function.Function;
|
||||
|
||||
/**
|
||||
* Transport action used to execute analyze requests
|
||||
|
@ -571,11 +571,48 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
|
|||
return charFilterFactoryList;
|
||||
}
|
||||
|
||||
public static class DeferredTokenFilterRegistry implements Function<String, TokenFilterFactory> {
|
||||
|
||||
private final AnalysisRegistry analysisRegistry;
|
||||
private final IndexSettings indexSettings;
|
||||
Map<String, TokenFilterFactory> prebuiltFilters;
|
||||
|
||||
public DeferredTokenFilterRegistry(AnalysisRegistry analysisRegistry, IndexSettings indexSettings) {
|
||||
this.analysisRegistry = analysisRegistry;
|
||||
if (indexSettings == null) {
|
||||
// Settings are null when _analyze is called with no index name, so
|
||||
// we create dummy settings which will make prebuilt analysis components
|
||||
// available
|
||||
Settings settings = Settings.builder()
|
||||
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
|
||||
.put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0)
|
||||
.put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
|
||||
.put(IndexMetaData.SETTING_INDEX_UUID, UUIDs.randomBase64UUID())
|
||||
.build();
|
||||
IndexMetaData metaData = IndexMetaData.builder(IndexMetaData.INDEX_UUID_NA_VALUE).settings(settings).build();
|
||||
indexSettings = new IndexSettings(metaData, Settings.EMPTY);
|
||||
}
|
||||
this.indexSettings = indexSettings;
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenFilterFactory apply(String s) {
|
||||
if (prebuiltFilters == null) {
|
||||
try {
|
||||
prebuiltFilters = analysisRegistry.buildTokenFilterFactories(indexSettings);
|
||||
} catch (IOException e) {
|
||||
throw new UncheckedIOException(e);
|
||||
}
|
||||
}
|
||||
return prebuiltFilters.get(s);
|
||||
}
|
||||
}
|
||||
|
||||
private static List<TokenFilterFactory> parseTokenFilterFactories(AnalyzeRequest request, IndexSettings indexSettings, AnalysisRegistry analysisRegistry,
|
||||
Environment environment, Tuple<String, TokenizerFactory> tokenizerFactory,
|
||||
List<CharFilterFactory> charFilterFactoryList, boolean normalizer) throws IOException {
|
||||
List<TokenFilterFactory> tokenFilterFactoryList = new ArrayList<>();
|
||||
List<ReferringFilterFactory> referringFilters = new ArrayList<>();
|
||||
DeferredTokenFilterRegistry deferredRegistry = new DeferredTokenFilterRegistry(analysisRegistry, indexSettings);
|
||||
if (request.tokenFilters() != null && request.tokenFilters().size() > 0) {
|
||||
List<AnalyzeRequest.NameOrDefinition> tokenFilters = request.tokenFilters();
|
||||
for (AnalyzeRequest.NameOrDefinition tokenFilter : tokenFilters) {
|
||||
|
@ -594,11 +631,8 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
|
|||
}
|
||||
// Need to set anonymous "name" of tokenfilter
|
||||
tokenFilterFactory = tokenFilterFactoryFactory.get(getNaIndexSettings(settings), environment, "_anonymous_tokenfilter", settings);
|
||||
tokenFilterFactory = CustomAnalyzerProvider.checkAndApplySynonymFilter(tokenFilterFactory, tokenizerFactory.v1(), tokenizerFactory.v2(), tokenFilterFactoryList,
|
||||
charFilterFactoryList, environment);
|
||||
if (tokenFilterFactory instanceof ReferringFilterFactory) {
|
||||
referringFilters.add((ReferringFilterFactory)tokenFilterFactory);
|
||||
}
|
||||
tokenFilterFactory = tokenFilterFactory.getChainAwareTokenFilterFactory(tokenizerFactory.v2(), charFilterFactoryList,
|
||||
tokenFilterFactoryList, deferredRegistry);
|
||||
|
||||
} else {
|
||||
AnalysisModule.AnalysisProvider<TokenFilterFactory> tokenFilterFactoryFactory;
|
||||
|
@ -616,8 +650,8 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
|
|||
Settings settings = AnalysisRegistry.getSettingsFromIndexSettings(indexSettings,
|
||||
AnalysisRegistry.INDEX_ANALYSIS_FILTER + "." + tokenFilter.name);
|
||||
tokenFilterFactory = tokenFilterFactoryFactory.get(indexSettings, environment, tokenFilter.name, settings);
|
||||
tokenFilterFactory = CustomAnalyzerProvider.checkAndApplySynonymFilter(tokenFilterFactory, tokenizerFactory.v1(), tokenizerFactory.v2(), tokenFilterFactoryList,
|
||||
charFilterFactoryList, environment);
|
||||
tokenFilterFactory = tokenFilterFactory.getChainAwareTokenFilterFactory(tokenizerFactory.v2(), charFilterFactoryList,
|
||||
tokenFilterFactoryList, deferredRegistry);
|
||||
}
|
||||
}
|
||||
if (tokenFilterFactory == null) {
|
||||
|
@ -633,26 +667,6 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
|
|||
tokenFilterFactoryList.add(tokenFilterFactory);
|
||||
}
|
||||
}
|
||||
if (referringFilters.isEmpty() == false) {
|
||||
// The request included at least one custom referring tokenfilter that has not already been built by the
|
||||
// analysis registry, so we need to set its references. Note that this will only apply pre-built
|
||||
// tokenfilters
|
||||
if (indexSettings == null) {
|
||||
Settings settings = Settings.builder()
|
||||
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
|
||||
.put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0)
|
||||
.put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
|
||||
.put(IndexMetaData.SETTING_INDEX_UUID, UUIDs.randomBase64UUID())
|
||||
.build();
|
||||
IndexMetaData metaData = IndexMetaData.builder(IndexMetaData.INDEX_UUID_NA_VALUE).settings(settings).build();
|
||||
indexSettings = new IndexSettings(metaData, Settings.EMPTY);
|
||||
}
|
||||
Map<String, TokenFilterFactory> prebuiltFilters = analysisRegistry.buildTokenFilterFactories(indexSettings);
|
||||
for (ReferringFilterFactory rff : referringFilters) {
|
||||
rff.setReferences(prebuiltFilters);
|
||||
}
|
||||
|
||||
}
|
||||
return tokenFilterFactoryList;
|
||||
}
|
||||
|
||||
|
|
|
@ -167,17 +167,7 @@ public final class AnalysisRegistry implements Closeable {
|
|||
tokenFilters.put("synonym", requiresAnalysisSettings((is, env, name, settings) -> new SynonymTokenFilterFactory(is, env, this, name, settings)));
|
||||
tokenFilters.put("synonym_graph", requiresAnalysisSettings((is, env, name, settings) -> new SynonymGraphTokenFilterFactory(is, env, this, name, settings)));
|
||||
|
||||
Map<String, TokenFilterFactory> mappings
|
||||
= buildMapping(Component.FILTER, indexSettings, tokenFiltersSettings, Collections.unmodifiableMap(tokenFilters), prebuiltAnalysis.preConfiguredTokenFilters);
|
||||
|
||||
// ReferringTokenFilters require references to other tokenfilters, so we pass these in
|
||||
// after all factories have been registered
|
||||
for (TokenFilterFactory tff : mappings.values()) {
|
||||
if (tff instanceof ReferringFilterFactory) {
|
||||
((ReferringFilterFactory)tff).setReferences(mappings);
|
||||
}
|
||||
}
|
||||
return mappings;
|
||||
return buildMapping(Component.FILTER, indexSettings, tokenFiltersSettings, Collections.unmodifiableMap(tokenFilters), prebuiltAnalysis.preConfiguredTokenFilters);
|
||||
}
|
||||
|
||||
public Map<String, TokenizerFactory> buildTokenizerFactories(IndexSettings indexSettings) throws IOException {
|
||||
|
|
|
@ -81,9 +81,7 @@ public class CustomAnalyzerProvider extends AbstractIndexAnalyzerProvider<Custom
|
|||
if (tokenFilter == null) {
|
||||
throw new IllegalArgumentException("Custom Analyzer [" + name() + "] failed to find filter under name [" + tokenFilterName + "]");
|
||||
}
|
||||
// no need offsetGap for tokenize synonyms
|
||||
tokenFilter = checkAndApplySynonymFilter(tokenFilter, tokenizerName, tokenizer, tokenFilterList, charFiltersList,
|
||||
this.environment);
|
||||
tokenFilter = tokenFilter.getChainAwareTokenFilterFactory(tokenizer, charFiltersList, tokenFilterList, tokenFilters::get);
|
||||
tokenFilterList.add(tokenFilter);
|
||||
}
|
||||
|
||||
|
@ -95,33 +93,6 @@ public class CustomAnalyzerProvider extends AbstractIndexAnalyzerProvider<Custom
|
|||
);
|
||||
}
|
||||
|
||||
public static TokenFilterFactory checkAndApplySynonymFilter(TokenFilterFactory tokenFilter, String tokenizerName, TokenizerFactory tokenizer,
|
||||
List<TokenFilterFactory> tokenFilterList,
|
||||
List<CharFilterFactory> charFiltersList, Environment env) {
|
||||
if (tokenFilter instanceof SynonymGraphTokenFilterFactory) {
|
||||
List<TokenFilterFactory> tokenFiltersListForSynonym = new ArrayList<>(tokenFilterList);
|
||||
|
||||
try (CustomAnalyzer analyzer = new CustomAnalyzer(tokenizerName, tokenizer,
|
||||
charFiltersList.toArray(new CharFilterFactory[charFiltersList.size()]),
|
||||
tokenFiltersListForSynonym.toArray(new TokenFilterFactory[tokenFiltersListForSynonym.size()]),
|
||||
TextFieldMapper.Defaults.POSITION_INCREMENT_GAP,
|
||||
-1)){
|
||||
tokenFilter = ((SynonymGraphTokenFilterFactory) tokenFilter).createPerAnalyzerSynonymGraphFactory(analyzer, env);
|
||||
}
|
||||
|
||||
} else if (tokenFilter instanceof SynonymTokenFilterFactory) {
|
||||
List<TokenFilterFactory> tokenFiltersListForSynonym = new ArrayList<>(tokenFilterList);
|
||||
try (CustomAnalyzer analyzer = new CustomAnalyzer(tokenizerName, tokenizer,
|
||||
charFiltersList.toArray(new CharFilterFactory[charFiltersList.size()]),
|
||||
tokenFiltersListForSynonym.toArray(new TokenFilterFactory[tokenFiltersListForSynonym.size()]),
|
||||
TextFieldMapper.Defaults.POSITION_INCREMENT_GAP,
|
||||
-1)) {
|
||||
tokenFilter = ((SynonymTokenFilterFactory) tokenFilter).createPerAnalyzerSynonymFactory(analyzer, env);
|
||||
}
|
||||
}
|
||||
return tokenFilter;
|
||||
}
|
||||
|
||||
@Override
|
||||
public CustomAnalyzer get() {
|
||||
return this.customAnalyzer;
|
||||
|
|
|
@ -1,37 +0,0 @@
|
|||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.index.analysis;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Marks a {@link TokenFilterFactory} that refers to other filter factories.
|
||||
*
|
||||
* The analysis registry will call {@link #setReferences(Map)} with a map of all
|
||||
* available TokenFilterFactories after all factories have been registered
|
||||
*/
|
||||
public interface ReferringFilterFactory {
|
||||
|
||||
/**
|
||||
* Called with a map of all registered filter factories
|
||||
*/
|
||||
void setReferences(Map<String, TokenFilterFactory> factories);
|
||||
|
||||
}
|
|
@ -28,9 +28,11 @@ import org.elasticsearch.env.Environment;
|
|||
import org.elasticsearch.index.IndexSettings;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.List;
|
||||
import java.util.function.Function;
|
||||
|
||||
public class SynonymGraphTokenFilterFactory extends SynonymTokenFilterFactory {
|
||||
|
||||
public SynonymGraphTokenFilterFactory(IndexSettings indexSettings, Environment env, AnalysisRegistry analysisRegistry,
|
||||
String name, Settings settings) throws IOException {
|
||||
super(indexSettings, env, analysisRegistry, name, settings);
|
||||
|
@ -41,42 +43,24 @@ public class SynonymGraphTokenFilterFactory extends SynonymTokenFilterFactory {
|
|||
throw new IllegalStateException("Call createPerAnalyzerSynonymGraphFactory to specialize this factory for an analysis chain first");
|
||||
}
|
||||
|
||||
Factory createPerAnalyzerSynonymGraphFactory(Analyzer analyzerForParseSynonym, Environment env){
|
||||
return new Factory("synonymgraph", analyzerForParseSynonym, getRulesFromSettings(env));
|
||||
}
|
||||
|
||||
public class Factory implements TokenFilterFactory{
|
||||
|
||||
private final String name;
|
||||
private final SynonymMap synonymMap;
|
||||
|
||||
public Factory(String name, final Analyzer analyzerForParseSynonym, Reader rulesReader) {
|
||||
this.name = name;
|
||||
|
||||
try {
|
||||
SynonymMap.Builder parser;
|
||||
if ("wordnet".equalsIgnoreCase(format)) {
|
||||
parser = new ESWordnetSynonymParser(true, expand, lenient, analyzerForParseSynonym);
|
||||
((ESWordnetSynonymParser) parser).parse(rulesReader);
|
||||
} else {
|
||||
parser = new ESSolrSynonymParser(true, expand, lenient, analyzerForParseSynonym);
|
||||
((ESSolrSynonymParser) parser).parse(rulesReader);
|
||||
}
|
||||
synonymMap = parser.build();
|
||||
} catch (Exception e) {
|
||||
throw new IllegalArgumentException("failed to build synonyms", e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
|
||||
List<TokenFilterFactory> previousTokenFilters,
|
||||
Function<String, TokenFilterFactory> allFilters) {
|
||||
final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters);
|
||||
final SynonymMap synonyms = buildSynonyms(analyzer, getRulesFromSettings(environment));
|
||||
final String name = name();
|
||||
return new TokenFilterFactory() {
|
||||
@Override
|
||||
public String name() {
|
||||
return this.name;
|
||||
return name;
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream) {
|
||||
// fst is null means no synonyms
|
||||
return synonymMap.fst == null ? tokenStream : new SynonymGraphFilter(tokenStream, synonymMap, false);
|
||||
return synonyms.fst == null ? tokenStream : new SynonymGraphFilter(tokenStream, synonyms, false);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -31,6 +31,7 @@ import java.io.IOException;
|
|||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.util.List;
|
||||
import java.util.function.Function;
|
||||
|
||||
public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||
|
||||
|
@ -38,6 +39,7 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
|
|||
protected final boolean expand;
|
||||
protected final boolean lenient;
|
||||
protected final Settings settings;
|
||||
protected final Environment environment;
|
||||
|
||||
public SynonymTokenFilterFactory(IndexSettings indexSettings, Environment env, AnalysisRegistry analysisRegistry,
|
||||
String name, Settings settings) throws IOException {
|
||||
|
@ -53,6 +55,7 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
|
|||
this.expand = settings.getAsBoolean("expand", true);
|
||||
this.lenient = settings.getAsBoolean("lenient", false);
|
||||
this.format = settings.get("format", "");
|
||||
this.environment = env;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -60,6 +63,50 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
|
|||
throw new IllegalStateException("Call createPerAnalyzerSynonymFactory to specialize this factory for an analysis chain first");
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
|
||||
List<TokenFilterFactory> previousTokenFilters,
|
||||
Function<String, TokenFilterFactory> allFilters) {
|
||||
final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters);
|
||||
final SynonymMap synonyms = buildSynonyms(analyzer, getRulesFromSettings(environment));
|
||||
final String name = name();
|
||||
return new TokenFilterFactory() {
|
||||
@Override
|
||||
public String name() {
|
||||
return name;
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream) {
|
||||
return synonyms.fst == null ? tokenStream : new SynonymFilter(tokenStream, synonyms, false);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
protected Analyzer buildSynonymAnalyzer(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
|
||||
List<TokenFilterFactory> tokenFilters) {
|
||||
return new CustomAnalyzer("synonyms", tokenizer, charFilters.toArray(new CharFilterFactory[0]),
|
||||
tokenFilters.stream()
|
||||
.map(TokenFilterFactory::getSynonymFilter)
|
||||
.toArray(TokenFilterFactory[]::new));
|
||||
}
|
||||
|
||||
protected SynonymMap buildSynonyms(Analyzer analyzer, Reader rules) {
|
||||
try {
|
||||
SynonymMap.Builder parser;
|
||||
if ("wordnet".equalsIgnoreCase(format)) {
|
||||
parser = new ESWordnetSynonymParser(true, expand, lenient, analyzer);
|
||||
((ESWordnetSynonymParser) parser).parse(rules);
|
||||
} else {
|
||||
parser = new ESSolrSynonymParser(true, expand, lenient, analyzer);
|
||||
((ESSolrSynonymParser) parser).parse(rules);
|
||||
}
|
||||
return parser.build();
|
||||
} catch (Exception e) {
|
||||
throw new IllegalArgumentException("failed to build synonyms", e);
|
||||
}
|
||||
}
|
||||
|
||||
protected Reader getRulesFromSettings(Environment env) {
|
||||
Reader rulesReader;
|
||||
if (settings.getAsList("synonyms", null) != null) {
|
||||
|
@ -77,44 +124,4 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
|
|||
return rulesReader;
|
||||
}
|
||||
|
||||
Factory createPerAnalyzerSynonymFactory(Analyzer analyzerForParseSynonym, Environment env){
|
||||
return new Factory("synonym", analyzerForParseSynonym, getRulesFromSettings(env));
|
||||
}
|
||||
|
||||
public class Factory implements TokenFilterFactory{
|
||||
|
||||
private final String name;
|
||||
private final SynonymMap synonymMap;
|
||||
|
||||
public Factory(String name, Analyzer analyzerForParseSynonym, Reader rulesReader) {
|
||||
|
||||
this.name = name;
|
||||
|
||||
try {
|
||||
SynonymMap.Builder parser;
|
||||
if ("wordnet".equalsIgnoreCase(format)) {
|
||||
parser = new ESWordnetSynonymParser(true, expand, lenient, analyzerForParseSynonym);
|
||||
((ESWordnetSynonymParser) parser).parse(rulesReader);
|
||||
} else {
|
||||
parser = new ESSolrSynonymParser(true, expand, lenient, analyzerForParseSynonym);
|
||||
((ESSolrSynonymParser) parser).parse(rulesReader);
|
||||
}
|
||||
synonymMap = parser.build();
|
||||
} catch (Exception e) {
|
||||
throw new IllegalArgumentException("failed to build synonyms", e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String name() {
|
||||
return this.name;
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream) {
|
||||
// fst is null means no synonyms
|
||||
return synonymMap.fst == null ? tokenStream : new SynonymFilter(tokenStream, synonymMap, false);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -23,6 +23,9 @@ import org.apache.lucene.analysis.TokenStream;
|
|||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.elasticsearch.search.fetch.subphase.highlight.FastVectorHighlighter;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.function.Function;
|
||||
|
||||
public interface TokenFilterFactory {
|
||||
String name();
|
||||
|
||||
|
@ -36,4 +39,43 @@ public interface TokenFilterFactory {
|
|||
default boolean breaksFastVectorHighlighter() {
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Rewrite the TokenFilterFactory to take into account the preceding analysis chain, or refer
|
||||
* to other TokenFilterFactories
|
||||
* @param tokenizer the TokenizerFactory for the preceding chain
|
||||
* @param charFilters any CharFilterFactories for the preceding chain
|
||||
* @param previousTokenFilters a list of TokenFilterFactories in the preceding chain
|
||||
* @param allFilters access to previously defined TokenFilterFactories
|
||||
*/
|
||||
default TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
|
||||
List<TokenFilterFactory> previousTokenFilters,
|
||||
Function<String, TokenFilterFactory> allFilters) {
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a version of this TokenFilterFactory appropriate for synonym parsing
|
||||
*
|
||||
* Filters that should not be applied to synonyms (for example, those that produce
|
||||
* multiple tokens) can return {@link #IDENTITY_FILTER}
|
||||
*/
|
||||
default TokenFilterFactory getSynonymFilter() {
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* A TokenFilterFactory that does no filtering to its TokenStream
|
||||
*/
|
||||
TokenFilterFactory IDENTITY_FILTER = new TokenFilterFactory() {
|
||||
@Override
|
||||
public String name() {
|
||||
return "identity";
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream) {
|
||||
return tokenStream;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
|
|
@ -20,7 +20,6 @@ import org.elasticsearch.index.IndexSettings;
|
|||
import org.elasticsearch.index.analysis.AnalysisRegistry;
|
||||
import org.elasticsearch.index.analysis.CharFilterFactory;
|
||||
import org.elasticsearch.index.analysis.CustomAnalyzer;
|
||||
import org.elasticsearch.index.analysis.CustomAnalyzerProvider;
|
||||
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.TokenizerFactory;
|
||||
import org.elasticsearch.indices.analysis.AnalysisModule;
|
||||
|
@ -217,6 +216,8 @@ public class CategorizationAnalyzer implements Closeable {
|
|||
Tuple<String, TokenizerFactory> tokenizerFactory,
|
||||
List<CharFilterFactory> charFilterFactoryList) throws IOException {
|
||||
List<CategorizationAnalyzerConfig.NameOrDefinition> tokenFilters = config.getTokenFilters();
|
||||
TransportAnalyzeAction.DeferredTokenFilterRegistry deferredRegistry
|
||||
= new TransportAnalyzeAction.DeferredTokenFilterRegistry(analysisRegistry, null);
|
||||
final List<TokenFilterFactory> tokenFilterFactoryList = new ArrayList<>();
|
||||
for (CategorizationAnalyzerConfig.NameOrDefinition tokenFilter : tokenFilters) {
|
||||
TokenFilterFactory tokenFilterFactory;
|
||||
|
@ -241,8 +242,8 @@ public class CategorizationAnalyzer implements Closeable {
|
|||
// Need to set anonymous "name" of token_filter
|
||||
tokenFilterFactory = tokenFilterFactoryFactory.get(buildDummyIndexSettings(settings), environment, "_anonymous_tokenfilter",
|
||||
settings);
|
||||
tokenFilterFactory = CustomAnalyzerProvider.checkAndApplySynonymFilter(tokenFilterFactory, tokenizerFactory.v1(),
|
||||
tokenizerFactory.v2(), tokenFilterFactoryList, charFilterFactoryList, environment);
|
||||
tokenFilterFactory = tokenFilterFactory.getChainAwareTokenFilterFactory(tokenizerFactory.v2(),
|
||||
charFilterFactoryList, tokenFilterFactoryList, deferredRegistry);
|
||||
}
|
||||
if (tokenFilterFactory == null) {
|
||||
throw new IllegalArgumentException("Failed to find or create token filter [" + tokenFilter + "]");
|
||||
|
|
Loading…
Reference in New Issue