Allow TokenFilterFactories to rewrite themselves against their preceding chain (#33702)
We currently special-case SynonymTokenFilterFactory and SynonymGraphTokenFilterFactory, which need to know their predecessors in the analysis chain in order to correctly analyze their synonym lists. This special-casing doesn't work with Referring filter factories, such as the Multiplexer or Conditional filters. We also have a number of filters (e.g. the Multiplexer) that will break synonyms when they appear before them in a chain, because they produce multiple tokens at the same position.

This commit adds two methods to the TokenFilterFactory interface.

* `getChainAwareTokenFilterFactory()` allows a filter factory to rewrite itself against its preceding filter chain, or to resolve references to other filters. It replaces `ReferringFilterFactory` and `CustomAnalyzerProvider.checkAndApplySynonymFilter`, and by default returns `this`.
* `getSynonymFilter()` returns the version of a filter that should be applied when building a synonym list `Analyzer`. Filters that must be skipped (such as the Multiplexer, which produces multiple tokens at the same position) return an identity filter instead. By default a filter is applied as-is.

Fixes #33609
This commit is contained in:
parent
4190a9f1e9
commit
5107949402
|
@ -113,4 +113,12 @@ And it'd respond:
|
||||||
// TESTRESPONSE
|
// TESTRESPONSE
|
||||||
|
|
||||||
<1> The stemmer has also emitted a token `home` at position 1, but because it is a
|
<1> The stemmer has also emitted a token `home` at position 1, but because it is a
|
||||||
duplicate of this token it has been removed from the token stream
|
duplicate of this token it has been removed from the token stream
|
||||||
|
|
||||||
|
NOTE: The synonym and synonym_graph filters use their preceding analysis chain to
|
||||||
|
parse and analyse their synonym lists, and ignore any token filters in the chain
|
||||||
|
that produce multiple tokens at the same position. This means that any filters
|
||||||
|
within the multiplexer will be ignored for the purpose of synonyms. If you want to
|
||||||
|
use filters contained within the multiplexer for parsing synonyms (for example, to
|
||||||
|
apply stemming to the synonym lists), then you should append the synonym filter
|
||||||
|
to the relevant multiplexer filter list.
|
|
@ -29,33 +29,20 @@ import org.elasticsearch.common.settings.Settings;
|
||||||
import org.elasticsearch.env.Environment;
|
import org.elasticsearch.env.Environment;
|
||||||
import org.elasticsearch.index.IndexSettings;
|
import org.elasticsearch.index.IndexSettings;
|
||||||
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
|
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
|
||||||
import org.elasticsearch.index.analysis.ReferringFilterFactory;
|
import org.elasticsearch.index.analysis.CharFilterFactory;
|
||||||
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
||||||
|
import org.elasticsearch.index.analysis.TokenizerFactory;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
|
||||||
import java.util.function.Function;
|
import java.util.function.Function;
|
||||||
|
|
||||||
public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory implements ReferringFilterFactory {
|
public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||||
|
|
||||||
private List<TokenFilterFactory> filters;
|
|
||||||
private List<String> filterNames;
|
private List<String> filterNames;
|
||||||
private final boolean preserveOriginal;
|
private final boolean preserveOriginal;
|
||||||
|
|
||||||
private static final TokenFilterFactory IDENTITY_FACTORY = new TokenFilterFactory() {
|
|
||||||
@Override
|
|
||||||
public String name() {
|
|
||||||
return "identity";
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public TokenStream create(TokenStream tokenStream) {
|
|
||||||
return tokenStream;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
public MultiplexerTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) throws IOException {
|
public MultiplexerTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) throws IOException {
|
||||||
super(indexSettings, name, settings);
|
super(indexSettings, name, settings);
|
||||||
this.filterNames = settings.getAsList("filters");
|
this.filterNames = settings.getAsList("filters");
|
||||||
|
@ -64,31 +51,56 @@ public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory im
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public TokenStream create(TokenStream tokenStream) {
|
public TokenStream create(TokenStream tokenStream) {
|
||||||
List<Function<TokenStream, TokenStream>> functions = new ArrayList<>();
|
throw new UnsupportedOperationException("TokenFilterFactory.getChainAwareTokenFilterFactory() must be called first");
|
||||||
for (TokenFilterFactory tff : filters) {
|
|
||||||
functions.add(tff::create);
|
|
||||||
}
|
|
||||||
return new RemoveDuplicatesTokenFilter(new MultiplexTokenFilter(tokenStream, functions));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void setReferences(Map<String, TokenFilterFactory> factories) {
|
public TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
|
||||||
filters = new ArrayList<>();
|
List<TokenFilterFactory> previousTokenFilters,
|
||||||
|
Function<String, TokenFilterFactory> allFilters) {
|
||||||
|
List<TokenFilterFactory> filters = new ArrayList<>();
|
||||||
if (preserveOriginal) {
|
if (preserveOriginal) {
|
||||||
filters.add(IDENTITY_FACTORY);
|
filters.add(IDENTITY_FILTER);
|
||||||
}
|
}
|
||||||
for (String filter : filterNames) {
|
for (String filter : filterNames) {
|
||||||
String[] parts = Strings.tokenizeToStringArray(filter, ",");
|
String[] parts = Strings.tokenizeToStringArray(filter, ",");
|
||||||
if (parts.length == 1) {
|
if (parts.length == 1) {
|
||||||
filters.add(resolveFilterFactory(factories, parts[0]));
|
TokenFilterFactory factory = resolveFilterFactory(allFilters, parts[0]);
|
||||||
|
factory = factory.getChainAwareTokenFilterFactory(tokenizer, charFilters, previousTokenFilters, allFilters);
|
||||||
|
filters.add(factory);
|
||||||
} else {
|
} else {
|
||||||
|
List<TokenFilterFactory> existingChain = new ArrayList<>(previousTokenFilters);
|
||||||
List<TokenFilterFactory> chain = new ArrayList<>();
|
List<TokenFilterFactory> chain = new ArrayList<>();
|
||||||
for (String subfilter : parts) {
|
for (String subfilter : parts) {
|
||||||
chain.add(resolveFilterFactory(factories, subfilter));
|
TokenFilterFactory factory = resolveFilterFactory(allFilters, subfilter);
|
||||||
|
factory = factory.getChainAwareTokenFilterFactory(tokenizer, charFilters, existingChain, allFilters);
|
||||||
|
chain.add(factory);
|
||||||
|
existingChain.add(factory);
|
||||||
}
|
}
|
||||||
filters.add(chainFilters(filter, chain));
|
filters.add(chainFilters(filter, chain));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return new TokenFilterFactory() {
|
||||||
|
@Override
|
||||||
|
public String name() {
|
||||||
|
return MultiplexerTokenFilterFactory.this.name();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TokenStream create(TokenStream tokenStream) {
|
||||||
|
List<Function<TokenStream, TokenStream>> functions = new ArrayList<>();
|
||||||
|
for (TokenFilterFactory tff : filters) {
|
||||||
|
functions.add(tff::create);
|
||||||
|
}
|
||||||
|
return new RemoveDuplicatesTokenFilter(new MultiplexTokenFilter(tokenStream, functions));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TokenFilterFactory getSynonymFilter() {
|
||||||
|
return IDENTITY_FILTER;
|
||||||
|
}
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
private TokenFilterFactory chainFilters(String name, List<TokenFilterFactory> filters) {
|
private TokenFilterFactory chainFilters(String name, List<TokenFilterFactory> filters) {
|
||||||
|
@ -108,11 +120,12 @@ public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory im
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
private TokenFilterFactory resolveFilterFactory(Map<String, TokenFilterFactory> factories, String name) {
|
private TokenFilterFactory resolveFilterFactory(Function<String, TokenFilterFactory> factories, String name) {
|
||||||
if (factories.containsKey(name) == false) {
|
TokenFilterFactory factory = factories.apply(name);
|
||||||
|
if (factory == null) {
|
||||||
throw new IllegalArgumentException("Multiplexing filter [" + name() + "] refers to undefined tokenfilter [" + name + "]");
|
throw new IllegalArgumentException("Multiplexing filter [" + name() + "] refers to undefined tokenfilter [" + name + "]");
|
||||||
} else {
|
} else {
|
||||||
return factories.get(name);
|
return factory;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -24,26 +24,24 @@ import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter;
|
||||||
import org.elasticsearch.common.settings.Settings;
|
import org.elasticsearch.common.settings.Settings;
|
||||||
import org.elasticsearch.index.IndexSettings;
|
import org.elasticsearch.index.IndexSettings;
|
||||||
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
|
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
|
||||||
import org.elasticsearch.index.analysis.ReferringFilterFactory;
|
import org.elasticsearch.index.analysis.CharFilterFactory;
|
||||||
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
||||||
|
import org.elasticsearch.index.analysis.TokenizerFactory;
|
||||||
import org.elasticsearch.script.Script;
|
import org.elasticsearch.script.Script;
|
||||||
import org.elasticsearch.script.ScriptService;
|
import org.elasticsearch.script.ScriptService;
|
||||||
import org.elasticsearch.script.ScriptType;
|
import org.elasticsearch.script.ScriptType;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
|
||||||
import java.util.function.Function;
|
import java.util.function.Function;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A factory for a conditional token filter that only applies child filters if the underlying token
|
* A factory for a conditional token filter that only applies child filters if the underlying token
|
||||||
* matches an {@link AnalysisPredicateScript}
|
* matches an {@link AnalysisPredicateScript}
|
||||||
*/
|
*/
|
||||||
public class ScriptedConditionTokenFilterFactory extends AbstractTokenFilterFactory implements ReferringFilterFactory {
|
public class ScriptedConditionTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||||
|
|
||||||
private final AnalysisPredicateScript.Factory factory;
|
private final AnalysisPredicateScript.Factory factory;
|
||||||
private final List<TokenFilterFactory> filters = new ArrayList<>();
|
|
||||||
private final List<String> filterNames;
|
private final List<String> filterNames;
|
||||||
|
|
||||||
ScriptedConditionTokenFilterFactory(IndexSettings indexSettings, String name,
|
ScriptedConditionTokenFilterFactory(IndexSettings indexSettings, String name,
|
||||||
|
@ -65,13 +63,43 @@ public class ScriptedConditionTokenFilterFactory extends AbstractTokenFilterFact
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public TokenStream create(TokenStream tokenStream) {
|
public TokenStream create(TokenStream tokenStream) {
|
||||||
Function<TokenStream, TokenStream> filter = in -> {
|
throw new UnsupportedOperationException("getChainAwareTokenFilterFactory should be called first");
|
||||||
for (TokenFilterFactory tff : filters) {
|
}
|
||||||
in = tff.create(in);
|
|
||||||
|
@Override
|
||||||
|
public TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
|
||||||
|
List<TokenFilterFactory> previousTokenFilters,
|
||||||
|
Function<String, TokenFilterFactory> allFilters) {
|
||||||
|
List<TokenFilterFactory> filters = new ArrayList<>();
|
||||||
|
List<TokenFilterFactory> existingChain = new ArrayList<>(previousTokenFilters);
|
||||||
|
for (String filter : filterNames) {
|
||||||
|
TokenFilterFactory tff = allFilters.apply(filter);
|
||||||
|
if (tff == null) {
|
||||||
|
throw new IllegalArgumentException("ScriptedConditionTokenFilter [" + name() +
|
||||||
|
"] refers to undefined token filter [" + filter + "]");
|
||||||
|
}
|
||||||
|
tff = tff.getChainAwareTokenFilterFactory(tokenizer, charFilters, existingChain, allFilters);
|
||||||
|
filters.add(tff);
|
||||||
|
existingChain.add(tff);
|
||||||
|
}
|
||||||
|
|
||||||
|
return new TokenFilterFactory() {
|
||||||
|
@Override
|
||||||
|
public String name() {
|
||||||
|
return ScriptedConditionTokenFilterFactory.this.name();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TokenStream create(TokenStream tokenStream) {
|
||||||
|
Function<TokenStream, TokenStream> filter = in -> {
|
||||||
|
for (TokenFilterFactory tff : filters) {
|
||||||
|
in = tff.create(in);
|
||||||
|
}
|
||||||
|
return in;
|
||||||
|
};
|
||||||
|
return new ScriptedConditionTokenFilter(tokenStream, filter, factory.newInstance());
|
||||||
}
|
}
|
||||||
return in;
|
|
||||||
};
|
};
|
||||||
return new ScriptedConditionTokenFilter(tokenStream, filter, factory.newInstance());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static class ScriptedConditionTokenFilter extends ConditionalTokenFilter {
|
private static class ScriptedConditionTokenFilter extends ConditionalTokenFilter {
|
||||||
|
@ -80,29 +108,17 @@ public class ScriptedConditionTokenFilterFactory extends AbstractTokenFilterFact
|
||||||
private final AnalysisPredicateScript.Token token;
|
private final AnalysisPredicateScript.Token token;
|
||||||
|
|
||||||
ScriptedConditionTokenFilter(TokenStream input, Function<TokenStream, TokenStream> inputFactory,
|
ScriptedConditionTokenFilter(TokenStream input, Function<TokenStream, TokenStream> inputFactory,
|
||||||
AnalysisPredicateScript script) {
|
AnalysisPredicateScript script) {
|
||||||
super(input, inputFactory);
|
super(input, inputFactory);
|
||||||
this.script = script;
|
this.script = script;
|
||||||
this.token = new AnalysisPredicateScript.Token(this);
|
this.token = new AnalysisPredicateScript.Token(this);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected boolean shouldFilter() throws IOException {
|
protected boolean shouldFilter() {
|
||||||
token.updatePosition();
|
token.updatePosition();
|
||||||
return script.execute(token);
|
return script.execute(token);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public void setReferences(Map<String, TokenFilterFactory> factories) {
|
|
||||||
for (String filter : filterNames) {
|
|
||||||
TokenFilterFactory tff = factories.get(filter);
|
|
||||||
if (tff == null) {
|
|
||||||
throw new IllegalArgumentException("ScriptedConditionTokenFilter [" + name() +
|
|
||||||
"] refers to undefined token filter [" + filter + "]");
|
|
||||||
}
|
|
||||||
filters.add(tff);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -20,6 +20,7 @@
|
||||||
package org.elasticsearch.analysis.common;
|
package org.elasticsearch.analysis.common;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
import org.elasticsearch.Version;
|
import org.elasticsearch.Version;
|
||||||
|
@ -117,6 +118,26 @@ public class SynonymsAnalysisTests extends ESTestCase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testSynonymsWithMultiplexer() throws IOException {
|
||||||
|
Settings settings = Settings.builder()
|
||||||
|
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
|
||||||
|
.put("path.home", createTempDir().toString())
|
||||||
|
.put("index.analysis.filter.synonyms.type", "synonym")
|
||||||
|
.putList("index.analysis.filter.synonyms.synonyms", "programmer, developer")
|
||||||
|
.put("index.analysis.filter.my_english.type", "stemmer")
|
||||||
|
.put("index.analysis.filter.my_english.language", "porter2")
|
||||||
|
.put("index.analysis.filter.stem_repeat.type", "multiplexer")
|
||||||
|
.putList("index.analysis.filter.stem_repeat.filters", "my_english, synonyms")
|
||||||
|
.put("index.analysis.analyzer.synonymAnalyzer.tokenizer", "standard")
|
||||||
|
.putList("index.analysis.analyzer.synonymAnalyzer.filter", "lowercase", "stem_repeat")
|
||||||
|
.build();
|
||||||
|
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
|
||||||
|
indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers;
|
||||||
|
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(indexAnalyzers.get("synonymAnalyzer"), "Some developers are odd",
|
||||||
|
new String[]{ "some", "developers", "develop", "programm", "are", "odd" },
|
||||||
|
new int[]{ 1, 1, 0, 0, 1, 1 });
|
||||||
|
}
|
||||||
|
|
||||||
private void match(String analyzerName, String source, String target) throws IOException {
|
private void match(String analyzerName, String source, String target) throws IOException {
|
||||||
Analyzer analyzer = indexAnalyzers.get(analyzerName).analyzer();
|
Analyzer analyzer = indexAnalyzers.get(analyzerName).analyzer();
|
||||||
|
|
|
@ -48,11 +48,9 @@ import org.elasticsearch.index.IndexSettings;
|
||||||
import org.elasticsearch.index.analysis.AnalysisRegistry;
|
import org.elasticsearch.index.analysis.AnalysisRegistry;
|
||||||
import org.elasticsearch.index.analysis.CharFilterFactory;
|
import org.elasticsearch.index.analysis.CharFilterFactory;
|
||||||
import org.elasticsearch.index.analysis.CustomAnalyzer;
|
import org.elasticsearch.index.analysis.CustomAnalyzer;
|
||||||
import org.elasticsearch.index.analysis.CustomAnalyzerProvider;
|
|
||||||
import org.elasticsearch.index.analysis.IndexAnalyzers;
|
import org.elasticsearch.index.analysis.IndexAnalyzers;
|
||||||
import org.elasticsearch.index.analysis.MultiTermAwareComponent;
|
import org.elasticsearch.index.analysis.MultiTermAwareComponent;
|
||||||
import org.elasticsearch.index.analysis.NamedAnalyzer;
|
import org.elasticsearch.index.analysis.NamedAnalyzer;
|
||||||
import org.elasticsearch.index.analysis.ReferringFilterFactory;
|
|
||||||
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
||||||
import org.elasticsearch.index.analysis.TokenizerFactory;
|
import org.elasticsearch.index.analysis.TokenizerFactory;
|
||||||
import org.elasticsearch.index.mapper.KeywordFieldMapper;
|
import org.elasticsearch.index.mapper.KeywordFieldMapper;
|
||||||
|
@ -66,6 +64,7 @@ import org.elasticsearch.transport.TransportService;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
|
import java.io.UncheckedIOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
@ -73,6 +72,7 @@ import java.util.Locale;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.TreeMap;
|
import java.util.TreeMap;
|
||||||
|
import java.util.function.Function;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Transport action used to execute analyze requests
|
* Transport action used to execute analyze requests
|
||||||
|
@ -571,11 +571,48 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
|
||||||
return charFilterFactoryList;
|
return charFilterFactoryList;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static class DeferredTokenFilterRegistry implements Function<String, TokenFilterFactory> {
|
||||||
|
|
||||||
|
private final AnalysisRegistry analysisRegistry;
|
||||||
|
private final IndexSettings indexSettings;
|
||||||
|
Map<String, TokenFilterFactory> prebuiltFilters;
|
||||||
|
|
||||||
|
public DeferredTokenFilterRegistry(AnalysisRegistry analysisRegistry, IndexSettings indexSettings) {
|
||||||
|
this.analysisRegistry = analysisRegistry;
|
||||||
|
if (indexSettings == null) {
|
||||||
|
// Settings are null when _analyze is called with no index name, so
|
||||||
|
// we create dummy settings which will make prebuilt analysis components
|
||||||
|
// available
|
||||||
|
Settings settings = Settings.builder()
|
||||||
|
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
|
||||||
|
.put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0)
|
||||||
|
.put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
|
||||||
|
.put(IndexMetaData.SETTING_INDEX_UUID, UUIDs.randomBase64UUID())
|
||||||
|
.build();
|
||||||
|
IndexMetaData metaData = IndexMetaData.builder(IndexMetaData.INDEX_UUID_NA_VALUE).settings(settings).build();
|
||||||
|
indexSettings = new IndexSettings(metaData, Settings.EMPTY);
|
||||||
|
}
|
||||||
|
this.indexSettings = indexSettings;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TokenFilterFactory apply(String s) {
|
||||||
|
if (prebuiltFilters == null) {
|
||||||
|
try {
|
||||||
|
prebuiltFilters = analysisRegistry.buildTokenFilterFactories(indexSettings);
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new UncheckedIOException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return prebuiltFilters.get(s);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private static List<TokenFilterFactory> parseTokenFilterFactories(AnalyzeRequest request, IndexSettings indexSettings, AnalysisRegistry analysisRegistry,
|
private static List<TokenFilterFactory> parseTokenFilterFactories(AnalyzeRequest request, IndexSettings indexSettings, AnalysisRegistry analysisRegistry,
|
||||||
Environment environment, Tuple<String, TokenizerFactory> tokenizerFactory,
|
Environment environment, Tuple<String, TokenizerFactory> tokenizerFactory,
|
||||||
List<CharFilterFactory> charFilterFactoryList, boolean normalizer) throws IOException {
|
List<CharFilterFactory> charFilterFactoryList, boolean normalizer) throws IOException {
|
||||||
List<TokenFilterFactory> tokenFilterFactoryList = new ArrayList<>();
|
List<TokenFilterFactory> tokenFilterFactoryList = new ArrayList<>();
|
||||||
List<ReferringFilterFactory> referringFilters = new ArrayList<>();
|
DeferredTokenFilterRegistry deferredRegistry = new DeferredTokenFilterRegistry(analysisRegistry, indexSettings);
|
||||||
if (request.tokenFilters() != null && request.tokenFilters().size() > 0) {
|
if (request.tokenFilters() != null && request.tokenFilters().size() > 0) {
|
||||||
List<AnalyzeRequest.NameOrDefinition> tokenFilters = request.tokenFilters();
|
List<AnalyzeRequest.NameOrDefinition> tokenFilters = request.tokenFilters();
|
||||||
for (AnalyzeRequest.NameOrDefinition tokenFilter : tokenFilters) {
|
for (AnalyzeRequest.NameOrDefinition tokenFilter : tokenFilters) {
|
||||||
|
@ -594,11 +631,8 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
|
||||||
}
|
}
|
||||||
// Need to set anonymous "name" of tokenfilter
|
// Need to set anonymous "name" of tokenfilter
|
||||||
tokenFilterFactory = tokenFilterFactoryFactory.get(getNaIndexSettings(settings), environment, "_anonymous_tokenfilter", settings);
|
tokenFilterFactory = tokenFilterFactoryFactory.get(getNaIndexSettings(settings), environment, "_anonymous_tokenfilter", settings);
|
||||||
tokenFilterFactory = CustomAnalyzerProvider.checkAndApplySynonymFilter(tokenFilterFactory, tokenizerFactory.v1(), tokenizerFactory.v2(), tokenFilterFactoryList,
|
tokenFilterFactory = tokenFilterFactory.getChainAwareTokenFilterFactory(tokenizerFactory.v2(), charFilterFactoryList,
|
||||||
charFilterFactoryList, environment);
|
tokenFilterFactoryList, deferredRegistry);
|
||||||
if (tokenFilterFactory instanceof ReferringFilterFactory) {
|
|
||||||
referringFilters.add((ReferringFilterFactory)tokenFilterFactory);
|
|
||||||
}
|
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
AnalysisModule.AnalysisProvider<TokenFilterFactory> tokenFilterFactoryFactory;
|
AnalysisModule.AnalysisProvider<TokenFilterFactory> tokenFilterFactoryFactory;
|
||||||
|
@ -616,8 +650,8 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
|
||||||
Settings settings = AnalysisRegistry.getSettingsFromIndexSettings(indexSettings,
|
Settings settings = AnalysisRegistry.getSettingsFromIndexSettings(indexSettings,
|
||||||
AnalysisRegistry.INDEX_ANALYSIS_FILTER + "." + tokenFilter.name);
|
AnalysisRegistry.INDEX_ANALYSIS_FILTER + "." + tokenFilter.name);
|
||||||
tokenFilterFactory = tokenFilterFactoryFactory.get(indexSettings, environment, tokenFilter.name, settings);
|
tokenFilterFactory = tokenFilterFactoryFactory.get(indexSettings, environment, tokenFilter.name, settings);
|
||||||
tokenFilterFactory = CustomAnalyzerProvider.checkAndApplySynonymFilter(tokenFilterFactory, tokenizerFactory.v1(), tokenizerFactory.v2(), tokenFilterFactoryList,
|
tokenFilterFactory = tokenFilterFactory.getChainAwareTokenFilterFactory(tokenizerFactory.v2(), charFilterFactoryList,
|
||||||
charFilterFactoryList, environment);
|
tokenFilterFactoryList, deferredRegistry);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (tokenFilterFactory == null) {
|
if (tokenFilterFactory == null) {
|
||||||
|
@ -633,26 +667,6 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
|
||||||
tokenFilterFactoryList.add(tokenFilterFactory);
|
tokenFilterFactoryList.add(tokenFilterFactory);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (referringFilters.isEmpty() == false) {
|
|
||||||
// The request included at least one custom referring tokenfilter that has not already been built by the
|
|
||||||
// analysis registry, so we need to set its references. Note that this will only apply pre-built
|
|
||||||
// tokenfilters
|
|
||||||
if (indexSettings == null) {
|
|
||||||
Settings settings = Settings.builder()
|
|
||||||
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
|
|
||||||
.put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0)
|
|
||||||
.put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
|
|
||||||
.put(IndexMetaData.SETTING_INDEX_UUID, UUIDs.randomBase64UUID())
|
|
||||||
.build();
|
|
||||||
IndexMetaData metaData = IndexMetaData.builder(IndexMetaData.INDEX_UUID_NA_VALUE).settings(settings).build();
|
|
||||||
indexSettings = new IndexSettings(metaData, Settings.EMPTY);
|
|
||||||
}
|
|
||||||
Map<String, TokenFilterFactory> prebuiltFilters = analysisRegistry.buildTokenFilterFactories(indexSettings);
|
|
||||||
for (ReferringFilterFactory rff : referringFilters) {
|
|
||||||
rff.setReferences(prebuiltFilters);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
return tokenFilterFactoryList;
|
return tokenFilterFactoryList;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -167,17 +167,7 @@ public final class AnalysisRegistry implements Closeable {
|
||||||
tokenFilters.put("synonym", requiresAnalysisSettings((is, env, name, settings) -> new SynonymTokenFilterFactory(is, env, this, name, settings)));
|
tokenFilters.put("synonym", requiresAnalysisSettings((is, env, name, settings) -> new SynonymTokenFilterFactory(is, env, this, name, settings)));
|
||||||
tokenFilters.put("synonym_graph", requiresAnalysisSettings((is, env, name, settings) -> new SynonymGraphTokenFilterFactory(is, env, this, name, settings)));
|
tokenFilters.put("synonym_graph", requiresAnalysisSettings((is, env, name, settings) -> new SynonymGraphTokenFilterFactory(is, env, this, name, settings)));
|
||||||
|
|
||||||
Map<String, TokenFilterFactory> mappings
|
return buildMapping(Component.FILTER, indexSettings, tokenFiltersSettings, Collections.unmodifiableMap(tokenFilters), prebuiltAnalysis.preConfiguredTokenFilters);
|
||||||
= buildMapping(Component.FILTER, indexSettings, tokenFiltersSettings, Collections.unmodifiableMap(tokenFilters), prebuiltAnalysis.preConfiguredTokenFilters);
|
|
||||||
|
|
||||||
// ReferringTokenFilters require references to other tokenfilters, so we pass these in
|
|
||||||
// after all factories have been registered
|
|
||||||
for (TokenFilterFactory tff : mappings.values()) {
|
|
||||||
if (tff instanceof ReferringFilterFactory) {
|
|
||||||
((ReferringFilterFactory)tff).setReferences(mappings);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return mappings;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public Map<String, TokenizerFactory> buildTokenizerFactories(IndexSettings indexSettings) throws IOException {
|
public Map<String, TokenizerFactory> buildTokenizerFactories(IndexSettings indexSettings) throws IOException {
|
||||||
|
|
|
@ -81,9 +81,7 @@ public class CustomAnalyzerProvider extends AbstractIndexAnalyzerProvider<Custom
|
||||||
if (tokenFilter == null) {
|
if (tokenFilter == null) {
|
||||||
throw new IllegalArgumentException("Custom Analyzer [" + name() + "] failed to find filter under name [" + tokenFilterName + "]");
|
throw new IllegalArgumentException("Custom Analyzer [" + name() + "] failed to find filter under name [" + tokenFilterName + "]");
|
||||||
}
|
}
|
||||||
// no need offsetGap for tokenize synonyms
|
tokenFilter = tokenFilter.getChainAwareTokenFilterFactory(tokenizer, charFiltersList, tokenFilterList, tokenFilters::get);
|
||||||
tokenFilter = checkAndApplySynonymFilter(tokenFilter, tokenizerName, tokenizer, tokenFilterList, charFiltersList,
|
|
||||||
this.environment);
|
|
||||||
tokenFilterList.add(tokenFilter);
|
tokenFilterList.add(tokenFilter);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -95,33 +93,6 @@ public class CustomAnalyzerProvider extends AbstractIndexAnalyzerProvider<Custom
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static TokenFilterFactory checkAndApplySynonymFilter(TokenFilterFactory tokenFilter, String tokenizerName, TokenizerFactory tokenizer,
|
|
||||||
List<TokenFilterFactory> tokenFilterList,
|
|
||||||
List<CharFilterFactory> charFiltersList, Environment env) {
|
|
||||||
if (tokenFilter instanceof SynonymGraphTokenFilterFactory) {
|
|
||||||
List<TokenFilterFactory> tokenFiltersListForSynonym = new ArrayList<>(tokenFilterList);
|
|
||||||
|
|
||||||
try (CustomAnalyzer analyzer = new CustomAnalyzer(tokenizerName, tokenizer,
|
|
||||||
charFiltersList.toArray(new CharFilterFactory[charFiltersList.size()]),
|
|
||||||
tokenFiltersListForSynonym.toArray(new TokenFilterFactory[tokenFiltersListForSynonym.size()]),
|
|
||||||
TextFieldMapper.Defaults.POSITION_INCREMENT_GAP,
|
|
||||||
-1)){
|
|
||||||
tokenFilter = ((SynonymGraphTokenFilterFactory) tokenFilter).createPerAnalyzerSynonymGraphFactory(analyzer, env);
|
|
||||||
}
|
|
||||||
|
|
||||||
} else if (tokenFilter instanceof SynonymTokenFilterFactory) {
|
|
||||||
List<TokenFilterFactory> tokenFiltersListForSynonym = new ArrayList<>(tokenFilterList);
|
|
||||||
try (CustomAnalyzer analyzer = new CustomAnalyzer(tokenizerName, tokenizer,
|
|
||||||
charFiltersList.toArray(new CharFilterFactory[charFiltersList.size()]),
|
|
||||||
tokenFiltersListForSynonym.toArray(new TokenFilterFactory[tokenFiltersListForSynonym.size()]),
|
|
||||||
TextFieldMapper.Defaults.POSITION_INCREMENT_GAP,
|
|
||||||
-1)) {
|
|
||||||
tokenFilter = ((SynonymTokenFilterFactory) tokenFilter).createPerAnalyzerSynonymFactory(analyzer, env);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return tokenFilter;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public CustomAnalyzer get() {
|
public CustomAnalyzer get() {
|
||||||
return this.customAnalyzer;
|
return this.customAnalyzer;
|
||||||
|
|
|
@ -1,37 +0,0 @@
|
||||||
/*
|
|
||||||
* Licensed to Elasticsearch under one or more contributor
|
|
||||||
* license agreements. See the NOTICE file distributed with
|
|
||||||
* this work for additional information regarding copyright
|
|
||||||
* ownership. Elasticsearch licenses this file to you under
|
|
||||||
* the Apache License, Version 2.0 (the "License"); you may
|
|
||||||
* not use this file except in compliance with the License.
|
|
||||||
* You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing,
|
|
||||||
* software distributed under the License is distributed on an
|
|
||||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
||||||
* KIND, either express or implied. See the License for the
|
|
||||||
* specific language governing permissions and limitations
|
|
||||||
* under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package org.elasticsearch.index.analysis;
|
|
||||||
|
|
||||||
import java.util.Map;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Marks a {@link TokenFilterFactory} that refers to other filter factories.
|
|
||||||
*
|
|
||||||
* The analysis registry will call {@link #setReferences(Map)} with a map of all
|
|
||||||
* available TokenFilterFactories after all factories have been registered
|
|
||||||
*/
|
|
||||||
public interface ReferringFilterFactory {
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Called with a map of all registered filter factories
|
|
||||||
*/
|
|
||||||
void setReferences(Map<String, TokenFilterFactory> factories);
|
|
||||||
|
|
||||||
}
|
|
|
@ -28,9 +28,11 @@ import org.elasticsearch.env.Environment;
|
||||||
import org.elasticsearch.index.IndexSettings;
|
import org.elasticsearch.index.IndexSettings;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.util.List;
|
||||||
|
import java.util.function.Function;
|
||||||
|
|
||||||
public class SynonymGraphTokenFilterFactory extends SynonymTokenFilterFactory {
|
public class SynonymGraphTokenFilterFactory extends SynonymTokenFilterFactory {
|
||||||
|
|
||||||
public SynonymGraphTokenFilterFactory(IndexSettings indexSettings, Environment env, AnalysisRegistry analysisRegistry,
|
public SynonymGraphTokenFilterFactory(IndexSettings indexSettings, Environment env, AnalysisRegistry analysisRegistry,
|
||||||
String name, Settings settings) throws IOException {
|
String name, Settings settings) throws IOException {
|
||||||
super(indexSettings, env, analysisRegistry, name, settings);
|
super(indexSettings, env, analysisRegistry, name, settings);
|
||||||
|
@ -41,42 +43,24 @@ public class SynonymGraphTokenFilterFactory extends SynonymTokenFilterFactory {
|
||||||
throw new IllegalStateException("Call createPerAnalyzerSynonymGraphFactory to specialize this factory for an analysis chain first");
|
throw new IllegalStateException("Call createPerAnalyzerSynonymGraphFactory to specialize this factory for an analysis chain first");
|
||||||
}
|
}
|
||||||
|
|
||||||
Factory createPerAnalyzerSynonymGraphFactory(Analyzer analyzerForParseSynonym, Environment env){
|
@Override
|
||||||
return new Factory("synonymgraph", analyzerForParseSynonym, getRulesFromSettings(env));
|
public TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
|
||||||
}
|
List<TokenFilterFactory> previousTokenFilters,
|
||||||
|
Function<String, TokenFilterFactory> allFilters) {
|
||||||
public class Factory implements TokenFilterFactory{
|
final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters);
|
||||||
|
final SynonymMap synonyms = buildSynonyms(analyzer, getRulesFromSettings(environment));
|
||||||
private final String name;
|
final String name = name();
|
||||||
private final SynonymMap synonymMap;
|
return new TokenFilterFactory() {
|
||||||
|
@Override
|
||||||
public Factory(String name, final Analyzer analyzerForParseSynonym, Reader rulesReader) {
|
public String name() {
|
||||||
this.name = name;
|
return name;
|
||||||
|
|
||||||
try {
|
|
||||||
SynonymMap.Builder parser;
|
|
||||||
if ("wordnet".equalsIgnoreCase(format)) {
|
|
||||||
parser = new ESWordnetSynonymParser(true, expand, lenient, analyzerForParseSynonym);
|
|
||||||
((ESWordnetSynonymParser) parser).parse(rulesReader);
|
|
||||||
} else {
|
|
||||||
parser = new ESSolrSynonymParser(true, expand, lenient, analyzerForParseSynonym);
|
|
||||||
((ESSolrSynonymParser) parser).parse(rulesReader);
|
|
||||||
}
|
|
||||||
synonymMap = parser.build();
|
|
||||||
} catch (Exception e) {
|
|
||||||
throw new IllegalArgumentException("failed to build synonyms", e);
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String name() {
|
public TokenStream create(TokenStream tokenStream) {
|
||||||
return this.name;
|
return synonyms.fst == null ? tokenStream : new SynonymGraphFilter(tokenStream, synonyms, false);
|
||||||
}
|
}
|
||||||
|
};
|
||||||
@Override
|
|
||||||
public TokenStream create(TokenStream tokenStream) {
|
|
||||||
// fst is null means no synonyms
|
|
||||||
return synonymMap.fst == null ? tokenStream : new SynonymGraphFilter(tokenStream, synonymMap, false);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -31,6 +31,7 @@ import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.function.Function;
|
||||||
|
|
||||||
public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
|
public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||||
|
|
||||||
|
@ -38,6 +39,7 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||||
protected final boolean expand;
|
protected final boolean expand;
|
||||||
protected final boolean lenient;
|
protected final boolean lenient;
|
||||||
protected final Settings settings;
|
protected final Settings settings;
|
||||||
|
protected final Environment environment;
|
||||||
|
|
||||||
public SynonymTokenFilterFactory(IndexSettings indexSettings, Environment env, AnalysisRegistry analysisRegistry,
|
public SynonymTokenFilterFactory(IndexSettings indexSettings, Environment env, AnalysisRegistry analysisRegistry,
|
||||||
String name, Settings settings) throws IOException {
|
String name, Settings settings) throws IOException {
|
||||||
|
@ -53,6 +55,7 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||||
this.expand = settings.getAsBoolean("expand", true);
|
this.expand = settings.getAsBoolean("expand", true);
|
||||||
this.lenient = settings.getAsBoolean("lenient", false);
|
this.lenient = settings.getAsBoolean("lenient", false);
|
||||||
this.format = settings.get("format", "");
|
this.format = settings.get("format", "");
|
||||||
|
this.environment = env;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -60,6 +63,50 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||||
throw new IllegalStateException("Call createPerAnalyzerSynonymFactory to specialize this factory for an analysis chain first");
|
throw new IllegalStateException("Call createPerAnalyzerSynonymFactory to specialize this factory for an analysis chain first");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
|
||||||
|
List<TokenFilterFactory> previousTokenFilters,
|
||||||
|
Function<String, TokenFilterFactory> allFilters) {
|
||||||
|
final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters);
|
||||||
|
final SynonymMap synonyms = buildSynonyms(analyzer, getRulesFromSettings(environment));
|
||||||
|
final String name = name();
|
||||||
|
return new TokenFilterFactory() {
|
||||||
|
@Override
|
||||||
|
public String name() {
|
||||||
|
return name;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TokenStream create(TokenStream tokenStream) {
|
||||||
|
return synonyms.fst == null ? tokenStream : new SynonymFilter(tokenStream, synonyms, false);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
protected Analyzer buildSynonymAnalyzer(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
|
||||||
|
List<TokenFilterFactory> tokenFilters) {
|
||||||
|
return new CustomAnalyzer("synonyms", tokenizer, charFilters.toArray(new CharFilterFactory[0]),
|
||||||
|
tokenFilters.stream()
|
||||||
|
.map(TokenFilterFactory::getSynonymFilter)
|
||||||
|
.toArray(TokenFilterFactory[]::new));
|
||||||
|
}
|
||||||
|
|
||||||
|
protected SynonymMap buildSynonyms(Analyzer analyzer, Reader rules) {
|
||||||
|
try {
|
||||||
|
SynonymMap.Builder parser;
|
||||||
|
if ("wordnet".equalsIgnoreCase(format)) {
|
||||||
|
parser = new ESWordnetSynonymParser(true, expand, lenient, analyzer);
|
||||||
|
((ESWordnetSynonymParser) parser).parse(rules);
|
||||||
|
} else {
|
||||||
|
parser = new ESSolrSynonymParser(true, expand, lenient, analyzer);
|
||||||
|
((ESSolrSynonymParser) parser).parse(rules);
|
||||||
|
}
|
||||||
|
return parser.build();
|
||||||
|
} catch (Exception e) {
|
||||||
|
throw new IllegalArgumentException("failed to build synonyms", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
protected Reader getRulesFromSettings(Environment env) {
|
protected Reader getRulesFromSettings(Environment env) {
|
||||||
Reader rulesReader;
|
Reader rulesReader;
|
||||||
if (settings.getAsList("synonyms", null) != null) {
|
if (settings.getAsList("synonyms", null) != null) {
|
||||||
|
@ -77,44 +124,4 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||||
return rulesReader;
|
return rulesReader;
|
||||||
}
|
}
|
||||||
|
|
||||||
Factory createPerAnalyzerSynonymFactory(Analyzer analyzerForParseSynonym, Environment env){
|
|
||||||
return new Factory("synonym", analyzerForParseSynonym, getRulesFromSettings(env));
|
|
||||||
}
|
|
||||||
|
|
||||||
public class Factory implements TokenFilterFactory{
|
|
||||||
|
|
||||||
private final String name;
|
|
||||||
private final SynonymMap synonymMap;
|
|
||||||
|
|
||||||
public Factory(String name, Analyzer analyzerForParseSynonym, Reader rulesReader) {
|
|
||||||
|
|
||||||
this.name = name;
|
|
||||||
|
|
||||||
try {
|
|
||||||
SynonymMap.Builder parser;
|
|
||||||
if ("wordnet".equalsIgnoreCase(format)) {
|
|
||||||
parser = new ESWordnetSynonymParser(true, expand, lenient, analyzerForParseSynonym);
|
|
||||||
((ESWordnetSynonymParser) parser).parse(rulesReader);
|
|
||||||
} else {
|
|
||||||
parser = new ESSolrSynonymParser(true, expand, lenient, analyzerForParseSynonym);
|
|
||||||
((ESSolrSynonymParser) parser).parse(rulesReader);
|
|
||||||
}
|
|
||||||
synonymMap = parser.build();
|
|
||||||
} catch (Exception e) {
|
|
||||||
throw new IllegalArgumentException("failed to build synonyms", e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String name() {
|
|
||||||
return this.name;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public TokenStream create(TokenStream tokenStream) {
|
|
||||||
// fst is null means no synonyms
|
|
||||||
return synonymMap.fst == null ? tokenStream : new SynonymFilter(tokenStream, synonymMap, false);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -23,6 +23,9 @@ import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||||
import org.elasticsearch.search.fetch.subphase.highlight.FastVectorHighlighter;
|
import org.elasticsearch.search.fetch.subphase.highlight.FastVectorHighlighter;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.function.Function;
|
||||||
|
|
||||||
public interface TokenFilterFactory {
|
public interface TokenFilterFactory {
|
||||||
String name();
|
String name();
|
||||||
|
|
||||||
|
@ -36,4 +39,43 @@ public interface TokenFilterFactory {
|
||||||
default boolean breaksFastVectorHighlighter() {
|
default boolean breaksFastVectorHighlighter() {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Rewrite the TokenFilterFactory to take into account the preceding analysis chain, or refer
|
||||||
|
* to other TokenFilterFactories
|
||||||
|
* @param tokenizer the TokenizerFactory for the preceding chain
|
||||||
|
* @param charFilters any CharFilterFactories for the preceding chain
|
||||||
|
* @param previousTokenFilters a list of TokenFilterFactories in the preceding chain
|
||||||
|
* @param allFilters access to previously defined TokenFilterFactories
|
||||||
|
*/
|
||||||
|
default TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
|
||||||
|
List<TokenFilterFactory> previousTokenFilters,
|
||||||
|
Function<String, TokenFilterFactory> allFilters) {
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return a version of this TokenFilterFactory appropriate for synonym parsing
|
||||||
|
*
|
||||||
|
* Filters that should not be applied to synonyms (for example, those that produce
|
||||||
|
* multiple tokens) can return {@link #IDENTITY_FILTER}
|
||||||
|
*/
|
||||||
|
default TokenFilterFactory getSynonymFilter() {
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A TokenFilterFactory that does no filtering to its TokenStream
|
||||||
|
*/
|
||||||
|
TokenFilterFactory IDENTITY_FILTER = new TokenFilterFactory() {
|
||||||
|
@Override
|
||||||
|
public String name() {
|
||||||
|
return "identity";
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TokenStream create(TokenStream tokenStream) {
|
||||||
|
return tokenStream;
|
||||||
|
}
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
|
@ -20,7 +20,6 @@ import org.elasticsearch.index.IndexSettings;
|
||||||
import org.elasticsearch.index.analysis.AnalysisRegistry;
|
import org.elasticsearch.index.analysis.AnalysisRegistry;
|
||||||
import org.elasticsearch.index.analysis.CharFilterFactory;
|
import org.elasticsearch.index.analysis.CharFilterFactory;
|
||||||
import org.elasticsearch.index.analysis.CustomAnalyzer;
|
import org.elasticsearch.index.analysis.CustomAnalyzer;
|
||||||
import org.elasticsearch.index.analysis.CustomAnalyzerProvider;
|
|
||||||
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
||||||
import org.elasticsearch.index.analysis.TokenizerFactory;
|
import org.elasticsearch.index.analysis.TokenizerFactory;
|
||||||
import org.elasticsearch.indices.analysis.AnalysisModule;
|
import org.elasticsearch.indices.analysis.AnalysisModule;
|
||||||
|
@ -217,6 +216,8 @@ public class CategorizationAnalyzer implements Closeable {
|
||||||
Tuple<String, TokenizerFactory> tokenizerFactory,
|
Tuple<String, TokenizerFactory> tokenizerFactory,
|
||||||
List<CharFilterFactory> charFilterFactoryList) throws IOException {
|
List<CharFilterFactory> charFilterFactoryList) throws IOException {
|
||||||
List<CategorizationAnalyzerConfig.NameOrDefinition> tokenFilters = config.getTokenFilters();
|
List<CategorizationAnalyzerConfig.NameOrDefinition> tokenFilters = config.getTokenFilters();
|
||||||
|
TransportAnalyzeAction.DeferredTokenFilterRegistry deferredRegistry
|
||||||
|
= new TransportAnalyzeAction.DeferredTokenFilterRegistry(analysisRegistry, null);
|
||||||
final List<TokenFilterFactory> tokenFilterFactoryList = new ArrayList<>();
|
final List<TokenFilterFactory> tokenFilterFactoryList = new ArrayList<>();
|
||||||
for (CategorizationAnalyzerConfig.NameOrDefinition tokenFilter : tokenFilters) {
|
for (CategorizationAnalyzerConfig.NameOrDefinition tokenFilter : tokenFilters) {
|
||||||
TokenFilterFactory tokenFilterFactory;
|
TokenFilterFactory tokenFilterFactory;
|
||||||
|
@ -241,8 +242,8 @@ public class CategorizationAnalyzer implements Closeable {
|
||||||
// Need to set anonymous "name" of token_filter
|
// Need to set anonymous "name" of token_filter
|
||||||
tokenFilterFactory = tokenFilterFactoryFactory.get(buildDummyIndexSettings(settings), environment, "_anonymous_tokenfilter",
|
tokenFilterFactory = tokenFilterFactoryFactory.get(buildDummyIndexSettings(settings), environment, "_anonymous_tokenfilter",
|
||||||
settings);
|
settings);
|
||||||
tokenFilterFactory = CustomAnalyzerProvider.checkAndApplySynonymFilter(tokenFilterFactory, tokenizerFactory.v1(),
|
tokenFilterFactory = tokenFilterFactory.getChainAwareTokenFilterFactory(tokenizerFactory.v2(),
|
||||||
tokenizerFactory.v2(), tokenFilterFactoryList, charFilterFactoryList, environment);
|
charFilterFactoryList, tokenFilterFactoryList, deferredRegistry);
|
||||||
}
|
}
|
||||||
if (tokenFilterFactory == null) {
|
if (tokenFilterFactory == null) {
|
||||||
throw new IllegalArgumentException("Failed to find or create token filter [" + tokenFilter + "]");
|
throw new IllegalArgumentException("Failed to find or create token filter [" + tokenFilter + "]");
|
||||||
|
|
Loading…
Reference in New Issue