Allow TokenFilterFactories to rewrite themselves against their preceding chain (#33702)

We currently special-case SynonymFilterFactory and SynonymGraphFilterFactory, which need to 
know their predecessors in the analysis chain in order to correctly analyze their synonym lists. This
special-casing doesn't work with Referring filter factories, such as the Multiplexer or Conditional
filters. We also have a number of filters (eg the Multiplexer) that will break synonyms when they
appear before them in a chain, because they produce multiple tokens at the same position.

This commit adds two methods to the TokenFilterFactory interface.

* `getChainAwareTokenFilterFactory()` allows a filter factory to rewrite itself against its preceding
  filter chain, or to resolve references to other filters. It replaces `ReferringFilterFactory` and
  `CustomAnalyzerProvider.checkAndApplySynonymFilter`, and by default returns `this`.
* `getSynonymFilter()` returns the version of a filter that should be applied when building a synonym
  list `Analyzer`. By default it returns `this`; filters that should not be applied to synonyms can
  return `IDENTITY_FILTER` instead.

Fixes #33609
This commit is contained in:
Alan Woodward 2018-09-19 15:52:14 +01:00 committed by GitHub
parent 4190a9f1e9
commit 5107949402
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 271 additions and 241 deletions

View File

@ -114,3 +114,11 @@ And it'd respond:
<1> The stemmer has also emitted a token `home` at position 1, but because it is a
duplicate of this token it has been removed from the token stream
NOTE: The synonym and synonym_graph filters use their preceding analysis chain to
parse and analyse their synonym lists, and ignore any token filters in the chain
that produce multiple tokens at the same position. This means that any filters
within the multiplexer will be ignored for the purpose of synonyms. If you want to
use filters contained within the multiplexer for parsing synonyms (for example, to
apply stemming to the synonym lists), then you should append the synonym filter
to the relevant multiplexer filter list.

View File

@ -29,39 +29,64 @@ import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.ReferringFilterFactory;
import org.elasticsearch.index.analysis.CharFilterFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.index.analysis.TokenizerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory implements ReferringFilterFactory {
public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory {
private List<TokenFilterFactory> filters;
private List<String> filterNames;
private final boolean preserveOriginal;
private static final TokenFilterFactory IDENTITY_FACTORY = new TokenFilterFactory() {
@Override
public String name() {
return "identity";
}
@Override
public TokenStream create(TokenStream tokenStream) {
return tokenStream;
}
};
/**
 * Creates the factory from its settings.
 *
 * Reads the {@code filters} list (each entry names a filter, or a comma-separated
 * chain of filters) and the {@code preserve_original} flag, which defaults to
 * {@code true}.
 */
public MultiplexerTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) throws IOException {
    super(indexSettings, name, settings);
    this.preserveOriginal = settings.getAsBoolean("preserve_original", true);
    this.filterNames = settings.getAsList("filters");
}
/**
* Always fails: this factory cannot build a token stream directly. The factory
* returned by {@code getChainAwareTokenFilterFactory} has to be used instead.
*
* @throws UnsupportedOperationException on every call
*/
@Override
public TokenStream create(TokenStream tokenStream) {
throw new UnsupportedOperationException("TokenFilterFactory.getChainAwareTokenFilterFactory() must be called first");
}
@Override
public TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
List<TokenFilterFactory> previousTokenFilters,
Function<String, TokenFilterFactory> allFilters) {
List<TokenFilterFactory> filters = new ArrayList<>();
if (preserveOriginal) {
filters.add(IDENTITY_FILTER);
}
for (String filter : filterNames) {
String[] parts = Strings.tokenizeToStringArray(filter, ",");
if (parts.length == 1) {
TokenFilterFactory factory = resolveFilterFactory(allFilters, parts[0]);
factory = factory.getChainAwareTokenFilterFactory(tokenizer, charFilters, previousTokenFilters, allFilters);
filters.add(factory);
} else {
List<TokenFilterFactory> existingChain = new ArrayList<>(previousTokenFilters);
List<TokenFilterFactory> chain = new ArrayList<>();
for (String subfilter : parts) {
TokenFilterFactory factory = resolveFilterFactory(allFilters, subfilter);
factory = factory.getChainAwareTokenFilterFactory(tokenizer, charFilters, existingChain, allFilters);
chain.add(factory);
existingChain.add(factory);
}
filters.add(chainFilters(filter, chain));
}
}
return new TokenFilterFactory() {
@Override
public String name() {
return MultiplexerTokenFilterFactory.this.name();
}
@Override
public TokenStream create(TokenStream tokenStream) {
List<Function<TokenStream, TokenStream>> functions = new ArrayList<>();
@ -72,23 +97,10 @@ public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory im
}
@Override
public void setReferences(Map<String, TokenFilterFactory> factories) {
filters = new ArrayList<>();
if (preserveOriginal) {
filters.add(IDENTITY_FACTORY);
}
for (String filter : filterNames) {
String[] parts = Strings.tokenizeToStringArray(filter, ",");
if (parts.length == 1) {
filters.add(resolveFilterFactory(factories, parts[0]));
} else {
List<TokenFilterFactory> chain = new ArrayList<>();
for (String subfilter : parts) {
chain.add(resolveFilterFactory(factories, subfilter));
}
filters.add(chainFilters(filter, chain));
}
public TokenFilterFactory getSynonymFilter() {
return IDENTITY_FILTER;
}
};
}
private TokenFilterFactory chainFilters(String name, List<TokenFilterFactory> filters) {
@ -108,11 +120,12 @@ public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory im
};
}
private TokenFilterFactory resolveFilterFactory(Map<String, TokenFilterFactory> factories, String name) {
if (factories.containsKey(name) == false) {
private TokenFilterFactory resolveFilterFactory(Function<String, TokenFilterFactory> factories, String name) {
TokenFilterFactory factory = factories.apply(name);
if (factory == null) {
throw new IllegalArgumentException("Multiplexing filter [" + name() + "] refers to undefined tokenfilter [" + name + "]");
} else {
return factories.get(name);
return factory;
}
}

View File

@ -24,26 +24,24 @@ import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.ReferringFilterFactory;
import org.elasticsearch.index.analysis.CharFilterFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.index.analysis.TokenizerFactory;
import org.elasticsearch.script.Script;
import org.elasticsearch.script.ScriptService;
import org.elasticsearch.script.ScriptType;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
/**
* A factory for a conditional token filter that only applies child filters if the underlying token
* matches an {@link AnalysisPredicateScript}
*/
public class ScriptedConditionTokenFilterFactory extends AbstractTokenFilterFactory implements ReferringFilterFactory {
public class ScriptedConditionTokenFilterFactory extends AbstractTokenFilterFactory {
private final AnalysisPredicateScript.Factory factory;
private final List<TokenFilterFactory> filters = new ArrayList<>();
private final List<String> filterNames;
ScriptedConditionTokenFilterFactory(IndexSettings indexSettings, String name,
@ -63,6 +61,34 @@ public class ScriptedConditionTokenFilterFactory extends AbstractTokenFilterFact
}
}
/**
* Always fails: this factory cannot build a token stream directly. The factory
* returned by {@code getChainAwareTokenFilterFactory} has to be used instead.
*
* @throws UnsupportedOperationException on every call
*/
@Override
public TokenStream create(TokenStream tokenStream) {
throw new UnsupportedOperationException("getChainAwareTokenFilterFactory should be called first");
}
@Override
public TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
List<TokenFilterFactory> previousTokenFilters,
Function<String, TokenFilterFactory> allFilters) {
List<TokenFilterFactory> filters = new ArrayList<>();
List<TokenFilterFactory> existingChain = new ArrayList<>(previousTokenFilters);
for (String filter : filterNames) {
TokenFilterFactory tff = allFilters.apply(filter);
if (tff == null) {
throw new IllegalArgumentException("ScriptedConditionTokenFilter [" + name() +
"] refers to undefined token filter [" + filter + "]");
}
tff = tff.getChainAwareTokenFilterFactory(tokenizer, charFilters, existingChain, allFilters);
filters.add(tff);
existingChain.add(tff);
}
return new TokenFilterFactory() {
@Override
public String name() {
return ScriptedConditionTokenFilterFactory.this.name();
}
@Override
public TokenStream create(TokenStream tokenStream) {
Function<TokenStream, TokenStream> filter = in -> {
@ -73,6 +99,8 @@ public class ScriptedConditionTokenFilterFactory extends AbstractTokenFilterFact
};
return new ScriptedConditionTokenFilter(tokenStream, filter, factory.newInstance());
}
};
}
private static class ScriptedConditionTokenFilter extends ConditionalTokenFilter {
@ -87,22 +115,10 @@ public class ScriptedConditionTokenFilterFactory extends AbstractTokenFilterFact
}
@Override
protected boolean shouldFilter() throws IOException {
protected boolean shouldFilter() {
token.updatePosition();
return script.execute(token);
}
}
@Override
public void setReferences(Map<String, TokenFilterFactory> factories) {
for (String filter : filterNames) {
TokenFilterFactory tff = factories.get(filter);
if (tff == null) {
throw new IllegalArgumentException("ScriptedConditionTokenFilter [" + name() +
"] refers to undefined token filter [" + filter + "]");
}
filters.add(tff);
}
}
}

View File

@ -20,6 +20,7 @@
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.elasticsearch.Version;
@ -117,6 +118,26 @@ public class SynonymsAnalysisTests extends ESTestCase {
}
}
/**
 * Checks the output of an analyzer whose multiplexer filter contains a single
 * branch that chains a stemmer with a synonym filter.
 */
public void testSynonymsWithMultiplexer() throws IOException {
    Settings.Builder config = Settings.builder();
    config.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT);
    config.put("path.home", createTempDir().toString());

    // the two filters that get chained inside the multiplexer
    config.put("index.analysis.filter.synonyms.type", "synonym");
    config.putList("index.analysis.filter.synonyms.synonyms", "programmer, developer");
    config.put("index.analysis.filter.my_english.type", "stemmer");
    config.put("index.analysis.filter.my_english.language", "porter2");

    // one multiplexer branch: my_english followed by synonyms
    config.put("index.analysis.filter.stem_repeat.type", "multiplexer");
    config.putList("index.analysis.filter.stem_repeat.filters", "my_english, synonyms");

    config.put("index.analysis.analyzer.synonymAnalyzer.tokenizer", "standard");
    config.putList("index.analysis.analyzer.synonymAnalyzer.filter", "lowercase", "stem_repeat");
    Settings analysisSettings = config.build();

    IndexSettings indexConfig = IndexSettingsModule.newIndexSettings("index", analysisSettings);
    indexAnalyzers = createTestAnalysis(indexConfig, analysisSettings, new CommonAnalysisPlugin()).indexAnalyzers;

    String[] expectedTokens = new String[]{ "some", "developers", "develop", "programm", "are", "odd" };
    int[] expectedPositionIncrements = new int[]{ 1, 1, 0, 0, 1, 1 };
    BaseTokenStreamTestCase.assertAnalyzesTo(indexAnalyzers.get("synonymAnalyzer"), "Some developers are odd",
        expectedTokens, expectedPositionIncrements);
}
private void match(String analyzerName, String source, String target) throws IOException {
Analyzer analyzer = indexAnalyzers.get(analyzerName).analyzer();

View File

@ -48,11 +48,9 @@ import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AnalysisRegistry;
import org.elasticsearch.index.analysis.CharFilterFactory;
import org.elasticsearch.index.analysis.CustomAnalyzer;
import org.elasticsearch.index.analysis.CustomAnalyzerProvider;
import org.elasticsearch.index.analysis.IndexAnalyzers;
import org.elasticsearch.index.analysis.MultiTermAwareComponent;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.index.analysis.ReferringFilterFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.index.analysis.TokenizerFactory;
import org.elasticsearch.index.mapper.KeywordFieldMapper;
@ -66,6 +64,7 @@ import org.elasticsearch.transport.TransportService;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.io.UncheckedIOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
@ -73,6 +72,7 @@ import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.function.Function;
/**
* Transport action used to execute analyze requests
@ -571,11 +571,48 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
return charFilterFactoryList;
}
public static class DeferredTokenFilterRegistry implements Function<String, TokenFilterFactory> {
private final AnalysisRegistry analysisRegistry;
private final IndexSettings indexSettings;
Map<String, TokenFilterFactory> prebuiltFilters;
public DeferredTokenFilterRegistry(AnalysisRegistry analysisRegistry, IndexSettings indexSettings) {
this.analysisRegistry = analysisRegistry;
if (indexSettings == null) {
// Settings are null when _analyze is called with no index name, so
// we create dummy settings which will make prebuilt analysis components
// available
Settings settings = Settings.builder()
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
.put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0)
.put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
.put(IndexMetaData.SETTING_INDEX_UUID, UUIDs.randomBase64UUID())
.build();
IndexMetaData metaData = IndexMetaData.builder(IndexMetaData.INDEX_UUID_NA_VALUE).settings(settings).build();
indexSettings = new IndexSettings(metaData, Settings.EMPTY);
}
this.indexSettings = indexSettings;
}
@Override
public TokenFilterFactory apply(String s) {
if (prebuiltFilters == null) {
try {
prebuiltFilters = analysisRegistry.buildTokenFilterFactories(indexSettings);
} catch (IOException e) {
throw new UncheckedIOException(e);
}
}
return prebuiltFilters.get(s);
}
}
private static List<TokenFilterFactory> parseTokenFilterFactories(AnalyzeRequest request, IndexSettings indexSettings, AnalysisRegistry analysisRegistry,
Environment environment, Tuple<String, TokenizerFactory> tokenizerFactory,
List<CharFilterFactory> charFilterFactoryList, boolean normalizer) throws IOException {
List<TokenFilterFactory> tokenFilterFactoryList = new ArrayList<>();
List<ReferringFilterFactory> referringFilters = new ArrayList<>();
DeferredTokenFilterRegistry deferredRegistry = new DeferredTokenFilterRegistry(analysisRegistry, indexSettings);
if (request.tokenFilters() != null && request.tokenFilters().size() > 0) {
List<AnalyzeRequest.NameOrDefinition> tokenFilters = request.tokenFilters();
for (AnalyzeRequest.NameOrDefinition tokenFilter : tokenFilters) {
@ -594,11 +631,8 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
}
// Need to set anonymous "name" of tokenfilter
tokenFilterFactory = tokenFilterFactoryFactory.get(getNaIndexSettings(settings), environment, "_anonymous_tokenfilter", settings);
tokenFilterFactory = CustomAnalyzerProvider.checkAndApplySynonymFilter(tokenFilterFactory, tokenizerFactory.v1(), tokenizerFactory.v2(), tokenFilterFactoryList,
charFilterFactoryList, environment);
if (tokenFilterFactory instanceof ReferringFilterFactory) {
referringFilters.add((ReferringFilterFactory)tokenFilterFactory);
}
tokenFilterFactory = tokenFilterFactory.getChainAwareTokenFilterFactory(tokenizerFactory.v2(), charFilterFactoryList,
tokenFilterFactoryList, deferredRegistry);
} else {
AnalysisModule.AnalysisProvider<TokenFilterFactory> tokenFilterFactoryFactory;
@ -616,8 +650,8 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
Settings settings = AnalysisRegistry.getSettingsFromIndexSettings(indexSettings,
AnalysisRegistry.INDEX_ANALYSIS_FILTER + "." + tokenFilter.name);
tokenFilterFactory = tokenFilterFactoryFactory.get(indexSettings, environment, tokenFilter.name, settings);
tokenFilterFactory = CustomAnalyzerProvider.checkAndApplySynonymFilter(tokenFilterFactory, tokenizerFactory.v1(), tokenizerFactory.v2(), tokenFilterFactoryList,
charFilterFactoryList, environment);
tokenFilterFactory = tokenFilterFactory.getChainAwareTokenFilterFactory(tokenizerFactory.v2(), charFilterFactoryList,
tokenFilterFactoryList, deferredRegistry);
}
}
if (tokenFilterFactory == null) {
@ -633,26 +667,6 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
tokenFilterFactoryList.add(tokenFilterFactory);
}
}
if (referringFilters.isEmpty() == false) {
// The request included at least one custom referring tokenfilter that has not already been built by the
// analysis registry, so we need to set its references. Note that this will only apply pre-built
// tokenfilters
if (indexSettings == null) {
Settings settings = Settings.builder()
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
.put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0)
.put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
.put(IndexMetaData.SETTING_INDEX_UUID, UUIDs.randomBase64UUID())
.build();
IndexMetaData metaData = IndexMetaData.builder(IndexMetaData.INDEX_UUID_NA_VALUE).settings(settings).build();
indexSettings = new IndexSettings(metaData, Settings.EMPTY);
}
Map<String, TokenFilterFactory> prebuiltFilters = analysisRegistry.buildTokenFilterFactories(indexSettings);
for (ReferringFilterFactory rff : referringFilters) {
rff.setReferences(prebuiltFilters);
}
}
return tokenFilterFactoryList;
}

View File

@ -167,17 +167,7 @@ public final class AnalysisRegistry implements Closeable {
tokenFilters.put("synonym", requiresAnalysisSettings((is, env, name, settings) -> new SynonymTokenFilterFactory(is, env, this, name, settings)));
tokenFilters.put("synonym_graph", requiresAnalysisSettings((is, env, name, settings) -> new SynonymGraphTokenFilterFactory(is, env, this, name, settings)));
Map<String, TokenFilterFactory> mappings
= buildMapping(Component.FILTER, indexSettings, tokenFiltersSettings, Collections.unmodifiableMap(tokenFilters), prebuiltAnalysis.preConfiguredTokenFilters);
// ReferringTokenFilters require references to other tokenfilters, so we pass these in
// after all factories have been registered
for (TokenFilterFactory tff : mappings.values()) {
if (tff instanceof ReferringFilterFactory) {
((ReferringFilterFactory)tff).setReferences(mappings);
}
}
return mappings;
return buildMapping(Component.FILTER, indexSettings, tokenFiltersSettings, Collections.unmodifiableMap(tokenFilters), prebuiltAnalysis.preConfiguredTokenFilters);
}
public Map<String, TokenizerFactory> buildTokenizerFactories(IndexSettings indexSettings) throws IOException {

View File

@ -81,9 +81,7 @@ public class CustomAnalyzerProvider extends AbstractIndexAnalyzerProvider<Custom
if (tokenFilter == null) {
throw new IllegalArgumentException("Custom Analyzer [" + name() + "] failed to find filter under name [" + tokenFilterName + "]");
}
// no need offsetGap for tokenize synonyms
tokenFilter = checkAndApplySynonymFilter(tokenFilter, tokenizerName, tokenizer, tokenFilterList, charFiltersList,
this.environment);
tokenFilter = tokenFilter.getChainAwareTokenFilterFactory(tokenizer, charFiltersList, tokenFilterList, tokenFilters::get);
tokenFilterList.add(tokenFilter);
}
@ -95,33 +93,6 @@ public class CustomAnalyzerProvider extends AbstractIndexAnalyzerProvider<Custom
);
}
public static TokenFilterFactory checkAndApplySynonymFilter(TokenFilterFactory tokenFilter, String tokenizerName, TokenizerFactory tokenizer,
List<TokenFilterFactory> tokenFilterList,
List<CharFilterFactory> charFiltersList, Environment env) {
if (tokenFilter instanceof SynonymGraphTokenFilterFactory) {
List<TokenFilterFactory> tokenFiltersListForSynonym = new ArrayList<>(tokenFilterList);
try (CustomAnalyzer analyzer = new CustomAnalyzer(tokenizerName, tokenizer,
charFiltersList.toArray(new CharFilterFactory[charFiltersList.size()]),
tokenFiltersListForSynonym.toArray(new TokenFilterFactory[tokenFiltersListForSynonym.size()]),
TextFieldMapper.Defaults.POSITION_INCREMENT_GAP,
-1)){
tokenFilter = ((SynonymGraphTokenFilterFactory) tokenFilter).createPerAnalyzerSynonymGraphFactory(analyzer, env);
}
} else if (tokenFilter instanceof SynonymTokenFilterFactory) {
List<TokenFilterFactory> tokenFiltersListForSynonym = new ArrayList<>(tokenFilterList);
try (CustomAnalyzer analyzer = new CustomAnalyzer(tokenizerName, tokenizer,
charFiltersList.toArray(new CharFilterFactory[charFiltersList.size()]),
tokenFiltersListForSynonym.toArray(new TokenFilterFactory[tokenFiltersListForSynonym.size()]),
TextFieldMapper.Defaults.POSITION_INCREMENT_GAP,
-1)) {
tokenFilter = ((SynonymTokenFilterFactory) tokenFilter).createPerAnalyzerSynonymFactory(analyzer, env);
}
}
return tokenFilter;
}
@Override
public CustomAnalyzer get() {
return this.customAnalyzer;

View File

@ -1,37 +0,0 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import java.util.Map;
/**
* Marks a {@link TokenFilterFactory} that refers to other filter factories.
*
* The analysis registry will call {@link #setReferences(Map)} with a map of all
* available TokenFilterFactories after all factories have been registered
*/
public interface ReferringFilterFactory {
/**
* Called with a map of all registered filter factories
*/
void setReferences(Map<String, TokenFilterFactory> factories);
}

View File

@ -28,9 +28,11 @@ import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import java.io.IOException;
import java.io.Reader;
import java.util.List;
import java.util.function.Function;
public class SynonymGraphTokenFilterFactory extends SynonymTokenFilterFactory {
public SynonymGraphTokenFilterFactory(IndexSettings indexSettings, Environment env, AnalysisRegistry analysisRegistry,
String name, Settings settings) throws IOException {
super(indexSettings, env, analysisRegistry, name, settings);
@ -41,42 +43,24 @@ public class SynonymGraphTokenFilterFactory extends SynonymTokenFilterFactory {
throw new IllegalStateException("Call createPerAnalyzerSynonymGraphFactory to specialize this factory for an analysis chain first");
}
Factory createPerAnalyzerSynonymGraphFactory(Analyzer analyzerForParseSynonym, Environment env){
return new Factory("synonymgraph", analyzerForParseSynonym, getRulesFromSettings(env));
}
public class Factory implements TokenFilterFactory{
private final String name;
private final SynonymMap synonymMap;
public Factory(String name, final Analyzer analyzerForParseSynonym, Reader rulesReader) {
this.name = name;
try {
SynonymMap.Builder parser;
if ("wordnet".equalsIgnoreCase(format)) {
parser = new ESWordnetSynonymParser(true, expand, lenient, analyzerForParseSynonym);
((ESWordnetSynonymParser) parser).parse(rulesReader);
} else {
parser = new ESSolrSynonymParser(true, expand, lenient, analyzerForParseSynonym);
((ESSolrSynonymParser) parser).parse(rulesReader);
}
synonymMap = parser.build();
} catch (Exception e) {
throw new IllegalArgumentException("failed to build synonyms", e);
}
}
@Override
public TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
List<TokenFilterFactory> previousTokenFilters,
Function<String, TokenFilterFactory> allFilters) {
final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters);
final SynonymMap synonyms = buildSynonyms(analyzer, getRulesFromSettings(environment));
final String name = name();
return new TokenFilterFactory() {
@Override
public String name() {
return this.name;
return name;
}
@Override
public TokenStream create(TokenStream tokenStream) {
// fst is null means no synonyms
return synonymMap.fst == null ? tokenStream : new SynonymGraphFilter(tokenStream, synonymMap, false);
return synonyms.fst == null ? tokenStream : new SynonymGraphFilter(tokenStream, synonyms, false);
}
};
}
}

View File

@ -31,6 +31,7 @@ import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.List;
import java.util.function.Function;
public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
@ -38,6 +39,7 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
protected final boolean expand;
protected final boolean lenient;
protected final Settings settings;
protected final Environment environment;
public SynonymTokenFilterFactory(IndexSettings indexSettings, Environment env, AnalysisRegistry analysisRegistry,
String name, Settings settings) throws IOException {
@ -53,6 +55,7 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
this.expand = settings.getAsBoolean("expand", true);
this.lenient = settings.getAsBoolean("lenient", false);
this.format = settings.get("format", "");
this.environment = env;
}
@Override
@ -60,6 +63,50 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
throw new IllegalStateException("Call createPerAnalyzerSynonymFactory to specialize this factory for an analysis chain first");
}
/**
 * Specialises this synonym filter for the analysis chain that precedes it.
 *
 * The synonym rules are parsed with an analyzer built from the given tokenizer, char
 * filters and preceding token filters, and the resulting synonym map is captured by
 * the returned factory.
 *
 * @param tokenizer the TokenizerFactory for the preceding chain
 * @param charFilters any CharFilterFactories for the preceding chain
 * @param previousTokenFilters the TokenFilterFactories that precede this filter
 * @param allFilters access to other defined TokenFilterFactories (not used here)
 * @return a factory carrying this filter's name that applies the parsed synonyms, or
 *         returns the stream unchanged when the synonym map is empty
 * @throws IllegalArgumentException if the synonym rules cannot be built
 */
@Override
public TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
                                                          List<TokenFilterFactory> previousTokenFilters,
                                                          Function<String, TokenFilterFactory> allFilters) {
    final SynonymMap synonyms;
    // The analyzer is only needed while the rules are parsed. Close it once the map
    // is built (or parsing fails) instead of leaving it open.
    try (Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters)) {
        synonyms = buildSynonyms(analyzer, getRulesFromSettings(environment));
    }
    final String name = name();
    return new TokenFilterFactory() {
        @Override
        public String name() {
            return name;
        }

        @Override
        public TokenStream create(TokenStream tokenStream) {
            // fst is null means no synonyms
            return synonyms.fst == null ? tokenStream : new SynonymFilter(tokenStream, synonyms, false);
        }
    };
}
/**
 * Builds the analyzer named "synonyms" from the given chain, replacing each token
 * filter with whatever its {@code getSynonymFilter()} returns.
 *
 * @param tokenizer the tokenizer of the chain
 * @param charFilters the char filters of the chain
 * @param tokenFilters the token filters of the chain
 * @return the analyzer assembled from those components
 */
protected Analyzer buildSynonymAnalyzer(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
                                        List<TokenFilterFactory> tokenFilters) {
    final CharFilterFactory[] charFilterArray = charFilters.toArray(new CharFilterFactory[0]);
    final TokenFilterFactory[] synonymFilters = new TokenFilterFactory[tokenFilters.size()];
    int slot = 0;
    for (TokenFilterFactory candidate : tokenFilters) {
        synonymFilters[slot++] = candidate.getSynonymFilter();
    }
    return new CustomAnalyzer("synonyms", tokenizer, charFilterArray, synonymFilters);
}
/**
 * Parses the synonym rules into a {@link SynonymMap}.
 *
 * A wordnet parser is used when the configured format equals "wordnet"
 * (case-insensitively); otherwise the Solr-format parser is used.
 *
 * @param analyzer the analyzer handed to the parser
 * @param rules the reader supplying the synonym rules
 * @return the built synonym map
 * @throws IllegalArgumentException if parsing or building fails for any reason
 */
protected SynonymMap buildSynonyms(Analyzer analyzer, Reader rules) {
    try {
        if ("wordnet".equalsIgnoreCase(format)) {
            ESWordnetSynonymParser wordnetParser = new ESWordnetSynonymParser(true, expand, lenient, analyzer);
            wordnetParser.parse(rules);
            return wordnetParser.build();
        }
        ESSolrSynonymParser solrParser = new ESSolrSynonymParser(true, expand, lenient, analyzer);
        solrParser.parse(rules);
        return solrParser.build();
    } catch (Exception e) {
        throw new IllegalArgumentException("failed to build synonyms", e);
    }
}
protected Reader getRulesFromSettings(Environment env) {
Reader rulesReader;
if (settings.getAsList("synonyms", null) != null) {
@ -77,44 +124,4 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
return rulesReader;
}
Factory createPerAnalyzerSynonymFactory(Analyzer analyzerForParseSynonym, Environment env){
return new Factory("synonym", analyzerForParseSynonym, getRulesFromSettings(env));
}
public class Factory implements TokenFilterFactory{
private final String name;
private final SynonymMap synonymMap;
public Factory(String name, Analyzer analyzerForParseSynonym, Reader rulesReader) {
this.name = name;
try {
SynonymMap.Builder parser;
if ("wordnet".equalsIgnoreCase(format)) {
parser = new ESWordnetSynonymParser(true, expand, lenient, analyzerForParseSynonym);
((ESWordnetSynonymParser) parser).parse(rulesReader);
} else {
parser = new ESSolrSynonymParser(true, expand, lenient, analyzerForParseSynonym);
((ESSolrSynonymParser) parser).parse(rulesReader);
}
synonymMap = parser.build();
} catch (Exception e) {
throw new IllegalArgumentException("failed to build synonyms", e);
}
}
@Override
public String name() {
return this.name;
}
@Override
public TokenStream create(TokenStream tokenStream) {
// fst is null means no synonyms
return synonymMap.fst == null ? tokenStream : new SynonymFilter(tokenStream, synonymMap, false);
}
}
}

View File

@ -23,6 +23,9 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.elasticsearch.search.fetch.subphase.highlight.FastVectorHighlighter;
import java.util.List;
import java.util.function.Function;
public interface TokenFilterFactory {
String name();
@ -36,4 +39,43 @@ public interface TokenFilterFactory {
default boolean breaksFastVectorHighlighter() {
return false;
}
/**
* Rewrite the TokenFilterFactory to take into account the preceding analysis chain, or refer
* to other TokenFilterFactories
*
* The default implementation returns this factory unchanged.
*
* @param tokenizer the TokenizerFactory for the preceding chain
* @param charFilters any CharFilterFactories for the preceding chain
* @param previousTokenFilters a list of TokenFilterFactories in the preceding chain
* @param allFilters access to previously defined TokenFilterFactories
* @return the rewritten TokenFilterFactory; by default, {@code this}
*/
default TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
List<TokenFilterFactory> previousTokenFilters,
Function<String, TokenFilterFactory> allFilters) {
return this;
}
/**
* Return a version of this TokenFilterFactory appropriate for synonym parsing
*
* Filters that should not be applied to synonyms (for example, those that produce
* multiple tokens) can return {@link #IDENTITY_FILTER}
*
* @return the factory to use for synonym parsing; by default, {@code this}
*/
default TokenFilterFactory getSynonymFilter() {
return this;
}
/**
* A TokenFilterFactory that does no filtering to its TokenStream
*
* Its name is "identity" and {@code create} returns the stream it is given.
*/
TokenFilterFactory IDENTITY_FILTER = new TokenFilterFactory() {
@Override
public String name() {
return "identity";
}
@Override
public TokenStream create(TokenStream tokenStream) {
// hand the incoming stream back untouched
return tokenStream;
}
};
}

View File

@ -20,7 +20,6 @@ import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AnalysisRegistry;
import org.elasticsearch.index.analysis.CharFilterFactory;
import org.elasticsearch.index.analysis.CustomAnalyzer;
import org.elasticsearch.index.analysis.CustomAnalyzerProvider;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.index.analysis.TokenizerFactory;
import org.elasticsearch.indices.analysis.AnalysisModule;
@ -217,6 +216,8 @@ public class CategorizationAnalyzer implements Closeable {
Tuple<String, TokenizerFactory> tokenizerFactory,
List<CharFilterFactory> charFilterFactoryList) throws IOException {
List<CategorizationAnalyzerConfig.NameOrDefinition> tokenFilters = config.getTokenFilters();
TransportAnalyzeAction.DeferredTokenFilterRegistry deferredRegistry
= new TransportAnalyzeAction.DeferredTokenFilterRegistry(analysisRegistry, null);
final List<TokenFilterFactory> tokenFilterFactoryList = new ArrayList<>();
for (CategorizationAnalyzerConfig.NameOrDefinition tokenFilter : tokenFilters) {
TokenFilterFactory tokenFilterFactory;
@ -241,8 +242,8 @@ public class CategorizationAnalyzer implements Closeable {
// Need to set anonymous "name" of token_filter
tokenFilterFactory = tokenFilterFactoryFactory.get(buildDummyIndexSettings(settings), environment, "_anonymous_tokenfilter",
settings);
tokenFilterFactory = CustomAnalyzerProvider.checkAndApplySynonymFilter(tokenFilterFactory, tokenizerFactory.v1(),
tokenizerFactory.v2(), tokenFilterFactoryList, charFilterFactoryList, environment);
tokenFilterFactory = tokenFilterFactory.getChainAwareTokenFilterFactory(tokenizerFactory.v2(),
charFilterFactoryList, tokenFilterFactoryList, deferredRegistry);
}
if (tokenFilterFactory == null) {
throw new IllegalArgumentException("Failed to find or create token filter [" + tokenFilter + "]");