Parse synonyms with the same analysis chain (#8049)
* [Analysis] Parse synonyms with the same analysis chain. The Synonym Token Filter and Synonym Graph Filter now tokenize synonyms with whatever tokenizer and token filters appear before them in the chain. Closes #7199
This commit is contained in:
parent 3261586cac
commit 62d1969595
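The idea, in miniature: the synonym rules themselves are run through the components that precede the synonym filter, so rules and input text are normalized the same way. The sketch below is not the commit's code; the whitespace-plus-lowercase chain, the rule, and the input are assumed examples built from the same Lucene classes the commit uses.

    import java.io.StringReader;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.LowerCaseFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.analysis.synonym.SolrSynonymParser;
    import org.apache.lucene.analysis.synonym.SynonymFilter;
    import org.apache.lucene.analysis.synonym.SynonymMap;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    public class SynonymChainSketch {
        public static void main(String[] args) throws Exception {
            // Stand-in for "everything before the synonym filter in the chain":
            // whitespace tokenizer + lowercase filter (an assumed example chain).
            Analyzer chain = new Analyzer() {
                @Override
                protected TokenStreamComponents createComponents(String fieldName) {
                    Tokenizer tokenizer = new WhitespaceTokenizer();
                    return new TokenStreamComponents(tokenizer, new LowerCaseFilter(tokenizer));
                }
            };

            // The rule is written with an uppercase K, but it still matches the
            // lowercase input below, because the rules are analyzed with `chain`.
            SolrSynonymParser parser = new SolrSynonymParser(true, true, chain);
            parser.parse(new StringReader("Kimchy => shay"));
            SynonymMap map = parser.build();

            Tokenizer input = new WhitespaceTokenizer();
            input.setReader(new StringReader("kimchy"));
            try (TokenStream stream = new SynonymFilter(new LowerCaseFilter(input), map, false)) {
                CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
                stream.reset();
                while (stream.incrementToken()) {
                    System.out.println(term); // prints: shay
                }
                stream.end();
            }
        }
    }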
@@ -49,6 +49,7 @@ import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AnalysisRegistry;
import org.elasticsearch.index.analysis.CharFilterFactory;
import org.elasticsearch.index.analysis.CustomAnalyzer;
import org.elasticsearch.index.analysis.CustomAnalyzerProvider;
import org.elasticsearch.index.analysis.IndexAnalyzers;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.index.analysis.TokenFilterFactory;
@@ -183,13 +184,14 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
Tuple<String, TokenizerFactory> tokenizerFactory = parseTokenizerFactory(request, indexAnalyzers,
analysisRegistry, environment);

TokenFilterFactory[] tokenFilterFactories = new TokenFilterFactory[0];
tokenFilterFactories = getTokenFilterFactories(request, indexSettings, analysisRegistry, environment, tokenFilterFactories);
List<CharFilterFactory> charFilterFactoryList = parseCharFilterFactories(request, indexSettings, analysisRegistry, environment);

CharFilterFactory[] charFilterFactories = new CharFilterFactory[0];
charFilterFactories = getCharFilterFactories(request, indexSettings, analysisRegistry, environment, charFilterFactories);
List<TokenFilterFactory> tokenFilterFactoryList = parseTokenFilterFactories(request, indexSettings, analysisRegistry,
environment, tokenizerFactory, charFilterFactoryList);

analyzer = new CustomAnalyzer(tokenizerFactory.v1(), tokenizerFactory.v2(), charFilterFactories, tokenFilterFactories);
analyzer = new CustomAnalyzer(tokenizerFactory.v1(), tokenizerFactory.v2(),
charFilterFactoryList.toArray(new CharFilterFactory[charFilterFactoryList.size()]),
tokenFilterFactoryList.toArray(new TokenFilterFactory[tokenFilterFactoryList.size()]));
closeAnalyzer = true;
} else if (analyzer == null) {
if (indexAnalyzers == null) {
@@ -462,12 +464,13 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
return extendedAttributes;
}

private static CharFilterFactory[] getCharFilterFactories(AnalyzeRequest request, IndexSettings indexSettings, AnalysisRegistry analysisRegistry,
Environment environment, CharFilterFactory[] charFilterFactories) throws IOException {
private static List<CharFilterFactory> parseCharFilterFactories(AnalyzeRequest request, IndexSettings indexSettings, AnalysisRegistry analysisRegistry,
Environment environment) throws IOException {
List<CharFilterFactory> charFilterFactoryList = new ArrayList<>();
if (request.charFilters() != null && request.charFilters().size() > 0) {
charFilterFactories = new CharFilterFactory[request.charFilters().size()];
for (int i = 0; i < request.charFilters().size(); i++) {
final AnalyzeRequest.NameOrDefinition charFilter = request.charFilters().get(i);
List<AnalyzeRequest.NameOrDefinition> charFilters = request.charFilters();
for (AnalyzeRequest.NameOrDefinition charFilter : charFilters) {
CharFilterFactory charFilterFactory;
// parse anonymous settings
if (charFilter.definition != null) {
Settings settings = getAnonymousSettings(charFilter.definition);
@@ -481,7 +484,7 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
throw new IllegalArgumentException("failed to find global char filter under [" + charFilterTypeName + "]");
}
// Need to set anonymous "name" of char_filter
charFilterFactories[i] = charFilterFactoryFactory.get(getNaIndexSettings(settings), environment, "_anonymous_charfilter_[" + i + "]", settings);
charFilterFactory = charFilterFactoryFactory.get(getNaIndexSettings(settings), environment, "_anonymous_charfilter", settings);
} else {
AnalysisModule.AnalysisProvider<CharFilterFactory> charFilterFactoryFactory;
if (indexSettings == null) {
@@ -489,31 +492,34 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
if (charFilterFactoryFactory == null) {
throw new IllegalArgumentException("failed to find global char filter under [" + charFilter.name + "]");
}
charFilterFactories[i] = charFilterFactoryFactory.get(environment, charFilter.name);
charFilterFactory = charFilterFactoryFactory.get(environment, charFilter.name);
} else {
charFilterFactoryFactory = analysisRegistry.getCharFilterProvider(charFilter.name, indexSettings);
if (charFilterFactoryFactory == null) {
throw new IllegalArgumentException("failed to find char filter under [" + charFilter.name + "]");
}
charFilterFactories[i] = charFilterFactoryFactory.get(indexSettings, environment, charFilter.name,
charFilterFactory = charFilterFactoryFactory.get(indexSettings, environment, charFilter.name,
AnalysisRegistry.getSettingsFromIndexSettings(indexSettings,
AnalysisRegistry.INDEX_ANALYSIS_CHAR_FILTER + "." + charFilter.name));
}
}
if (charFilterFactories[i] == null) {
if (charFilterFactory == null) {
throw new IllegalArgumentException("failed to find char filter under [" + charFilter.name + "]");
}
charFilterFactoryList.add(charFilterFactory);
}
}
return charFilterFactories;
return charFilterFactoryList;
}

private static TokenFilterFactory[] getTokenFilterFactories(AnalyzeRequest request, IndexSettings indexSettings, AnalysisRegistry analysisRegistry,
Environment environment, TokenFilterFactory[] tokenFilterFactories) throws IOException {
private static List<TokenFilterFactory> parseTokenFilterFactories(AnalyzeRequest request, IndexSettings indexSettings, AnalysisRegistry analysisRegistry,
Environment environment, Tuple<String, TokenizerFactory> tokenizerFactory,
List<CharFilterFactory> charFilterFactoryList) throws IOException {
List<TokenFilterFactory> tokenFilterFactoryList = new ArrayList<>();
if (request.tokenFilters() != null && request.tokenFilters().size() > 0) {
tokenFilterFactories = new TokenFilterFactory[request.tokenFilters().size()];
for (int i = 0; i < request.tokenFilters().size(); i++) {
final AnalyzeRequest.NameOrDefinition tokenFilter = request.tokenFilters().get(i);
List<AnalyzeRequest.NameOrDefinition> tokenFilters = request.tokenFilters();
for (AnalyzeRequest.NameOrDefinition tokenFilter : tokenFilters) {
TokenFilterFactory tokenFilterFactory;
// parse anonymous settings
if (tokenFilter.definition != null) {
Settings settings = getAnonymousSettings(tokenFilter.definition);
@@ -527,7 +533,11 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
throw new IllegalArgumentException("failed to find global token filter under [" + filterTypeName + "]");
}
// Need to set anonymous "name" of tokenfilter
tokenFilterFactories[i] = tokenFilterFactoryFactory.get(getNaIndexSettings(settings), environment, "_anonymous_tokenfilter_[" + i + "]", settings);
tokenFilterFactory = tokenFilterFactoryFactory.get(getNaIndexSettings(settings), environment, "_anonymous_tokenfilter", settings);
tokenFilterFactory = CustomAnalyzerProvider.checkAndApplySynonymFilter(tokenFilterFactory, tokenizerFactory.v1(), tokenizerFactory.v2(), tokenFilterFactoryList,
charFilterFactoryList, environment);

} else {
AnalysisModule.AnalysisProvider<TokenFilterFactory> tokenFilterFactoryFactory;
if (indexSettings == null) {
@@ -535,23 +545,26 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
if (tokenFilterFactoryFactory == null) {
throw new IllegalArgumentException("failed to find global token filter under [" + tokenFilter.name + "]");
}
tokenFilterFactories[i] = tokenFilterFactoryFactory.get(environment, tokenFilter.name);
tokenFilterFactory = tokenFilterFactoryFactory.get(environment, tokenFilter.name);
} else {
tokenFilterFactoryFactory = analysisRegistry.getTokenFilterProvider(tokenFilter.name, indexSettings);
if (tokenFilterFactoryFactory == null) {
if (tokenFilterFactoryFactory == null) {
throw new IllegalArgumentException("failed to find token filter under [" + tokenFilter.name + "]");
}
tokenFilterFactories[i] = tokenFilterFactoryFactory.get(indexSettings, environment, tokenFilter.name,
AnalysisRegistry.getSettingsFromIndexSettings(indexSettings,
AnalysisRegistry.INDEX_ANALYSIS_FILTER + "." + tokenFilter.name));
Settings settings = AnalysisRegistry.getSettingsFromIndexSettings(indexSettings,
AnalysisRegistry.INDEX_ANALYSIS_FILTER + "." + tokenFilter.name);
tokenFilterFactory = tokenFilterFactoryFactory.get(indexSettings, environment, tokenFilter.name, settings);
tokenFilterFactory = CustomAnalyzerProvider.checkAndApplySynonymFilter(tokenFilterFactory, tokenizerFactory.v1(), tokenizerFactory.v2(), tokenFilterFactoryList,
charFilterFactoryList, environment);
}
}
if (tokenFilterFactories[i] == null) {
if (tokenFilterFactory == null) {
throw new IllegalArgumentException("failed to find or create token filter under [" + tokenFilter.name + "]");
}
tokenFilterFactoryList.add(tokenFilterFactory);
}
}
return tokenFilterFactories;
return tokenFilterFactoryList;
}

private static Tuple<String, TokenizerFactory> parseTokenizerFactory(AnalyzeRequest request, IndexAnalyzers indexAnalzyers,
@@ -318,12 +318,12 @@ public final class AnalysisRegistry implements Closeable {
T factory = null;
if (typeName == null) {
if (currentSettings.get("tokenizer") != null) {
factory = (T) new CustomAnalyzerProvider(settings, name, currentSettings);
factory = (T) new CustomAnalyzerProvider(settings, name, currentSettings, environment);
} else {
throw new IllegalArgumentException(component + " [" + name + "] must specify either an analyzer type, or a tokenizer");
}
} else if (typeName.equals("custom")) {
factory = (T) new CustomAnalyzerProvider(settings, name, currentSettings);
factory = (T) new CustomAnalyzerProvider(settings, name, currentSettings, environment);
}
if (factory != null) {
factories.put(name, factory);
@@ -20,6 +20,7 @@
package org.elasticsearch.index.analysis;

import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.mapper.TextFieldMapper;
@@ -34,13 +35,15 @@ import java.util.Map;
public class CustomAnalyzerProvider extends AbstractIndexAnalyzerProvider<CustomAnalyzer> {

private final Settings analyzerSettings;
private final Environment environment;

private CustomAnalyzer customAnalyzer;

public CustomAnalyzerProvider(IndexSettings indexSettings,
String name, Settings settings) {
String name, Settings settings, Environment environment) {
super(indexSettings, name, settings);
this.analyzerSettings = settings;
this.environment = environment;
}

public void build(final Map<String, TokenizerFactory> tokenizers, final Map<String, CharFilterFactory> charFilters,
@@ -65,6 +68,12 @@ public class CustomAnalyzerProvider extends AbstractIndexAnalyzerProvider<Custom
charFiltersList.add(charFilter);
}

int positionIncrementGap = TextFieldMapper.Defaults.POSITION_INCREMENT_GAP;

positionIncrementGap = analyzerSettings.getAsInt("position_increment_gap", positionIncrementGap);

int offsetGap = analyzerSettings.getAsInt("offset_gap", -1);

String[] tokenFilterNames = analyzerSettings.getAsArray("filter");
List<TokenFilterFactory> tokenFilterList = new ArrayList<>(tokenFilterNames.length);
for (String tokenFilterName : tokenFilterNames) {
@@ -72,14 +81,12 @@ public class CustomAnalyzerProvider extends AbstractIndexAnalyzerProvider<Custom
if (tokenFilter == null) {
throw new IllegalArgumentException("Custom Analyzer [" + name() + "] failed to find filter under name [" + tokenFilterName + "]");
}
// no need for offsetGap when tokenizing synonyms
tokenFilter = checkAndApplySynonymFilter(tokenFilter, tokenizerName, tokenizer, tokenFilterList, charFiltersList,
this.environment);
tokenFilterList.add(tokenFilter);
}

int positionIncrementGap = TextFieldMapper.Defaults.POSITION_INCREMENT_GAP;

positionIncrementGap = analyzerSettings.getAsInt("position_increment_gap", positionIncrementGap);

int offsetGap = analyzerSettings.getAsInt("offset_gap", -1);
this.customAnalyzer = new CustomAnalyzer(tokenizerName, tokenizer,
charFiltersList.toArray(new CharFilterFactory[charFiltersList.size()]),
tokenFilterList.toArray(new TokenFilterFactory[tokenFilterList.size()]),
@@ -88,6 +95,33 @@ public class CustomAnalyzerProvider extends AbstractIndexAnalyzerProvider<Custom
);
}

public static TokenFilterFactory checkAndApplySynonymFilter(TokenFilterFactory tokenFilter, String tokenizerName, TokenizerFactory tokenizer,
List<TokenFilterFactory> tokenFilterList,
List<CharFilterFactory> charFiltersList, Environment env) {
if (tokenFilter instanceof SynonymGraphTokenFilterFactory) {
List<TokenFilterFactory> tokenFiltersListForSynonym = new ArrayList<>(tokenFilterList);

try (CustomAnalyzer analyzer = new CustomAnalyzer(tokenizerName, tokenizer,
charFiltersList.toArray(new CharFilterFactory[charFiltersList.size()]),
tokenFiltersListForSynonym.toArray(new TokenFilterFactory[tokenFiltersListForSynonym.size()]),
TextFieldMapper.Defaults.POSITION_INCREMENT_GAP,
-1)) {
tokenFilter = ((SynonymGraphTokenFilterFactory) tokenFilter).createPerAnalyzerSynonymGraphFactory(analyzer, env);
}

} else if (tokenFilter instanceof SynonymTokenFilterFactory) {
List<TokenFilterFactory> tokenFiltersListForSynonym = new ArrayList<>(tokenFilterList);
try (CustomAnalyzer analyzer = new CustomAnalyzer(tokenizerName, tokenizer,
charFiltersList.toArray(new CharFilterFactory[charFiltersList.size()]),
tokenFiltersListForSynonym.toArray(new TokenFilterFactory[tokenFiltersListForSynonym.size()]),
TextFieldMapper.Defaults.POSITION_INCREMENT_GAP,
-1)) {
tokenFilter = ((SynonymTokenFilterFactory) tokenFilter).createPerAnalyzerSynonymFactory(analyzer, env);
}
}
return tokenFilter;
}

@Override
public CustomAnalyzer get() {
return this.customAnalyzer;
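For orientation, a condensed paraphrase of the build() loop above. This is a sketch, not compilable on its own: resolveFilter is a hypothetical stand-in for the real factory lookup, and tokenizerName, tokenizer, charFiltersList, and environment come from the surrounding method.

    List<TokenFilterFactory> tokenFilterList = new ArrayList<>(tokenFilterNames.length);
    for (String tokenFilterName : tokenFilterNames) {
        TokenFilterFactory tokenFilter = resolveFilter(tokenFilterName); // hypothetical lookup
        // a synonym filter is specialized against everything before it in the
        // chain: the tokenizer, the char filters, and the filters added so far
        tokenFilter = checkAndApplySynonymFilter(tokenFilter, tokenizerName, tokenizer,
                tokenFilterList, charFiltersList, environment);
        tokenFilterList.add(tokenFilter);
    }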
@@ -19,13 +19,19 @@

package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.synonym.SolrSynonymParser;
import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.synonym.WordnetSynonymParser;
import org.elasticsearch.common.io.FastStringReader;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;

import java.io.IOException;
import java.io.Reader;

public class SynonymGraphTokenFilterFactory extends SynonymTokenFilterFactory {
public SynonymGraphTokenFilterFactory(IndexSettings indexSettings, Environment env, AnalysisRegistry analysisRegistry,
@@ -35,7 +41,45 @@ public class SynonymGraphTokenFilterFactory extends SynonymTokenFilterFactory {

@Override
public TokenStream create(TokenStream tokenStream) {
// a null fst means no synonyms
return synonymMap.fst == null ? tokenStream : new SynonymGraphFilter(tokenStream, synonymMap, ignoreCase);
throw new IllegalStateException("Call createPerAnalyzerSynonymGraphFactory to specialize this factory for an analysis chain first");
}

Factory createPerAnalyzerSynonymGraphFactory(Analyzer analyzerForParseSynonym, Environment env) {
return new Factory("synonymgraph", analyzerForParseSynonym, getRulesFromSettings(env));
}

public class Factory implements TokenFilterFactory {

private final String name;
private final SynonymMap synonymMap;

public Factory(String name, final Analyzer analyzerForParseSynonym, Reader rulesReader) {
this.name = name;

try {
SynonymMap.Builder parser;
if ("wordnet".equalsIgnoreCase(format)) {
parser = new WordnetSynonymParser(true, expand, analyzerForParseSynonym);
((WordnetSynonymParser) parser).parse(rulesReader);
} else {
parser = new SolrSynonymParser(true, expand, analyzerForParseSynonym);
((SolrSynonymParser) parser).parse(rulesReader);
}
synonymMap = parser.build();
} catch (Exception e) {
throw new IllegalArgumentException("failed to build synonyms", e);
}
}

@Override
public String name() {
return this.name;
}

@Override
public TokenStream create(TokenStream tokenStream) {
// a null fst means no synonyms
return synonymMap.fst == null ? tokenStream : new SynonymGraphFilter(tokenStream, synonymMap, ignoreCase);
}
}
}
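Once specialized, the inner Factory's create method reduces to a null-check on the compiled map's FST. A tiny, self-contained restatement (the class and method names here are illustrative, not the commit's):

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
    import org.apache.lucene.analysis.synonym.SynonymMap;

    public final class GraphFactorySketch {
        // a null fst means the rules compiled to no synonyms at all,
        // so the token stream passes through unchanged
        public static TokenStream apply(TokenStream in, SynonymMap map, boolean ignoreCase) {
            return map.fst == null ? in : new SynonymGraphFilter(in, map, ignoreCase);
        }
    }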
@@ -23,35 +23,80 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.synonym.SolrSynonymParser;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.synonym.WordnetSynonymParser;
import org.elasticsearch.Version;
import org.elasticsearch.common.io.FastStringReader;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.indices.analysis.AnalysisModule;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.nio.file.Files;
import java.util.List;

public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {

protected final SynonymMap synonymMap;
/**
* @deprecated this property only works with the tokenizer property
*/
@Deprecated
protected final boolean ignoreCase;
protected final String format;
protected final boolean expand;
protected final Settings settings;

public SynonymTokenFilterFactory(IndexSettings indexSettings, Environment env, AnalysisRegistry analysisRegistry,
String name, Settings settings) throws IOException {
super(indexSettings, name, settings);
this.settings = settings;

Reader rulesReader = null;
this.ignoreCase =
settings.getAsBooleanLenientForPreEs6Indices(indexSettings.getIndexVersionCreated(), "ignore_case", false, deprecationLogger);
if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_6_0_0_alpha3) && settings.get("ignore_case") != null) {
deprecationLogger.deprecated(
"This filter tokenizes synonyms with whatever tokenizer and token filters appear before it in the chain. " +
"If you need to ignore case with this filter, add a lowercase filter before it in the chain");
}

this.expand =
settings.getAsBooleanLenientForPreEs6Indices(indexSettings.getIndexVersionCreated(), "expand", true, deprecationLogger);

// for backward compatibility
if (indexSettings.getIndexVersionCreated().before(Version.V_6_0_0_alpha3)) {
String tokenizerName = settings.get("tokenizer", "whitespace");
AnalysisModule.AnalysisProvider<TokenizerFactory> tokenizerFactoryFactory =
analysisRegistry.getTokenizerProvider(tokenizerName, indexSettings);
if (tokenizerFactoryFactory == null) {
throw new IllegalArgumentException("failed to find tokenizer [" + tokenizerName + "] for synonym token filter");
}
final TokenizerFactory tokenizerFactory = tokenizerFactoryFactory.get(indexSettings, env, tokenizerName,
AnalysisRegistry.getSettingsFromIndexSettings(indexSettings,
AnalysisRegistry.INDEX_ANALYSIS_TOKENIZER + "." + tokenizerName));
this.tokenizerFactory = tokenizerFactory;
} else {
this.tokenizerFactory = null;
}

this.format = settings.get("format", "");
}

@Override
public TokenStream create(TokenStream tokenStream) {
throw new IllegalStateException("Call createPerAnalyzerSynonymFactory to specialize this factory for an analysis chain first");
}

protected Reader getRulesFromSettings(Environment env) {
Reader rulesReader;
if (settings.getAsArray("synonyms", null) != null) {
List<String> rules = Analysis.getWordList(env, settings, "synonyms");
List<String> rulesList = Analysis.getWordList(env, settings, "synonyms");
StringBuilder sb = new StringBuilder();
for (String line : rules) {
for (String line : rulesList) {
sb.append(line).append(System.lineSeparator());
}
rulesReader = new FastStringReader(sb.toString());
@@ -60,49 +105,72 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
} else {
throw new IllegalArgumentException("synonym requires either `synonyms` or `synonyms_path` to be configured");
}
return rulesReader;
}

this.ignoreCase =
settings.getAsBooleanLenientForPreEs6Indices(indexSettings.getIndexVersionCreated(), "ignore_case", false, deprecationLogger);
boolean expand =
settings.getAsBooleanLenientForPreEs6Indices(indexSettings.getIndexVersionCreated(), "expand", true, deprecationLogger);
Factory createPerAnalyzerSynonymFactory(Analyzer analyzerForParseSynonym, Environment env) {
return new Factory("synonym", analyzerForParseSynonym, getRulesFromSettings(env));
}

String tokenizerName = settings.get("tokenizer", "whitespace");
AnalysisModule.AnalysisProvider<TokenizerFactory> tokenizerFactoryFactory =
analysisRegistry.getTokenizerProvider(tokenizerName, indexSettings);
if (tokenizerFactoryFactory == null) {
throw new IllegalArgumentException("failed to find tokenizer [" + tokenizerName + "] for synonym token filter");
}
final TokenizerFactory tokenizerFactory = tokenizerFactoryFactory.get(indexSettings, env, tokenizerName,
AnalysisRegistry.getSettingsFromIndexSettings(indexSettings, AnalysisRegistry.INDEX_ANALYSIS_TOKENIZER + "." + tokenizerName));
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = tokenizerFactory == null ? new WhitespaceTokenizer() : tokenizerFactory.create();
TokenStream stream = ignoreCase ? new LowerCaseFilter(tokenizer) : tokenizer;
return new TokenStreamComponents(tokenizer, stream);
}
};
// for backward compatibility
/**
* @deprecated In 6.0, this filter tokenizes synonyms with whatever tokenizer and token filters appear before it in the chain.
*/
@Deprecated
protected final TokenizerFactory tokenizerFactory;

try {
SynonymMap.Builder parser = null;
public class Factory implements TokenFilterFactory {

if ("wordnet".equalsIgnoreCase(settings.get("format"))) {
parser = new WordnetSynonymParser(true, expand, analyzer);
((WordnetSynonymParser) parser).parse(rulesReader);
private final String name;
private final SynonymMap synonymMap;

public Factory(String name, Analyzer analyzerForParseSynonym, Reader rulesReader) {

this.name = name;

Analyzer analyzer;
if (tokenizerFactory != null) {
analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = tokenizerFactory.create();
TokenStream stream = ignoreCase ? new LowerCaseFilter(tokenizer) : tokenizer;
return new TokenStreamComponents(tokenizer, stream);
}
};
} else {
parser = new SolrSynonymParser(true, expand, analyzer);
((SolrSynonymParser) parser).parse(rulesReader);
analyzer = analyzerForParseSynonym;
}

synonymMap = parser.build();
} catch (Exception e) {
throw new IllegalArgumentException("failed to build synonyms", e);
try {
SynonymMap.Builder parser;
if ("wordnet".equalsIgnoreCase(format)) {
parser = new WordnetSynonymParser(true, expand, analyzer);
((WordnetSynonymParser) parser).parse(rulesReader);
} else {
parser = new SolrSynonymParser(true, expand, analyzer);
((SolrSynonymParser) parser).parse(rulesReader);
}
synonymMap = parser.build();
} catch (Exception e) {
throw new IllegalArgumentException("failed to build synonyms", e);
} finally {
if (tokenizerFactory != null) {
analyzer.close();
}
}
}

@Override
public String name() {
return this.name;
}

@Override
public TokenStream create(TokenStream tokenStream) {
// a null fst means no synonyms
return synonymMap.fst == null ? tokenStream : new SynonymFilter(tokenStream, synonymMap, ignoreCase);
}
}

@Override
public TokenStream create(TokenStream tokenStream) {
// a null fst means no synonyms
return synonymMap.fst == null ? tokenStream : new SynonymFilter(tokenStream, synonymMap, ignoreCase);
}
}
@@ -23,6 +23,7 @@ import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.queryparser.classic.ParseException;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.logging.Loggers;

@@ -41,6 +42,8 @@ import java.nio.file.Files;
import java.nio.file.Path;

import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.instanceOf;
import static org.hamcrest.Matchers.startsWith;

public class SynonymsAnalysisTests extends ESTestCase {
protected final Logger logger = Loggers.getLogger(getClass());
@@ -69,8 +72,57 @@ public class SynonymsAnalysisTests extends ESTestCase {
match("synonymAnalyzerWordnet", "abstain", "abstain refrain desist");
match("synonymAnalyzerWordnet_file", "abstain", "abstain refrain desist");
match("synonymAnalyzerWithsettings", "kimchy", "sha hay");
match("synonymAnalyzerWithStopAfterSynonym", "kimchy is the dude abides , stop", "shay is the elasticsearch man! ,");
match("synonymAnalyzerWithStopBeforeSynonym", "kimchy is the dude abides , stop", "shay is the elasticsearch man! ,");
match("synonymAnalyzerWithStopSynonymAfterSynonym", "kimchy is the dude abides", "shay is the man!");
match("synonymAnalyzerExpand", "kimchy is the dude abides", "kimchy shay is the dude elasticsearch abides man!");
match("synonymAnalyzerExpandWithStopAfterSynonym", "kimchy is the dude abides", "shay is the dude abides man!");

}

public void testSynonymWordDeleteByAnalyzer() throws IOException {
Settings settings = Settings.builder()
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
.put("path.home", createTempDir().toString())
.put("index.analysis.filter.synonym.type", "synonym")
.putArray("index.analysis.filter.synonym.synonyms", "kimchy => shay", "dude => elasticsearch", "abides => man!")
.put("index.analysis.filter.stop_within_synonym.type", "stop")
.putArray("index.analysis.filter.stop_within_synonym.stopwords", "kimchy", "elasticsearch")
.put("index.analysis.analyzer.synonymAnalyzerWithStopSynonymBeforeSynonym.tokenizer", "whitespace")
.putArray("index.analysis.analyzer.synonymAnalyzerWithStopSynonymBeforeSynonym.filter", "stop_within_synonym","synonym")
.put().build();
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
try {
indexAnalyzers = createTestAnalysis(idxSettings, settings).indexAnalyzers;
fail("fail! due to synonym word deleted by analyzer");
} catch (Exception e) {
assertThat(e, instanceOf(IllegalArgumentException.class));
assertThat(e.getMessage(), startsWith("failed to build synonyms"));
}
}

public void testExpandSynonymWordDeleteByAnalyzer() throws IOException {
Settings settings = Settings.builder()
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
.put("path.home", createTempDir().toString())
.put("index.analysis.filter.synonym_expand.type", "synonym")
.putArray("index.analysis.filter.synonym_expand.synonyms", "kimchy, shay", "dude, elasticsearch", "abides, man!")
.put("index.analysis.filter.stop_within_synonym.type", "stop")
.putArray("index.analysis.filter.stop_within_synonym.stopwords", "kimchy", "elasticsearch")
.put("index.analysis.analyzer.synonymAnalyzerExpandWithStopBeforeSynonym.tokenizer", "whitespace")
.putArray("index.analysis.analyzer.synonymAnalyzerExpandWithStopBeforeSynonym.filter", "stop_within_synonym","synonym_expand")
.put().build();
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
try {
indexAnalyzers = createTestAnalysis(idxSettings, settings).indexAnalyzers;
fail("fail! due to synonym word deleted by analyzer");
} catch (Exception e) {
assertThat(e, instanceOf(IllegalArgumentException.class));
assertThat(e.getMessage(), startsWith("failed to build synonyms"));
}
}

private void match(String analyzerName, String source, String target) throws IOException {
Analyzer analyzer = indexAnalyzers.get(analyzerName).analyzer();
@@ -383,7 +383,7 @@ public class AnalyzeActionIT extends ESIntegTestCase {
assertThat(analyzeResponse.detail().tokenfilters()[0].getTokens()[2].getPositionLength(), equalTo(1));

// tokenfilter({"type": "stop", "stopwords": ["foo", "buzz"]})
assertThat(analyzeResponse.detail().tokenfilters()[1].getName(), equalTo("_anonymous_tokenfilter_[1]"));
assertThat(analyzeResponse.detail().tokenfilters()[1].getName(), equalTo("_anonymous_tokenfilter"));
assertThat(analyzeResponse.detail().tokenfilters()[1].getTokens().length, equalTo(1));

assertThat(analyzeResponse.detail().tokenfilters()[1].getTokens()[0].getTerm(), equalTo("test"));
@@ -3,11 +3,11 @@
"analysis":{
"analyzer":{
"synonymAnalyzer":{
"tokenizer":"standard",
"tokenizer":"whitespace",
"filter":[ "synonym" ]
},
"synonymAnalyzer_file":{
"tokenizer":"standard",
"tokenizer":"whitespace",
"filter":[ "synonym_file" ]
},
"synonymAnalyzerWordnet":{
@@ -21,6 +21,26 @@
"synonymAnalyzerWithsettings":{
"tokenizer":"trigram",
"filter":["synonymWithTokenizerSettings"]
},
"synonymAnalyzerWithStopBeforeSynonym": {
"tokenizer":"whitespace",
"filter":["stop","synonym"]
},
"synonymAnalyzerWithStopAfterSynonym":{
"tokenizer":"whitespace",
"filter":["synonym","stop"]
},
"synonymAnalyzerWithStopSynonymAfterSynonym":{
"tokenizer":"whitespace",
"filter":["synonym","stop_within_synonym"]
},
"synonymAnalyzerExpand":{
"tokenizer": "whitespace",
"filter":["synonym_expand"]
},
"synonymAnalyzerExpandWithStopAfterSynonym":{
"tokenizer": "whitespace",
"filter":["synonym_expand", "stop_within_synonym"]
}
},
"tokenizer":{
@@ -61,10 +81,23 @@
"type":"synonym",
"synonyms":[
"kimchy => shay"
],
"tokenizer" : "trigram",
"min_gram" : 3,
"max_gram" : 3
]
},
"stop":{
"type": "stop",
"stopwords":["stop","synonym"]
},
"stop_within_synonym":{
"type": "stop",
"stopwords":["kimchy", "elasticsearch"]
},
"synonym_expand":{
"type":"synonym",
"synonyms":[
"kimchy , shay",
"dude , elasticsearch",
"abides , man!"
]
}
}
}
@@ -50,11 +50,14 @@ PUT /test_index
The above configures a `search_synonyms` filter, with a path of
`analysis/synonym.txt` (relative to the `config` location). The
`search_synonyms` analyzer is then configured with the filter.
Additional settings are: `ignore_case` (defaults to `false`), and
`expand` (defaults to `true`).
An additional setting is `expand` (defaults to `true`).

[float]
==== `tokenizer` and `ignore_case` are deprecated

The `tokenizer` parameter controls the tokenizers that will be used to
tokenize the synonym, and defaults to the `whitespace` tokenizer.
tokenize the synonym. It is kept only for backwards compatibility with indices created before 6.0.
The `ignore_case` parameter works only with the `tokenizer` parameter.

Two synonym formats are supported: Solr, WordNet.
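To make the `expand` setting concrete, a small sketch (the rule and the whitespace analyzer are assumed examples; Lucene's SolrSynonymParser implements the Solr semantics described here): with `expand` true, each term in an equivalence group such as "kimchy, shay" stands for every term in the group; with `expand` false, every term maps to the first term only.

    import java.io.StringReader;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
    import org.apache.lucene.analysis.synonym.SolrSynonymParser;
    import org.apache.lucene.analysis.synonym.SynonymMap;

    public class ExpandSettingSketch {
        public static SynonymMap build(boolean expand) throws Exception {
            try (Analyzer analyzer = new WhitespaceAnalyzer()) {
                // dedup = true; `expand` mirrors the filter's `expand` setting
                SolrSynonymParser parser = new SolrSynonymParser(true, expand, analyzer);
                parser.parse(new StringReader("kimchy, shay"));
                return parser.build();
            }
        }
    }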
@@ -34,11 +34,17 @@ PUT /test_index
The above configures a `synonym` filter, with a path of
`analysis/synonym.txt` (relative to the `config` location). The
`synonym` analyzer is then configured with the filter. Additional
settings are: `ignore_case` (defaults to `false`), and `expand`
(defaults to `true`).
setting is `expand` (defaults to `true`).

This filter tokenizes synonyms with whatever tokenizer and token filters
appear before it in the chain.

[float]
==== `tokenizer` and `ignore_case` are deprecated

The `tokenizer` parameter controls the tokenizers that will be used to
tokenize the synonym, and defaults to the `whitespace` tokenizer.
tokenize the synonym. It is kept only for backwards compatibility with indices created before 6.0.
The `ignore_case` parameter works only with the `tokenizer` parameter.

Two synonym formats are supported: Solr, WordNet.
@@ -29,3 +29,14 @@ now disallowed for these indices' mappings.
Previously Elasticsearch would silently ignore any dynamic templates that
included a `match_mapping_type` type that was unrecognized. An exception is now
thrown on an unrecognized type.

==== Synonym Token Filter

In 6.0, the Synonym Token Filter tokenizes synonyms with whatever
tokenizer and token filters appear before it in the chain.

`tokenizer` and `ignore_case` are deprecated.
These parameters are kept only for backwards compatibility
with indices created before 6.0, and Elasticsearch ignores them for new indices.
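A hedged sketch of the migration in practice, in the Settings style of the tests above (all index, analyzer, and filter names below are made up): instead of `ignore_case` on the synonym filter, put an explicit `lowercase` filter ahead of it in the chain, so the rules and the input are lowercased the same way.

    import org.elasticsearch.common.settings.Settings;

    // all names below are hypothetical
    Settings settings = Settings.builder()
        .put("index.analysis.filter.my_synonym.type", "synonym")
        .putArray("index.analysis.filter.my_synonym.synonyms", "Kimchy => shay")
        .put("index.analysis.analyzer.my_analyzer.tokenizer", "whitespace")
        // lowercase the stream (and thus the synonym rules) instead of ignore_case
        .putArray("index.analysis.analyzer.my_analyzer.filter", "lowercase", "my_synonym")
        .build();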
@@ -0,0 +1,35 @@
"Synonym filter with char_filter":
  # Tests analyze with synonym and char_filter. This is in the analysis-common module
  # because there are no char filters in core.
  - skip:
      version: " - 5.99.99"
      reason: parsing synonyms with the same analysis chain was added in 6.0.0
  - do:
      indices.create:
        index: test_synonym_with_charfilter
        body:
          settings:
            index:
              analysis:
                analyzer:
                  synonymAnalyzerWithCharfilter:
                    tokenizer: whitespace
                    char_filter: ["html_strip"]
                    filter: ["synonym"]
                filter:
                  synonym:
                    type: synonym
                    synonyms: ["<p>kimchy</p> => shay", "dude => <html>elasticsearch</html>", "<font>abides</font> => man!"]

  - do:
      indices.analyze:
        index: test_synonym_with_charfilter
        body:
          analyzer: "synonymAnalyzerWithCharfilter"
          text: "kimchy is the dude <html>abides</html>"
  - length: { tokens: 5 }
  - match: { tokens.0.token: shay }
  - match: { tokens.1.token: is }
  - match: { tokens.2.token: the }
  - match: { tokens.3.token: elasticsearch }
  - match: { tokens.4.token: man! }
@@ -73,5 +73,38 @@
  - match: { detail.tokenizer.tokens.0.token: foo }
  - match: { detail.tokenizer.tokens.1.token: bar }
  - match: { detail.tokenizer.tokens.2.token: buzz }
  - match: { detail.tokenfilters.0.name: "_anonymous_tokenfilter_[0]" }
  - match: { detail.tokenfilters.0.name: "_anonymous_tokenfilter" }
  - match: { detail.tokenfilters.0.tokens.0.token: bar }

---
"Synonym filter with tokenizer":
  - skip:
      version: " - 5.99.99"
      reason: parsing synonyms with the same analysis chain was added in 6.0.0
  - do:
      indices.create:
        index: test_synonym
        body:
          settings:
            index:
              analysis:
                tokenizer:
                  trigram:
                    type: nGram
                    min_gram: 3
                    max_gram: 3
                filter:
                  synonym:
                    type: synonym
                    synonyms: ["kimchy => shay"]

  - do:
      indices.analyze:
        index: test_synonym
        body:
          tokenizer: trigram
          filter: [synonym]
          text: kimchy
  - length: { tokens: 2 }
  - match: { tokens.0.token: sha }
  - match: { tokens.1.token: hay }