Parse synonyms with the same analysis chain (#8049)

* [Analysis] Parse synonyms with the same analysis chain

The Synonym Token Filter and the Synonym Graph Filter now tokenize synonyms with whatever tokenizer and token filters appear before them in the analysis chain.

Close #7199
Jun Ohtani 2017-06-20 21:50:33 +09:00 committed by GitHub
parent 3261586cac
commit 62d1969595
13 changed files with 424 additions and 92 deletions
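
A minimal, hypothetical sketch in plain Lucene (not code from this commit) of the idea behind the change: synonym rules are parsed with the same chain that precedes the synonym filter, so a lowercase filter placed before it also lowercases the rule terms. The class name and rule text below are illustrative.

import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.synonym.SolrSynonymParser;
import org.apache.lucene.analysis.synonym.SynonymMap;

public class ParseWithChainSketch {
    public static void main(String[] args) throws Exception {
        // The chain that precedes the synonym filter: whitespace tokenizer + lowercase.
        Analyzer chain = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer tokenizer = new WhitespaceTokenizer();
                TokenStream stream = new LowerCaseFilter(tokenizer);
                return new TokenStreamComponents(tokenizer, stream);
            }
        };
        // Parsing the rules with that chain lowercases "KIMCHY" to "kimchy",
        // so the rule matches the lowercased tokens seen at index/search time.
        SolrSynonymParser parser = new SolrSynonymParser(true, true, chain);
        parser.parse(new StringReader("KIMCHY => shay"));
        SynonymMap synonymMap = parser.build();
        System.out.println("rules parsed: " + (synonymMap.fst != null));
    }
}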

View File

@@ -49,6 +49,7 @@ import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AnalysisRegistry;
import org.elasticsearch.index.analysis.CharFilterFactory;
import org.elasticsearch.index.analysis.CustomAnalyzer;
import org.elasticsearch.index.analysis.CustomAnalyzerProvider;
import org.elasticsearch.index.analysis.IndexAnalyzers;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.index.analysis.TokenFilterFactory;
@@ -183,13 +184,14 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
Tuple<String, TokenizerFactory> tokenizerFactory = parseTokenizerFactory(request, indexAnalyzers,
analysisRegistry, environment);
TokenFilterFactory[] tokenFilterFactories = new TokenFilterFactory[0];
tokenFilterFactories = getTokenFilterFactories(request, indexSettings, analysisRegistry, environment, tokenFilterFactories);
List<CharFilterFactory> charFilterFactoryList = parseCharFilterFactories(request, indexSettings, analysisRegistry, environment);
CharFilterFactory[] charFilterFactories = new CharFilterFactory[0];
charFilterFactories = getCharFilterFactories(request, indexSettings, analysisRegistry, environment, charFilterFactories);
List<TokenFilterFactory> tokenFilterFactoryList = parseTokenFilterFactories(request, indexSettings, analysisRegistry,
environment, tokenizerFactory, charFilterFactoryList);
analyzer = new CustomAnalyzer(tokenizerFactory.v1(), tokenizerFactory.v2(), charFilterFactories, tokenFilterFactories);
analyzer = new CustomAnalyzer(tokenizerFactory.v1(), tokenizerFactory.v2(),
charFilterFactoryList.toArray(new CharFilterFactory[charFilterFactoryList.size()]),
tokenFilterFactoryList.toArray(new TokenFilterFactory[tokenFilterFactoryList.size()]));
closeAnalyzer = true;
} else if (analyzer == null) {
if (indexAnalyzers == null) {
@@ -462,12 +464,13 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
return extendedAttributes;
}
private static CharFilterFactory[] getCharFilterFactories(AnalyzeRequest request, IndexSettings indexSettings, AnalysisRegistry analysisRegistry,
Environment environment, CharFilterFactory[] charFilterFactories) throws IOException {
private static List<CharFilterFactory> parseCharFilterFactories(AnalyzeRequest request, IndexSettings indexSettings, AnalysisRegistry analysisRegistry,
Environment environment) throws IOException {
List<CharFilterFactory> charFilterFactoryList = new ArrayList<>();
if (request.charFilters() != null && request.charFilters().size() > 0) {
charFilterFactories = new CharFilterFactory[request.charFilters().size()];
for (int i = 0; i < request.charFilters().size(); i++) {
final AnalyzeRequest.NameOrDefinition charFilter = request.charFilters().get(i);
List<AnalyzeRequest.NameOrDefinition> charFilters = request.charFilters();
for (AnalyzeRequest.NameOrDefinition charFilter : charFilters) {
CharFilterFactory charFilterFactory;
// parse anonymous settings
if (charFilter.definition != null) {
Settings settings = getAnonymousSettings(charFilter.definition);
@@ -481,7 +484,7 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
throw new IllegalArgumentException("failed to find global char filter under [" + charFilterTypeName + "]");
}
// Need to set anonymous "name" of char_filter
charFilterFactories[i] = charFilterFactoryFactory.get(getNaIndexSettings(settings), environment, "_anonymous_charfilter_[" + i + "]", settings);
charFilterFactory = charFilterFactoryFactory.get(getNaIndexSettings(settings), environment, "_anonymous_charfilter", settings);
} else {
AnalysisModule.AnalysisProvider<CharFilterFactory> charFilterFactoryFactory;
if (indexSettings == null) {
@@ -489,31 +492,34 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
if (charFilterFactoryFactory == null) {
throw new IllegalArgumentException("failed to find global char filter under [" + charFilter.name + "]");
}
charFilterFactories[i] = charFilterFactoryFactory.get(environment, charFilter.name);
charFilterFactory = charFilterFactoryFactory.get(environment, charFilter.name);
} else {
charFilterFactoryFactory = analysisRegistry.getCharFilterProvider(charFilter.name, indexSettings);
if (charFilterFactoryFactory == null) {
throw new IllegalArgumentException("failed to find char filter under [" + charFilter.name + "]");
}
charFilterFactories[i] = charFilterFactoryFactory.get(indexSettings, environment, charFilter.name,
charFilterFactory = charFilterFactoryFactory.get(indexSettings, environment, charFilter.name,
AnalysisRegistry.getSettingsFromIndexSettings(indexSettings,
AnalysisRegistry.INDEX_ANALYSIS_CHAR_FILTER + "." + charFilter.name));
}
}
if (charFilterFactories[i] == null) {
if (charFilterFactory == null) {
throw new IllegalArgumentException("failed to find char filter under [" + charFilter.name + "]");
}
charFilterFactoryList.add(charFilterFactory);
}
}
return charFilterFactories;
return charFilterFactoryList;
}
private static TokenFilterFactory[] getTokenFilterFactories(AnalyzeRequest request, IndexSettings indexSettings, AnalysisRegistry analysisRegistry,
Environment environment, TokenFilterFactory[] tokenFilterFactories) throws IOException {
private static List<TokenFilterFactory> parseTokenFilterFactories(AnalyzeRequest request, IndexSettings indexSettings, AnalysisRegistry analysisRegistry,
Environment environment, Tuple<String, TokenizerFactory> tokenizerFactory,
List<CharFilterFactory> charFilterFactoryList) throws IOException {
List<TokenFilterFactory> tokenFilterFactoryList = new ArrayList<>();
if (request.tokenFilters() != null && request.tokenFilters().size() > 0) {
tokenFilterFactories = new TokenFilterFactory[request.tokenFilters().size()];
for (int i = 0; i < request.tokenFilters().size(); i++) {
final AnalyzeRequest.NameOrDefinition tokenFilter = request.tokenFilters().get(i);
List<AnalyzeRequest.NameOrDefinition> tokenFilters = request.tokenFilters();
for (AnalyzeRequest.NameOrDefinition tokenFilter : tokenFilters) {
TokenFilterFactory tokenFilterFactory;
// parse anonymous settings
if (tokenFilter.definition != null) {
Settings settings = getAnonymousSettings(tokenFilter.definition);
@@ -527,7 +533,11 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
throw new IllegalArgumentException("failed to find global token filter under [" + filterTypeName + "]");
}
// Need to set anonymous "name" of tokenfilter
tokenFilterFactories[i] = tokenFilterFactoryFactory.get(getNaIndexSettings(settings), environment, "_anonymous_tokenfilter_[" + i + "]", settings);
tokenFilterFactory = tokenFilterFactoryFactory.get(getNaIndexSettings(settings), environment, "_anonymous_tokenfilter", settings);
tokenFilterFactory = CustomAnalyzerProvider.checkAndApplySynonymFilter(tokenFilterFactory, tokenizerFactory.v1(), tokenizerFactory.v2(), tokenFilterFactoryList,
charFilterFactoryList, environment);
} else {
AnalysisModule.AnalysisProvider<TokenFilterFactory> tokenFilterFactoryFactory;
if (indexSettings == null) {
@@ -535,23 +545,26 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
if (tokenFilterFactoryFactory == null) {
throw new IllegalArgumentException("failed to find global token filter under [" + tokenFilter.name + "]");
}
tokenFilterFactories[i] = tokenFilterFactoryFactory.get(environment, tokenFilter.name);
tokenFilterFactory = tokenFilterFactoryFactory.get(environment, tokenFilter.name);
} else {
tokenFilterFactoryFactory = analysisRegistry.getTokenFilterProvider(tokenFilter.name, indexSettings);
if (tokenFilterFactoryFactory == null) {
throw new IllegalArgumentException("failed to find token filter under [" + tokenFilter.name + "]");
}
tokenFilterFactories[i] = tokenFilterFactoryFactory.get(indexSettings, environment, tokenFilter.name,
AnalysisRegistry.getSettingsFromIndexSettings(indexSettings,
AnalysisRegistry.INDEX_ANALYSIS_FILTER + "." + tokenFilter.name));
Settings settings = AnalysisRegistry.getSettingsFromIndexSettings(indexSettings,
AnalysisRegistry.INDEX_ANALYSIS_FILTER + "." + tokenFilter.name);
tokenFilterFactory = tokenFilterFactoryFactory.get(indexSettings, environment, tokenFilter.name, settings);
tokenFilterFactory = CustomAnalyzerProvider.checkAndApplySynonymFilter(tokenFilterFactory, tokenizerFactory.v1(), tokenizerFactory.v2(), tokenFilterFactoryList,
charFilterFactoryList, environment);
}
}
if (tokenFilterFactories[i] == null) {
if (tokenFilterFactory == null) {
throw new IllegalArgumentException("failed to find or create token filter under [" + tokenFilter.name + "]");
}
tokenFilterFactoryList.add(tokenFilterFactory);
}
}
return tokenFilterFactories;
return tokenFilterFactoryList;
}
private static Tuple<String, TokenizerFactory> parseTokenizerFactory(AnalyzeRequest request, IndexAnalyzers indexAnalyzers,

View File

@@ -318,12 +318,12 @@ public final class AnalysisRegistry implements Closeable {
T factory = null;
if (typeName == null) {
if (currentSettings.get("tokenizer") != null) {
factory = (T) new CustomAnalyzerProvider(settings, name, currentSettings);
factory = (T) new CustomAnalyzerProvider(settings, name, currentSettings, environment);
} else {
throw new IllegalArgumentException(component + " [" + name + "] must specify either an analyzer type, or a tokenizer");
}
} else if (typeName.equals("custom")) {
factory = (T) new CustomAnalyzerProvider(settings, name, currentSettings);
factory = (T) new CustomAnalyzerProvider(settings, name, currentSettings, environment);
}
if (factory != null) {
factories.put(name, factory);

View File

@@ -20,6 +20,7 @@
package org.elasticsearch.index.analysis;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.mapper.TextFieldMapper;
@@ -34,13 +35,15 @@ import java.util.Map;
public class CustomAnalyzerProvider extends AbstractIndexAnalyzerProvider<CustomAnalyzer> {
private final Settings analyzerSettings;
private final Environment environment;
private CustomAnalyzer customAnalyzer;
public CustomAnalyzerProvider(IndexSettings indexSettings,
String name, Settings settings) {
String name, Settings settings, Environment environment) {
super(indexSettings, name, settings);
this.analyzerSettings = settings;
this.environment = environment;
}
public void build(final Map<String, TokenizerFactory> tokenizers, final Map<String, CharFilterFactory> charFilters,
@@ -65,6 +68,12 @@ public class CustomAnalyzerProvider extends AbstractIndexAnalyzerProvider<Custom
charFiltersList.add(charFilter);
}
int positionIncrementGap = TextFieldMapper.Defaults.POSITION_INCREMENT_GAP;
positionIncrementGap = analyzerSettings.getAsInt("position_increment_gap", positionIncrementGap);
int offsetGap = analyzerSettings.getAsInt("offset_gap", -1);
String[] tokenFilterNames = analyzerSettings.getAsArray("filter");
List<TokenFilterFactory> tokenFilterList = new ArrayList<>(tokenFilterNames.length);
for (String tokenFilterName : tokenFilterNames) {
@@ -72,14 +81,12 @@ public class CustomAnalyzerProvider extends AbstractIndexAnalyzerProvider<Custom
if (tokenFilter == null) {
throw new IllegalArgumentException("Custom Analyzer [" + name() + "] failed to find filter under name [" + tokenFilterName + "]");
}
// offsetGap is not needed when tokenizing synonyms
tokenFilter = checkAndApplySynonymFilter(tokenFilter, tokenizerName, tokenizer, tokenFilterList, charFiltersList,
this.environment);
tokenFilterList.add(tokenFilter);
}
int positionIncrementGap = TextFieldMapper.Defaults.POSITION_INCREMENT_GAP;
positionIncrementGap = analyzerSettings.getAsInt("position_increment_gap", positionIncrementGap);
int offsetGap = analyzerSettings.getAsInt("offset_gap", -1);
this.customAnalyzer = new CustomAnalyzer(tokenizerName, tokenizer,
charFiltersList.toArray(new CharFilterFactory[charFiltersList.size()]),
tokenFilterList.toArray(new TokenFilterFactory[tokenFilterList.size()]),
@@ -88,6 +95,33 @@ public class CustomAnalyzerProvider extends AbstractIndexAnalyzerProvider<Custom
);
}
public static TokenFilterFactory checkAndApplySynonymFilter(TokenFilterFactory tokenFilter, String tokenizerName, TokenizerFactory tokenizer,
List<TokenFilterFactory> tokenFilterList,
List<CharFilterFactory> charFiltersList, Environment env) {
if (tokenFilter instanceof SynonymGraphTokenFilterFactory) {
List<TokenFilterFactory> tokenFiltersListForSynonym = new ArrayList<>(tokenFilterList);
try (CustomAnalyzer analyzer = new CustomAnalyzer(tokenizerName, tokenizer,
charFiltersList.toArray(new CharFilterFactory[charFiltersList.size()]),
tokenFiltersListForSynonym.toArray(new TokenFilterFactory[tokenFiltersListForSynonym.size()]),
TextFieldMapper.Defaults.POSITION_INCREMENT_GAP,
-1)){
tokenFilter = ((SynonymGraphTokenFilterFactory) tokenFilter).createPerAnalyzerSynonymGraphFactory(analyzer, env);
}
} else if (tokenFilter instanceof SynonymTokenFilterFactory) {
List<TokenFilterFactory> tokenFiltersListForSynonym = new ArrayList<>(tokenFilterList);
try (CustomAnalyzer analyzer = new CustomAnalyzer(tokenizerName, tokenizer,
charFiltersList.toArray(new CharFilterFactory[charFiltersList.size()]),
tokenFiltersListForSynonym.toArray(new TokenFilterFactory[tokenFiltersListForSynonym.size()]),
TextFieldMapper.Defaults.POSITION_INCREMENT_GAP,
-1)) {
tokenFilter = ((SynonymTokenFilterFactory) tokenFilter).createPerAnalyzerSynonymFactory(analyzer, env);
}
}
return tokenFilter;
}
@Override
public CustomAnalyzer get() {
return this.customAnalyzer;

View File

@@ -19,13 +19,19 @@
package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.synonym.SolrSynonymParser;
import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.synonym.WordnetSynonymParser;
import org.elasticsearch.common.io.FastStringReader;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import java.io.IOException;
import java.io.Reader;
public class SynonymGraphTokenFilterFactory extends SynonymTokenFilterFactory {
public SynonymGraphTokenFilterFactory(IndexSettings indexSettings, Environment env, AnalysisRegistry analysisRegistry,
@@ -35,7 +41,45 @@ public class SynonymGraphTokenFilterFactory extends SynonymTokenFilterFactory {
@Override
public TokenStream create(TokenStream tokenStream) {
// a null fst means no synonyms
return synonymMap.fst == null ? tokenStream : new SynonymGraphFilter(tokenStream, synonymMap, ignoreCase);
throw new IllegalStateException("Call createPerAnalyzerSynonymGraphFactory to specialize this factory for an analysis chain first");
}
Factory createPerAnalyzerSynonymGraphFactory(Analyzer analyzerForParseSynonym, Environment env){
return new Factory("synonymgraph", analyzerForParseSynonym, getRulesFromSettings(env));
}
public class Factory implements TokenFilterFactory{
private final String name;
private final SynonymMap synonymMap;
public Factory(String name, final Analyzer analyzerForParseSynonym, Reader rulesReader) {
this.name = name;
try {
SynonymMap.Builder parser;
if ("wordnet".equalsIgnoreCase(format)) {
parser = new WordnetSynonymParser(true, expand, analyzerForParseSynonym);
((WordnetSynonymParser) parser).parse(rulesReader);
} else {
parser = new SolrSynonymParser(true, expand, analyzerForParseSynonym);
((SolrSynonymParser) parser).parse(rulesReader);
}
synonymMap = parser.build();
} catch (Exception e) {
throw new IllegalArgumentException("failed to build synonyms", e);
}
}
@Override
public String name() {
return this.name;
}
@Override
public TokenStream create(TokenStream tokenStream) {
// a null fst means no synonyms
return synonymMap.fst == null ? tokenStream : new SynonymGraphFilter(tokenStream, synonymMap, ignoreCase);
}
}
}

View File

@@ -23,35 +23,80 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.synonym.SolrSynonymParser;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.synonym.WordnetSynonymParser;
import org.elasticsearch.Version;
import org.elasticsearch.common.io.FastStringReader;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.indices.analysis.AnalysisModule;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.nio.file.Files;
import java.util.List;
public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
protected final SynonymMap synonymMap;
/**
* @deprecated this property works only with the tokenizer property
*/
@Deprecated
protected final boolean ignoreCase;
protected final String format;
protected final boolean expand;
protected final Settings settings;
public SynonymTokenFilterFactory(IndexSettings indexSettings, Environment env, AnalysisRegistry analysisRegistry,
String name, Settings settings) throws IOException {
super(indexSettings, name, settings);
this.settings = settings;
Reader rulesReader = null;
this.ignoreCase =
settings.getAsBooleanLenientForPreEs6Indices(indexSettings.getIndexVersionCreated(), "ignore_case", false, deprecationLogger);
if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_6_0_0_alpha3) && settings.get("ignore_case") != null) {
deprecationLogger.deprecated(
"This tokenize synonyms with whatever tokenizer and token filters appear before it in the chain. " +
"If you need ignore case with this filter, you should set lowercase filter before this");
}
this.expand =
settings.getAsBooleanLenientForPreEs6Indices(indexSettings.getIndexVersionCreated(), "expand", true, deprecationLogger);
// for backward compatibility
if (indexSettings.getIndexVersionCreated().before(Version.V_6_0_0_alpha3)) {
String tokenizerName = settings.get("tokenizer", "whitespace");
AnalysisModule.AnalysisProvider<TokenizerFactory> tokenizerFactoryFactory =
analysisRegistry.getTokenizerProvider(tokenizerName, indexSettings);
if (tokenizerFactoryFactory == null) {
throw new IllegalArgumentException("failed to find tokenizer [" + tokenizerName + "] for synonym token filter");
}
final TokenizerFactory tokenizerFactory = tokenizerFactoryFactory.get(indexSettings, env, tokenizerName,
AnalysisRegistry.getSettingsFromIndexSettings(indexSettings,
AnalysisRegistry.INDEX_ANALYSIS_TOKENIZER + "." + tokenizerName));
this.tokenizerFactory = tokenizerFactory;
} else {
this.tokenizerFactory = null;
}
this.format = settings.get("format", "");
}
@Override
public TokenStream create(TokenStream tokenStream) {
throw new IllegalStateException("Call createPerAnalyzerSynonymFactory to specialize this factory for an analysis chain first");
}
protected Reader getRulesFromSettings(Environment env) {
Reader rulesReader;
if (settings.getAsArray("synonyms", null) != null) {
List<String> rules = Analysis.getWordList(env, settings, "synonyms");
List<String> rulesList = Analysis.getWordList(env, settings, "synonyms");
StringBuilder sb = new StringBuilder();
for (String line : rules) {
for (String line : rulesList) {
sb.append(line).append(System.lineSeparator());
}
rulesReader = new FastStringReader(sb.toString());
@@ -60,49 +105,72 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
} else {
throw new IllegalArgumentException("synonym requires either `synonyms` or `synonyms_path` to be configured");
}
return rulesReader;
}
this.ignoreCase =
settings.getAsBooleanLenientForPreEs6Indices(indexSettings.getIndexVersionCreated(), "ignore_case", false, deprecationLogger);
boolean expand =
settings.getAsBooleanLenientForPreEs6Indices(indexSettings.getIndexVersionCreated(), "expand", true, deprecationLogger);
Factory createPerAnalyzerSynonymFactory(Analyzer analyzerForParseSynonym, Environment env){
return new Factory("synonym", analyzerForParseSynonym, getRulesFromSettings(env));
}
String tokenizerName = settings.get("tokenizer", "whitespace");
AnalysisModule.AnalysisProvider<TokenizerFactory> tokenizerFactoryFactory =
analysisRegistry.getTokenizerProvider(tokenizerName, indexSettings);
if (tokenizerFactoryFactory == null) {
throw new IllegalArgumentException("failed to find tokenizer [" + tokenizerName + "] for synonym token filter");
}
final TokenizerFactory tokenizerFactory = tokenizerFactoryFactory.get(indexSettings, env, tokenizerName,
AnalysisRegistry.getSettingsFromIndexSettings(indexSettings, AnalysisRegistry.INDEX_ANALYSIS_TOKENIZER + "." + tokenizerName));
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = tokenizerFactory == null ? new WhitespaceTokenizer() : tokenizerFactory.create();
TokenStream stream = ignoreCase ? new LowerCaseFilter(tokenizer) : tokenizer;
return new TokenStreamComponents(tokenizer, stream);
}
};
// for backward compatibility
/**
* @deprecated As of 6.0, this filter tokenizes synonyms with whatever tokenizer and token filters appear before it in the chain.
*/
@Deprecated
protected final TokenizerFactory tokenizerFactory;
try {
SynonymMap.Builder parser = null;
public class Factory implements TokenFilterFactory{
if ("wordnet".equalsIgnoreCase(settings.get("format"))) {
parser = new WordnetSynonymParser(true, expand, analyzer);
((WordnetSynonymParser) parser).parse(rulesReader);
private final String name;
private final SynonymMap synonymMap;
public Factory(String name, Analyzer analyzerForParseSynonym, Reader rulesReader) {
this.name = name;
Analyzer analyzer;
if (tokenizerFactory != null) {
analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = tokenizerFactory.create();
TokenStream stream = ignoreCase ? new LowerCaseFilter(tokenizer) : tokenizer;
return new TokenStreamComponents(tokenizer, stream);
}
};
} else {
parser = new SolrSynonymParser(true, expand, analyzer);
((SolrSynonymParser) parser).parse(rulesReader);
analyzer = analyzerForParseSynonym;
}
synonymMap = parser.build();
} catch (Exception e) {
throw new IllegalArgumentException("failed to build synonyms", e);
try {
SynonymMap.Builder parser;
if ("wordnet".equalsIgnoreCase(format)) {
parser = new WordnetSynonymParser(true, expand, analyzer);
((WordnetSynonymParser) parser).parse(rulesReader);
} else {
parser = new SolrSynonymParser(true, expand, analyzer);
((SolrSynonymParser) parser).parse(rulesReader);
}
synonymMap = parser.build();
} catch (Exception e) {
throw new IllegalArgumentException("failed to build synonyms", e);
} finally {
if (tokenizerFactory != null) {
analyzer.close();
}
}
}
@Override
public String name() {
return this.name;
}
@Override
public TokenStream create(TokenStream tokenStream) {
// a null fst means no synonyms
return synonymMap.fst == null ? tokenStream : new SynonymFilter(tokenStream, synonymMap, ignoreCase);
}
}
@Override
public TokenStream create(TokenStream tokenStream) {
// a null fst means no synonyms
return synonymMap.fst == null ? tokenStream : new SynonymFilter(tokenStream, synonymMap, ignoreCase);
}
}
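
For contrast, a hypothetical sketch of the pre-6.0 path that the deprecated `tokenizer`/`ignore_case` branch above reproduces: rules were parsed with a fixed tokenizer (whitespace by default) plus an optional lowercase filter, independent of the analyzer's actual chain. This is plain Lucene, not code from this commit; names are illustrative.

import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.synonym.SolrSynonymParser;
import org.apache.lucene.analysis.synonym.SynonymMap;

public class LegacyTokenizerPathSketch {
    public static void main(String[] args) throws Exception {
        boolean ignoreCase = true; // legacy ignore_case=true
        // The pre-6.0 path: a fixed whitespace tokenizer plus an optional
        // lowercase filter, ignoring the analyzer's real chain.
        Analyzer legacy = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer tokenizer = new WhitespaceTokenizer();
                TokenStream stream = ignoreCase ? new LowerCaseFilter(tokenizer) : tokenizer;
                return new TokenStreamComponents(tokenizer, stream);
            }
        };
        SolrSynonymParser parser = new SolrSynonymParser(true, true, legacy);
        parser.parse(new StringReader("KIMCHY => shay"));
        SynonymMap synonymMap = parser.build();
        System.out.println("legacy rules parsed: " + (synonymMap.fst != null));
    }
}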

View File

@@ -23,6 +23,7 @@ import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.queryparser.classic.ParseException;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.logging.Loggers;
@@ -41,6 +42,8 @@ import java.nio.file.Files;
import java.nio.file.Path;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.instanceOf;
import static org.hamcrest.Matchers.startsWith;
public class SynonymsAnalysisTests extends ESTestCase {
protected final Logger logger = Loggers.getLogger(getClass());
@@ -69,8 +72,57 @@ public class SynonymsAnalysisTests extends ESTestCase {
match("synonymAnalyzerWordnet", "abstain", "abstain refrain desist");
match("synonymAnalyzerWordnet_file", "abstain", "abstain refrain desist");
match("synonymAnalyzerWithsettings", "kimchy", "sha hay");
match("synonymAnalyzerWithStopAfterSynonym", "kimchy is the dude abides , stop", "shay is the elasticsearch man! ,");
match("synonymAnalyzerWithStopBeforeSynonym", "kimchy is the dude abides , stop", "shay is the elasticsearch man! ,");
match("synonymAnalyzerWithStopSynonymAfterSynonym", "kimchy is the dude abides", "shay is the man!");
match("synonymAnalyzerExpand", "kimchy is the dude abides", "kimchy shay is the dude elasticsearch abides man!");
match("synonymAnalyzerExpandWithStopAfterSynonym", "kimchy is the dude abides", "shay is the dude abides man!");
}
public void testSynonymWordDeleteByAnalyzer() throws IOException {
Settings settings = Settings.builder()
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
.put("path.home", createTempDir().toString())
.put("index.analysis.filter.synonym.type", "synonym")
.putArray("index.analysis.filter.synonym.synonyms", "kimchy => shay", "dude => elasticsearch", "abides => man!")
.put("index.analysis.filter.stop_within_synonym.type", "stop")
.putArray("index.analysis.filter.stop_within_synonym.stopwords", "kimchy", "elasticsearch")
.put("index.analysis.analyzer.synonymAnalyzerWithStopSynonymBeforeSynonym.tokenizer", "whitespace")
.putArray("index.analysis.analyzer.synonymAnalyzerWithStopSynonymBeforeSynonym.filter", "stop_within_synonym","synonym")
.build();
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
try {
indexAnalyzers = createTestAnalysis(idxSettings, settings).indexAnalyzers;
fail("fail! due to synonym word deleted by analyzer");
} catch (Exception e) {
assertThat(e, instanceOf(IllegalArgumentException.class));
assertThat(e.getMessage(), startsWith("failed to build synonyms"));
}
}
public void testExpandSynonymWordDeleteByAnalyzer() throws IOException {
Settings settings = Settings.builder()
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
.put("path.home", createTempDir().toString())
.put("index.analysis.filter.synonym_expand.type", "synonym")
.putArray("index.analysis.filter.synonym_expand.synonyms", "kimchy, shay", "dude, elasticsearch", "abides, man!")
.put("index.analysis.filter.stop_within_synonym.type", "stop")
.putArray("index.analysis.filter.stop_within_synonym.stopwords", "kimchy", "elasticsearch")
.put("index.analysis.analyzer.synonymAnalyzerExpandWithStopBeforeSynonym.tokenizer", "whitespace")
.putArray("index.analysis.analyzer.synonymAnalyzerExpandWithStopBeforeSynonym.filter", "stop_within_synonym","synonym_expand")
.build();
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
try {
indexAnalyzers = createTestAnalysis(idxSettings, settings).indexAnalyzers;
fail("fail! due to synonym word deleted by analyzer");
} catch (Exception e) {
assertThat(e, instanceOf(IllegalArgumentException.class));
assertThat(e.getMessage(), startsWith("failed to build synonyms"));
}
}
private void match(String analyzerName, String source, String target) throws IOException {
Analyzer analyzer = indexAnalyzers.get(analyzerName).analyzer();
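
A standalone, hypothetical Lucene sketch of what the two tests above exercise (not code from this commit): when a stop filter earlier in the chain deletes a term that a synonym rule uses, the parser rejects the rule, and Elasticsearch wraps the failure as "failed to build synonyms". Names and rule text are illustrative.

import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.synonym.SolrSynonymParser;

public class SynonymTermEliminatedSketch {
    public static void main(String[] args) throws Exception {
        // Chain equivalent to [whitespace tokenizer, stop_within_synonym]:
        // "kimchy" is a stopword, so the rule's left-hand side analyzes to nothing.
        Analyzer chain = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer tokenizer = new WhitespaceTokenizer();
                TokenStream stream = new StopFilter(tokenizer, StopFilter.makeStopSet("kimchy"));
                return new TokenStreamComponents(tokenizer, stream);
            }
        };
        SolrSynonymParser parser = new SolrSynonymParser(true, true, chain);
        try {
            parser.parse(new StringReader("kimchy => shay"));
        } catch (Exception e) {
            // Lucene wraps the "term ... was completely eliminated by analyzer"
            // error; Elasticsearch reports it as "failed to build synonyms".
            System.out.println(e + " / cause: " + e.getCause());
        }
    }
}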

View File

@@ -383,7 +383,7 @@ public class AnalyzeActionIT extends ESIntegTestCase {
assertThat(analyzeResponse.detail().tokenfilters()[0].getTokens()[2].getPositionLength(), equalTo(1));
// tokenfilter({"type": "stop", "stopwords": ["foo", "buzz"]})
assertThat(analyzeResponse.detail().tokenfilters()[1].getName(), equalTo("_anonymous_tokenfilter_[1]"));
assertThat(analyzeResponse.detail().tokenfilters()[1].getName(), equalTo("_anonymous_tokenfilter"));
assertThat(analyzeResponse.detail().tokenfilters()[1].getTokens().length, equalTo(1));
assertThat(analyzeResponse.detail().tokenfilters()[1].getTokens()[0].getTerm(), equalTo("test"));

View File

@@ -3,11 +3,11 @@
"analysis":{
"analyzer":{
"synonymAnalyzer":{
"tokenizer":"standard",
"tokenizer":"whitespace",
"filter":[ "synonym" ]
},
"synonymAnalyzer_file":{
"tokenizer":"standard",
"tokenizer":"whitespace",
"filter":[ "synonym_file" ]
},
"synonymAnalyzerWordnet":{
@@ -21,6 +21,26 @@
"synonymAnalyzerWithsettings":{
"tokenizer":"trigram",
"filter":["synonymWithTokenizerSettings"]
},
"synonymAnalyzerWithStopBeforeSynonym": {
"tokenizer":"whitespace",
"filter":["stop","synonym"]
},
"synonymAnalyzerWithStopAfterSynonym":{
"tokenizer":"whitespace",
"filter":["synonym","stop"]
},
"synonymAnalyzerWithStopSynonymAfterSynonym":{
"tokenizer":"whitespace",
"filter":["synonym","stop_within_synonym"]
},
"synonymAnalyzerExpand":{
"tokenizer": "whitespace",
"filter":["synonym_expand"]
},
"synonymAnalyzerExpandWithStopAfterSynonym":{
"tokenizer": "whitespace",
"filter":["synonym_expand", "stop_within_synonym"]
}
},
"tokenizer":{
@@ -61,10 +81,23 @@
"type":"synonym",
"synonyms":[
"kimchy => shay"
],
"tokenizer" : "trigram",
"min_gram" : 3,
"max_gram" : 3
]
},
"stop":{
"type": "stop",
"stopwords":["stop","synonym"]
},
"stop_within_synonym":{
"type": "stop",
"stopwords":["kimchy", "elasticsearch"]
},
"synonym_expand":{
"type":"synonym",
"synonyms":[
"kimchy , shay",
"dude , elasticsearch",
"abides , man!"
]
}
}
}

View File

@@ -50,11 +50,14 @@ PUT /test_index
The above configures a `search_synonyms` filter, with a path of
`analysis/synonym.txt` (relative to the `config` location). The
`search_synonyms` analyzer is then configured with the filter.
Additional settings are: `ignore_case` (defaults to `false`), and
`expand` (defaults to `true`).
The only additional setting is `expand` (defaults to `true`).
[float]
==== `tokenizer` and `ignore_case` are deprecated
The `tokenizer` parameter controls the tokenizers that will be used to
tokenize the synonym, and defaults to the `whitespace` tokenizer.
tokenize the synonym; it is kept only for backwards compatibility with indices created before 6.0.
The `ignore_case` parameter works only with the `tokenizer` parameter.
Two synonym formats are supported: Solr and WordNet.
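
To make the new behaviour concrete, a small hypothetical Lucene sketch (not part of these docs): rules are parsed with a whitespace analyzer, then SynonymGraphFilter rewrites matching tokens in the stream. Names and rule text are illustrative.

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.synonym.SolrSynonymParser;
import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class SynonymGraphSketch {
    public static void main(String[] args) throws Exception {
        // Parse a single rule with a whitespace analyzer.
        SolrSynonymParser parser = new SolrSynonymParser(true, true, new WhitespaceAnalyzer());
        parser.parse(new StringReader("dude => elasticsearch"));
        SynonymMap synonymMap = parser.build();

        // Run the graph filter over an input stream and print the tokens.
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("the dude abides"));
        try (TokenStream stream = new SynonymGraphFilter(tokenizer, synonymMap, true)) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(term); // the, elasticsearch, abides
            }
            stream.end();
        }
    }
}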

View File

@@ -34,11 +34,17 @@ PUT /test_index
The above configures a `synonym` filter, with a path of
`analysis/synonym.txt` (relative to the `config` location). The
`synonym` analyzer is then configured with the filter. Additional
settings are: `ignore_case` (defaults to `false`), and `expand`
(defaults to `true`).
settings include `expand` (defaults to `true`).
This filter tokenizes synonyms with whatever tokenizer and token filters
appear before it in the chain.
[float]
==== `tokenizer` and `ignore_case` are deprecated
The `tokenizer` parameter controls the tokenizers that will be used to
tokenize the synonym, and defaults to the `whitespace` tokenizer.
tokenize the synonym; it is kept only for backwards compatibility with indices created before 6.0.
The `ignore_case` parameter works only with the `tokenizer` parameter.
Two synonym formats are supported: Solr and WordNet.

View File

@@ -29,3 +29,14 @@ now disallowed for these indices' mappings.
Previously Elasticsearch would silently ignore any dynamic templates that
included a `match_mapping_type` type that was unrecognized. An exception is now
thrown on an unrecognized type.
==== Synonym Token Filter
In 6.0, the Synonym Token Filter tokenizes synonyms with whatever
tokenizer and token filters appear before it in the chain.
The `tokenizer` and `ignore_case` parameters are deprecated. They are
kept only for backwards compatibility with indices created before 6.0,
and Elasticsearch ignores them for new indices.
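
A hypothetical migration sketch in the same Settings-builder style as the tests in this commit, with illustrative analyzer and filter names: instead of `ignore_case`, place a `lowercase` filter before the synonym filter, so rules and input are lowercased by the same chain.

import java.util.Arrays;

import org.elasticsearch.common.settings.Settings;

public class SynonymMigrationSketch {
    public static void main(String[] args) {
        // Before 6.0: "index.analysis.filter.my_synonyms.ignore_case" = true.
        // From 6.0 on: lowercase the tokens before the synonym filter instead.
        Settings settings = Settings.builder()
            .put("index.analysis.filter.my_synonyms.type", "synonym")
            .putArray("index.analysis.filter.my_synonyms.synonyms", "KIMCHY => shay")
            .put("index.analysis.analyzer.my_analyzer.tokenizer", "whitespace")
            .putArray("index.analysis.analyzer.my_analyzer.filter", "lowercase", "my_synonyms")
            .build();
        System.out.println(Arrays.toString(
            settings.getAsArray("index.analysis.analyzer.my_analyzer.filter")));
    }
}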

View File

@@ -0,0 +1,35 @@
"Synonym filter with char_filter":
# Tests the analyze API with a synonym filter and a char_filter. This lives in the
# analysis-common module because there are no char filters in core.
- skip:
version: " - 5.99.99"
reason: parsing synonyms with the same analysis chain was added in 6.0.0
- do:
indices.create:
index: test_synonym_with_charfilter
body:
settings:
index:
analysis:
analyzer:
synonymAnalyzerWithCharfilter:
tokenizer: whitespace
char_filter: ["html_strip"]
filter: ["synonym"]
filter:
synonym:
type: synonym
synonyms: ["<p>kimchy</p> => shay", "dude => <html>elasticsearch</html>", "<font>abides</font> => man!"]
- do:
indices.analyze:
index: test_synonym_with_charfilter
body:
analyzer: "synonymAnalyzerWithCharfilter"
text: "kimchy is the dude <html>abides</html>"
- length: { tokens: 5 }
- match: { tokens.0.token: shay }
- match: { tokens.1.token: is }
- match: { tokens.2.token: the }
- match: { tokens.3.token: elasticsearch }
- match: { tokens.4.token: man! }

View File

@@ -73,5 +73,38 @@
- match: { detail.tokenizer.tokens.0.token: foo }
- match: { detail.tokenizer.tokens.1.token: bar }
- match: { detail.tokenizer.tokens.2.token: buzz }
- match: { detail.tokenfilters.0.name: "_anonymous_tokenfilter_[0]" }
- match: { detail.tokenfilters.0.name: "_anonymous_tokenfilter" }
- match: { detail.tokenfilters.0.tokens.0.token: bar }
---
"Synonym filter with tokenizer":
- skip:
version: " - 5.99.99"
reason: parsing synonyms with the same analysis chain was added in 6.0.0
- do:
indices.create:
index: test_synonym
body:
settings:
index:
analysis:
tokenizer:
trigram:
type: nGram
min_gram: 3
max_gram: 3
filter:
synonym:
type: synonym
synonyms: ["kimchy => shay"]
- do:
indices.analyze:
index: test_synonym
body:
tokenizer: trigram
filter: [synonym]
text: kimchy
- length: { tokens: 2 }
- match: { tokens.0.token: sha }
- match: { tokens.1.token: hay }