Move more token filters to analysis-common module
The following token filters were moved: stemmer, stemmer_override, kstem, dictionary_decompounder, hyphenation_decompounder, reverse, elision and truncate. Relates to #23658
parent 1583f81047
commit a34f5fa812
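For readers new to the analysis-common effort: core discovers token filters through the AnalysisPlugin extension point, so "moving" a filter means deleting its registration in core's AnalysisModule and re-registering it from the module, which is exactly what the diff below does. A minimal sketch of that mechanism, assuming illustrative class names (the plugin class and map type here are not from this commit):

import java.util.HashMap;
import java.util.Map;

import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;

// Illustrative sketch: a plugin (or a built-in module such as analysis-common)
// contributes token filter factories keyed by the name users write in index
// settings. AnalysisModule merges these maps into its "token_filter" registry.
public class ExampleAnalysisPlugin extends Plugin implements AnalysisPlugin {
    @Override
    public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
        Map<String, AnalysisProvider<TokenFilterFactory>> filters = new HashMap<>();
        // The factory's (IndexSettings, Environment, String, Settings)
        // constructor doubles as the AnalysisProvider functional interface.
        filters.put("kstem", KStemTokenFilterFactory::new);
        return filters;
    }
}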
@@ -267,7 +267,6 @@
   <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]CustomAnalyzerProvider.java" checks="LineLength" />
   <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]ShingleTokenFilterFactory.java" checks="LineLength" />
-  <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]StemmerOverrideTokenFilterFactory.java" checks="LineLength" />
   <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]compound[/\\]HyphenationCompoundWordTokenFilterFactory.java" checks="LineLength" />
   <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]cache[/\\]bitset[/\\]BitsetFilterCache.java" checks="LineLength" />
   <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]codec[/\\]PerFieldMappingPostingFormatCodec.java" checks="LineLength" />
   <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]engine[/\\]ElasticsearchConcurrentMergeScheduler.java" checks="LineLength" />
@@ -17,7 +17,7 @@
  * under the License.
  */

-package org.elasticsearch.index.analysis.compound;
+package org.elasticsearch.analysis.common;

 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.compound.CompoundWordTokenFilterBase;
@@ -38,7 +38,7 @@ public abstract class AbstractCompoundWordTokenFilterFactory extends AbstractTok
     protected final boolean onlyLongestMatch;
     protected final CharArraySet wordList;

-    public AbstractCompoundWordTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+    protected AbstractCompoundWordTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
         super(indexSettings, name, settings);

         minWordSize = settings.getAsInt("min_word_size", CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE);
@@ -55,7 +55,6 @@ import org.elasticsearch.index.analysis.DelimitedPayloadTokenFilterFactory;
 import org.elasticsearch.index.analysis.DutchAnalyzerProvider;
 import org.elasticsearch.index.analysis.DutchStemTokenFilterFactory;
 import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory;
-import org.elasticsearch.index.analysis.ElisionTokenFilterFactory;
 import org.elasticsearch.index.analysis.EnglishAnalyzerProvider;
 import org.elasticsearch.index.analysis.FingerprintAnalyzerProvider;
 import org.elasticsearch.index.analysis.FingerprintTokenFilterFactory;
@@ -75,7 +74,6 @@ import org.elasticsearch.index.analysis.IndicNormalizationFilterFactory;
 import org.elasticsearch.index.analysis.IndonesianAnalyzerProvider;
 import org.elasticsearch.index.analysis.IrishAnalyzerProvider;
 import org.elasticsearch.index.analysis.ItalianAnalyzerProvider;
-import org.elasticsearch.index.analysis.KStemTokenFilterFactory;
 import org.elasticsearch.index.analysis.KeepTypesFilterFactory;
 import org.elasticsearch.index.analysis.KeepWordFilterFactory;
 import org.elasticsearch.index.analysis.KeywordAnalyzerProvider;
@@ -99,7 +97,6 @@ import org.elasticsearch.index.analysis.PortugueseAnalyzerProvider;
 import org.elasticsearch.index.analysis.PreConfiguredCharFilter;
 import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
 import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
-import org.elasticsearch.index.analysis.ReverseTokenFilterFactory;
 import org.elasticsearch.index.analysis.RomanianAnalyzerProvider;
 import org.elasticsearch.index.analysis.RussianAnalyzerProvider;
 import org.elasticsearch.index.analysis.RussianStemTokenFilterFactory;
@@ -116,8 +113,6 @@ import org.elasticsearch.index.analysis.StandardAnalyzerProvider;
 import org.elasticsearch.index.analysis.StandardHtmlStripAnalyzerProvider;
 import org.elasticsearch.index.analysis.StandardTokenFilterFactory;
 import org.elasticsearch.index.analysis.StandardTokenizerFactory;
-import org.elasticsearch.index.analysis.StemmerOverrideTokenFilterFactory;
-import org.elasticsearch.index.analysis.StemmerTokenFilterFactory;
 import org.elasticsearch.index.analysis.StopAnalyzerProvider;
 import org.elasticsearch.index.analysis.StopTokenFilterFactory;
 import org.elasticsearch.index.analysis.SwedishAnalyzerProvider;
@@ -125,13 +120,10 @@ import org.elasticsearch.index.analysis.ThaiAnalyzerProvider;
 import org.elasticsearch.index.analysis.ThaiTokenizerFactory;
 import org.elasticsearch.index.analysis.TokenFilterFactory;
 import org.elasticsearch.index.analysis.TokenizerFactory;
-import org.elasticsearch.index.analysis.TruncateTokenFilterFactory;
 import org.elasticsearch.index.analysis.TurkishAnalyzerProvider;
 import org.elasticsearch.index.analysis.UAX29URLEmailTokenizerFactory;
 import org.elasticsearch.index.analysis.WhitespaceAnalyzerProvider;
 import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory;
-import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory;
-import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory;
 import org.elasticsearch.plugins.AnalysisPlugin;

 import java.io.IOException;
@@ -201,23 +193,16 @@ public final class AnalysisModule {
             hunspellService) {
         NamedRegistry<AnalysisProvider<TokenFilterFactory>> tokenFilters = new NamedRegistry<>("token_filter");
         tokenFilters.register("stop", StopTokenFilterFactory::new);
-        tokenFilters.register("reverse", ReverseTokenFilterFactory::new);
-        tokenFilters.register("kstem", KStemTokenFilterFactory::new);
         tokenFilters.register("standard", StandardTokenFilterFactory::new);
         tokenFilters.register("shingle", ShingleTokenFilterFactory::new);
         tokenFilters.register("min_hash", MinHashTokenFilterFactory::new);
-        tokenFilters.register("truncate", requriesAnalysisSettings(TruncateTokenFilterFactory::new));
         tokenFilters.register("limit", LimitTokenCountFilterFactory::new);
         tokenFilters.register("common_grams", requriesAnalysisSettings(CommonGramsTokenFilterFactory::new));
-        tokenFilters.register("stemmer", StemmerTokenFilterFactory::new);
         tokenFilters.register("delimited_payload_filter", DelimitedPayloadTokenFilterFactory::new);
-        tokenFilters.register("elision", ElisionTokenFilterFactory::new);
         tokenFilters.register("keep", requriesAnalysisSettings(KeepWordFilterFactory::new));
         tokenFilters.register("keep_types", requriesAnalysisSettings(KeepTypesFilterFactory::new));
         tokenFilters.register("pattern_capture", requriesAnalysisSettings(PatternCaptureGroupTokenFilterFactory::new));
         tokenFilters.register("pattern_replace", requriesAnalysisSettings(PatternReplaceTokenFilterFactory::new));
-        tokenFilters.register("dictionary_decompounder", requriesAnalysisSettings(DictionaryCompoundWordTokenFilterFactory::new));
-        tokenFilters.register("hyphenation_decompounder", requriesAnalysisSettings(HyphenationCompoundWordTokenFilterFactory::new));
         tokenFilters.register("arabic_stem", ArabicStemTokenFilterFactory::new);
         tokenFilters.register("brazilian_stem", BrazilianStemTokenFilterFactory::new);
         tokenFilters.register("czech_stem", CzechStemTokenFilterFactory::new);
@@ -225,7 +210,6 @@ public final class AnalysisModule {
         tokenFilters.register("french_stem", FrenchStemTokenFilterFactory::new);
         tokenFilters.register("german_stem", GermanStemTokenFilterFactory::new);
         tokenFilters.register("russian_stem", RussianStemTokenFilterFactory::new);
-        tokenFilters.register("stemmer_override", requriesAnalysisSettings(StemmerOverrideTokenFilterFactory::new));
         tokenFilters.register("arabic_normalization", ArabicNormalizationFilterFactory::new);
         tokenFilters.register("german_normalization", GermanNormalizationFilterFactory::new);
         tokenFilters.register("hindi_normalization", HindiNormalizationFilterFactory::new);
@@ -47,7 +47,7 @@ import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
 import org.elasticsearch.index.analysis.StandardTokenizerFactory;
 import org.elasticsearch.index.analysis.StopTokenFilterFactory;
 import org.elasticsearch.index.analysis.TokenFilterFactory;
-import org.elasticsearch.index.analysis.filter1.MyFilterTokenFilterFactory;
+import org.elasticsearch.index.analysis.MyFilterTokenFilterFactory;
 import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
 import org.elasticsearch.plugins.AnalysisPlugin;
 import org.elasticsearch.test.ESTestCase;
@@ -196,18 +196,6 @@ public class AnalysisModuleTests extends ESTestCase {
-//        assertThat(czechstemmeranalyzer.tokenizerFactory(), instanceOf(StandardTokenizerFactory.class));
-//        assertThat(czechstemmeranalyzer.tokenFilters().length, equalTo(4));
-//        assertThat(czechstemmeranalyzer.tokenFilters()[3], instanceOf(CzechStemTokenFilterFactory.class));
-//
-//        // check dictionary decompounder
-//        analyzer = analysisService.analyzer("decompoundingAnalyzer").analyzer();
-//        assertThat(analyzer, instanceOf(CustomAnalyzer.class));
-//        CustomAnalyzer dictionaryDecompounderAnalyze = (CustomAnalyzer) analyzer;
-//        assertThat(dictionaryDecompounderAnalyze.tokenizerFactory(), instanceOf(StandardTokenizerFactory.class));
-//        assertThat(dictionaryDecompounderAnalyze.tokenFilters().length, equalTo(1));
-//        assertThat(dictionaryDecompounderAnalyze.tokenFilters()[0], instanceOf(DictionaryCompoundWordTokenFilterFactory.class));

         Set<?> wordList = Analysis.getWordSet(null, Version.CURRENT, settings, "index.analysis.filter.dict_dec.word_list");
         MatcherAssert.assertThat(wordList.size(), equalTo(6));
-//        MatcherAssert.assertThat(wordList, hasItems("donau", "dampf", "schiff", "spargel", "creme", "suppe"));
     }

     public void testWordListPath() throws Exception {
@@ -93,16 +93,16 @@ public class AnalyzeActionIT extends ESIntegTestCase {
         assertThat(analyzeResponse.getTokens().size(), equalTo(1));
         assertThat(analyzeResponse.getTokens().get(0).getTerm(), equalTo("this is a test"));

-        analyzeResponse = client().admin().indices().prepareAnalyze("THIS IS A TEST").setTokenizer("standard").addTokenFilter("lowercase").addTokenFilter("reverse").get();
+        analyzeResponse = client().admin().indices().prepareAnalyze("THIS IS A TEST").setTokenizer("standard").addTokenFilter("lowercase").get();
         assertThat(analyzeResponse.getTokens().size(), equalTo(4));
         AnalyzeResponse.AnalyzeToken token = analyzeResponse.getTokens().get(0);
-        assertThat(token.getTerm(), equalTo("siht"));
+        assertThat(token.getTerm(), equalTo("this"));
         token = analyzeResponse.getTokens().get(1);
-        assertThat(token.getTerm(), equalTo("si"));
+        assertThat(token.getTerm(), equalTo("is"));
         token = analyzeResponse.getTokens().get(2);
         assertThat(token.getTerm(), equalTo("a"));
         token = analyzeResponse.getTokens().get(3);
-        assertThat(token.getTerm(), equalTo("tset"));
+        assertThat(token.getTerm(), equalTo("test"));

         analyzeResponse = client().admin().indices().prepareAnalyze("of course").setTokenizer("standard").addTokenFilter("stop").get();
         assertThat(analyzeResponse.getTokens().size(), equalTo(1));
@@ -445,8 +445,6 @@ public class SuggestSearchIT extends ESIntegTestCase {
     public void testPrefixLength() throws IOException {
         CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder()
                 .put(SETTING_NUMBER_OF_SHARDS, 1)
-                .put("index.analysis.analyzer.reverse.tokenizer", "standard")
-                .putArray("index.analysis.analyzer.reverse.filter", "lowercase", "reverse")
                 .put("index.analysis.analyzer.body.tokenizer", "standard")
                 .putArray("index.analysis.analyzer.body.filter", "lowercase")
                 .put("index.analysis.analyzer.bigram.tokenizer", "standard")
@@ -458,7 +456,6 @@ public class SuggestSearchIT extends ESIntegTestCase {
         XContentBuilder mapping = XContentFactory.jsonBuilder().startObject().startObject("type1")
                 .startObject("properties")
                 .startObject("body").field("type", "text").field("analyzer", "body").endObject()
-                .startObject("body_reverse").field("type", "text").field("analyzer", "reverse").endObject()
                 .startObject("bigram").field("type", "text").field("analyzer", "bigram").endObject()
                 .endObject()
                 .endObject().endObject();
@@ -486,8 +483,6 @@ public class SuggestSearchIT extends ESIntegTestCase {
     public void testBasicPhraseSuggest() throws IOException, URISyntaxException {
         CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder()
                 .put(indexSettings())
-                .put("index.analysis.analyzer.reverse.tokenizer", "standard")
-                .putArray("index.analysis.analyzer.reverse.filter", "lowercase", "reverse")
                 .put("index.analysis.analyzer.body.tokenizer", "standard")
                 .putArray("index.analysis.analyzer.body.filter", "lowercase")
                 .put("index.analysis.analyzer.bigram.tokenizer", "standard")
@@ -503,10 +498,6 @@ public class SuggestSearchIT extends ESIntegTestCase {
                     field("type", "text").
                     field("analyzer", "body")
                 .endObject()
-                .startObject("body_reverse").
-                    field("type", "text").
-                    field("analyzer", "reverse")
-                .endObject()
                 .startObject("bigram").
                     field("type", "text").
                     field("analyzer", "bigram")
@@ -536,7 +527,7 @@ public class SuggestSearchIT extends ESIntegTestCase {
             "Police sergeant who stops the film",
         };
         for (String line : strings) {
-            index("test", "type1", line, "body", line, "body_reverse", line, "bigram", line);
+            index("test", "type1", line, "body", line, "bigram", line);
         }
         refresh();

@@ -576,14 +567,6 @@ public class SuggestSearchIT extends ESIntegTestCase {
         searchSuggest = searchSuggest( "Arthur, King of the Britons", "simple_phrase", phraseSuggest);
         assertSuggestion(searchSuggest, 0, "simple_phrase", "arthur king of the britons");

-        //test reverse suggestions with pre & post filter
-        phraseSuggest
-            .addCandidateGenerator(candidateGenerator("body").minWordLength(1).suggestMode("always"))
-            .addCandidateGenerator(candidateGenerator("body_reverse").minWordLength(1).suggestMode("always").preFilter("reverse")
-                .postFilter("reverse"));
-        searchSuggest = searchSuggest( "Artur, Ging of the Britons", "simple_phrase", phraseSuggest);
-        assertSuggestion(searchSuggest, 0, "simple_phrase", "arthur king of the britons");
-
         // set all mass to trigrams (not indexed)
         phraseSuggest.clearCandidateGenerators()
             .addCandidateGenerator(candidateGenerator("body").minWordLength(1).suggestMode("always"))
@@ -633,8 +616,6 @@ public class SuggestSearchIT extends ESIntegTestCase {
     public void testSizeParam() throws IOException {
         CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder()
                 .put(SETTING_NUMBER_OF_SHARDS, 1)
-                .put("index.analysis.analyzer.reverse.tokenizer", "standard")
-                .putArray("index.analysis.analyzer.reverse.filter", "lowercase", "reverse")
                 .put("index.analysis.analyzer.body.tokenizer", "standard")
                 .putArray("index.analysis.analyzer.body.filter", "lowercase")
                 .put("index.analysis.analyzer.bigram.tokenizer", "standard")
@@ -652,10 +633,6 @@ public class SuggestSearchIT extends ESIntegTestCase {
                         .field("type", "text")
                         .field("analyzer", "body")
                     .endObject()
-                    .startObject("body_reverse")
-                        .field("type", "text")
-                        .field("analyzer", "reverse")
-                    .endObject()
                     .startObject("bigram")
                         .field("type", "text")
                         .field("analyzer", "bigram")
@@ -667,9 +644,9 @@ public class SuggestSearchIT extends ESIntegTestCase {
         ensureGreen();

         String line = "xorr the god jewel";
-        index("test", "type1", "1", "body", line, "body_reverse", line, "bigram", line);
+        index("test", "type1", "1", "body", line, "bigram", line);
         line = "I got it this time";
-        index("test", "type1", "2", "body", line, "body_reverse", line, "bigram", line);
+        index("test", "type1", "2", "body", line, "bigram", line);
         refresh();

         PhraseSuggestionBuilder phraseSuggestion = phraseSuggestion("bigram")
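The reverse-suggestion coverage removed above does not disappear: it is re-added as a REST test in the analysis-common module (see the "reverse suggestions" YAML test later in this diff). The technique: index each term reversed, then have the candidate generator reverse the misspelled input before matching (pre_filter) and reverse the generated candidates back afterwards (post_filter), so errors at the start of a word become cheap suffix errors. A sketch using the same helper methods this test class already uses (candidateGenerator and phraseSuggestion are assumed static imports from the surrounding file):

// Sketch of the removed reverse-suggest setup. "body_reverse" is a field
// analyzed with a [lowercase, reverse] filter chain, so its terms are stored
// back-to-front.
PhraseSuggestionBuilder phraseSuggest = phraseSuggestion("bigram")
    .addCandidateGenerator(candidateGenerator("body")
        .minWordLength(1).suggestMode("always"))
    .addCandidateGenerator(candidateGenerator("body_reverse")
        .minWordLength(1).suggestMode("always")
        .preFilter("reverse")     // reverse the input before generating candidates
        .postFilter("reverse"));  // reverse the candidates back to normal order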
@@ -17,10 +17,6 @@
                     },
                     "my":{
                         "type":"myfilter"
                     },
-                    "dict_dec":{
-                        "type":"dictionary_decompounder",
-                        "word_list":["donau", "dampf", "schiff", "spargel", "creme", "suppe"]
-                    }
                 },
                 "analyzer":{
@@ -43,10 +39,6 @@
                     "czechAnalyzerWithStemmer":{
                         "tokenizer":"standard",
                         "filter":["standard", "lowercase", "stop", "czech_stem"]
                     },
-                    "decompoundingAnalyzer":{
-                        "tokenizer":"standard",
-                        "filter":["dict_dec"]
-                    }
                 }
             }
@@ -12,9 +12,6 @@ index :
             stopwords : [stop2-1, stop2-2]
         my :
             type : myfilter
-        dict_dec :
-            type : dictionary_decompounder
-            word_list : [donau, dampf, schiff, spargel, creme, suppe]
     analyzer :
         standard :
             type : standard
@@ -34,6 +31,3 @@ index :
         czechAnalyzerWithStemmer :
             tokenizer : standard
             filter : [standard, lowercase, stop, czech_stem]
-        decompoundingAnalyzer :
-            tokenizer : standard
-            filter : [dict_dec]
@@ -107,6 +107,14 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
         filters.put("ngram", NGramTokenFilterFactory::new);
         filters.put("edgeNGram", EdgeNGramTokenFilterFactory::new);
         filters.put("edge_ngram", EdgeNGramTokenFilterFactory::new);
+        filters.put("stemmer", StemmerTokenFilterFactory::new);
+        filters.put("stemmer_override", requriesAnalysisSettings(StemmerOverrideTokenFilterFactory::new));
+        filters.put("kstem", KStemTokenFilterFactory::new);
+        filters.put("dictionary_decompounder", requriesAnalysisSettings(DictionaryCompoundWordTokenFilterFactory::new));
+        filters.put("hyphenation_decompounder", requriesAnalysisSettings(HyphenationCompoundWordTokenFilterFactory::new));
+        filters.put("reverse", ReverseTokenFilterFactory::new);
+        filters.put("elision", ElisionTokenFilterFactory::new);
+        filters.put("truncate", requriesAnalysisSettings(TruncateTokenFilterFactory::new));
         return filters;
     }

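A note on requriesAnalysisSettings (the misspelling is verbatim in the source): it wraps a provider so the filter can only be instantiated as a named, configured filter in index settings. Filters like stemmer_override or dictionary_decompounder are meaningless without their rules or word_list, so the wrapper makes a bare reference fail fast instead of building a broken filter. A sketch of what such a wrapper plausibly looks like, inferred from the AnalysisProvider interface used throughout this diff rather than quoted from the commit:

import java.io.IOException;

import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;

// Assumed sketch: delegate creation unchanged, but report that per-filter
// analysis settings are required, so the registry rejects settings-less use.
final class RequiresSettingsSketch {
    static <T> AnalysisProvider<T> requriesAnalysisSettings(AnalysisProvider<T> provider) {
        return new AnalysisProvider<T>() {
            @Override
            public T get(IndexSettings indexSettings, Environment environment,
                         String name, Settings settings) throws IOException {
                return provider.get(indexSettings, environment, name, settings);
            }
            @Override
            public boolean requiresAnalysisSettings() {
                return true;
            }
        };
    }
}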
@@ -17,7 +17,7 @@
  * under the License.
  */

-package org.elasticsearch.index.analysis.compound;
+package org.elasticsearch.analysis.common;

 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
@@ -33,7 +33,7 @@ import org.elasticsearch.index.IndexSettings;
  */
 public class DictionaryCompoundWordTokenFilterFactory extends AbstractCompoundWordTokenFilterFactory {

-    public DictionaryCompoundWordTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+    DictionaryCompoundWordTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
         super(indexSettings, env, name, settings);
     }
@@ -17,7 +17,7 @@
  * under the License.
  */

-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;

 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.TokenStream;
@@ -25,12 +25,15 @@ import org.apache.lucene.analysis.util.ElisionFilter;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
+import org.elasticsearch.index.analysis.Analysis;
+import org.elasticsearch.index.analysis.MultiTermAwareComponent;

 public class ElisionTokenFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {

     private final CharArraySet articles;

-    public ElisionTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+    ElisionTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
         super(indexSettings, name, settings);
         this.articles = Analysis.parseArticles(env, indexSettings.getIndexVersionCreated(), settings);
     }
@@ -17,7 +17,7 @@
  * under the License.
  */

-package org.elasticsearch.index.analysis.compound;
+package org.elasticsearch.analysis.common;

 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
@@ -27,6 +27,7 @@ import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
 import org.xml.sax.InputSource;

+import java.io.InputStream;
 import java.nio.file.Files;
 import java.nio.file.Path;

@@ -39,7 +40,7 @@ public class HyphenationCompoundWordTokenFilterFactory extends AbstractCompoundW

     private final HyphenationTree hyphenationTree;

-    public HyphenationCompoundWordTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+    HyphenationCompoundWordTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
         super(indexSettings, env, name, settings);

         String hyphenationPatternsPath = settings.get("hyphenation_patterns_path", null);
@@ -50,7 +51,8 @@ public class HyphenationCompoundWordTokenFilterFactory extends AbstractCompoundW
         Path hyphenationPatternsFile = env.configFile().resolve(hyphenationPatternsPath);

         try {
-            hyphenationTree = HyphenationCompoundWordTokenFilter.getHyphenationTree(new InputSource(Files.newInputStream(hyphenationPatternsFile)));
+            InputStream in = Files.newInputStream(hyphenationPatternsFile);
+            hyphenationTree = HyphenationCompoundWordTokenFilter.getHyphenationTree(new InputSource(in));
         } catch (Exception e) {
             throw new IllegalArgumentException("Exception while reading hyphenation_patterns_path.", e);
         }
@@ -17,17 +17,18 @@
  * under the License.
  */

-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;

 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.en.KStemFilter;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;

 public class KStemTokenFilterFactory extends AbstractTokenFilterFactory {

-    public KStemTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+    KStemTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
     }
@@ -17,17 +17,18 @@
  * under the License.
  */

-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;

 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.reverse.ReverseStringFilter;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;

 public class ReverseTokenFilterFactory extends AbstractTokenFilterFactory {

-    public ReverseTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+    ReverseTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
     }
@@ -17,7 +17,7 @@
  * under the License.
  */

-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;

 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter;
@@ -26,6 +26,8 @@ import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
+import org.elasticsearch.index.analysis.Analysis;

 import java.io.IOException;
 import java.util.List;
@@ -34,7 +36,7 @@ public class StemmerOverrideTokenFilterFactory extends AbstractTokenFilterFactor

     private final StemmerOverrideMap overrideMap;

-    public StemmerOverrideTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) throws IOException {
+    StemmerOverrideTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) throws IOException {
         super(indexSettings, name, settings);

         List<String> rules = Analysis.getWordList(env, settings, "rules");
@@ -17,7 +17,7 @@
  * under the License.
  */

-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;

 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.ar.ArabicStemFilter;
@@ -57,6 +57,7 @@ import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
 import org.tartarus.snowball.ext.ArmenianStemmer;
 import org.tartarus.snowball.ext.BasqueStemmer;
 import org.tartarus.snowball.ext.CatalanStemmer;
@@ -86,7 +87,7 @@ public class StemmerTokenFilterFactory extends AbstractTokenFilterFactory {

     private String language;

-    public StemmerTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+    StemmerTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
         this.language = Strings.capitalize(settings.get("language", settings.get("name", "porter")));
     }
@@ -17,19 +17,20 @@
  * under the License.
  */

-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;

 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;

 public class TruncateTokenFilterFactory extends AbstractTokenFilterFactory {

     private final int length;

-    public TruncateTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+    TruncateTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
         this.length = settings.getAsInt("length", -1);
         if (length <= 0) {
@@ -26,6 +26,7 @@ import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory;
 import org.apache.lucene.analysis.reverse.ReverseStringFilterFactory;
 import org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory;
 import org.elasticsearch.index.analysis.HtmlStripCharFilterFactory;
+import org.elasticsearch.index.analysis.SynonymTokenFilterFactory;
 import org.elasticsearch.indices.analysis.AnalysisFactoryTestCase;

 import java.util.List;
@@ -67,6 +68,39 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
         filters.put("uppercase", UpperCaseTokenFilterFactory.class);
         filters.put("ngram", NGramTokenFilterFactory.class);
         filters.put("edgengram", EdgeNGramTokenFilterFactory.class);
+        filters.put("bulgarianstem", StemmerTokenFilterFactory.class);
+        filters.put("englishminimalstem", StemmerTokenFilterFactory.class);
+        filters.put("englishpossessive", StemmerTokenFilterFactory.class);
+        filters.put("finnishlightstem", StemmerTokenFilterFactory.class);
+        filters.put("frenchlightstem", StemmerTokenFilterFactory.class);
+        filters.put("frenchminimalstem", StemmerTokenFilterFactory.class);
+        filters.put("galicianminimalstem", StemmerTokenFilterFactory.class);
+        filters.put("galicianstem", StemmerTokenFilterFactory.class);
+        filters.put("germanlightstem", StemmerTokenFilterFactory.class);
+        filters.put("germanminimalstem", StemmerTokenFilterFactory.class);
+        filters.put("greekstem", StemmerTokenFilterFactory.class);
+        filters.put("hindistem", StemmerTokenFilterFactory.class);
+        filters.put("hungarianlightstem", StemmerTokenFilterFactory.class);
+        filters.put("indonesianstem", StemmerTokenFilterFactory.class);
+        filters.put("italianlightstem", StemmerTokenFilterFactory.class);
+        filters.put("latvianstem", StemmerTokenFilterFactory.class);
+        filters.put("norwegianlightstem", StemmerTokenFilterFactory.class);
+        filters.put("norwegianminimalstem", StemmerTokenFilterFactory.class);
+        filters.put("portuguesestem", StemmerTokenFilterFactory.class);
+        filters.put("portugueselightstem", StemmerTokenFilterFactory.class);
+        filters.put("portugueseminimalstem", StemmerTokenFilterFactory.class);
+        filters.put("russianlightstem", StemmerTokenFilterFactory.class);
+        filters.put("soranistem", StemmerTokenFilterFactory.class);
+        filters.put("spanishlightstem", StemmerTokenFilterFactory.class);
+        filters.put("swedishlightstem", StemmerTokenFilterFactory.class);
+        filters.put("stemmeroverride", StemmerOverrideTokenFilterFactory.class);
+        filters.put("kstem", KStemTokenFilterFactory.class);
+        filters.put("synonym", SynonymTokenFilterFactory.class);
+        filters.put("dictionarycompoundword", DictionaryCompoundWordTokenFilterFactory.class);
+        filters.put("hyphenationcompoundword", HyphenationCompoundWordTokenFilterFactory.class);
+        filters.put("reversestring", ReverseTokenFilterFactory.class);
+        filters.put("elision", ElisionTokenFilterFactory.class);
+        filters.put("truncate", TruncateTokenFilterFactory.class);
         return filters;
     }

@@ -17,7 +17,7 @@
  * under the License.
  */

-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
@@ -29,8 +29,9 @@ import org.elasticsearch.common.lucene.all.AllTokenStream;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
-import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory;
-import org.elasticsearch.index.analysis.filter1.MyFilterTokenFilterFactory;
+import org.elasticsearch.index.analysis.IndexAnalyzers;
+import org.elasticsearch.index.analysis.MyFilterTokenFilterFactory;
+import org.elasticsearch.index.analysis.TokenFilterFactory;
 import org.elasticsearch.indices.analysis.AnalysisModule;
 import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
 import org.elasticsearch.plugins.AnalysisPlugin;
@@ -40,10 +41,10 @@ import org.hamcrest.MatcherAssert;

 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.List;
 import java.util.Map;

-import static java.util.Collections.singletonList;
 import static java.util.Collections.singletonMap;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.hasItems;
@@ -53,12 +54,7 @@ public class CompoundAnalysisTests extends ESTestCase {
     public void testDefaultsCompoundAnalysis() throws Exception {
         Settings settings = getJsonSettings();
         IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings);
-        AnalysisModule analysisModule = new AnalysisModule(new Environment(settings), singletonList(new AnalysisPlugin() {
-            @Override
-            public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
-                return singletonMap("myfilter", MyFilterTokenFilterFactory::new);
-            }
-        }));
+        AnalysisModule analysisModule = createAnalysisModule(settings);
         TokenFilterFactory filterFactory = analysisModule.getAnalysisRegistry().buildTokenFilterFactories(idxSettings).get("dict_dec");
         MatcherAssert.assertThat(filterFactory, instanceOf(DictionaryCompoundWordTokenFilterFactory.class));
     }
@@ -75,12 +71,7 @@ public class CompoundAnalysisTests extends ESTestCase {

     private List<String> analyze(Settings settings, String analyzerName, String text) throws IOException {
         IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings);
-        AnalysisModule analysisModule = new AnalysisModule(new Environment(settings), singletonList(new AnalysisPlugin() {
-            @Override
-            public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
-                return singletonMap("myfilter", MyFilterTokenFilterFactory::new);
-            }
-        }));
+        AnalysisModule analysisModule = createAnalysisModule(settings);
         IndexAnalyzers indexAnalyzers = analysisModule.getAnalysisRegistry().build(idxSettings);
         Analyzer analyzer = indexAnalyzers.get(analyzerName).analyzer();

@@ -99,8 +90,18 @@ public class CompoundAnalysisTests extends ESTestCase {
         return terms;
     }

+    private AnalysisModule createAnalysisModule(Settings settings) throws IOException {
+        CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin();
+        return new AnalysisModule(new Environment(settings), Arrays.asList(commonAnalysisPlugin, new AnalysisPlugin() {
+            @Override
+            public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
+                return singletonMap("myfilter", MyFilterTokenFilterFactory::new);
+            }
+        }));
+    }
+
     private Settings getJsonSettings() throws IOException {
-        String json = "/org/elasticsearch/index/analysis/test1.json";
+        String json = "/org/elasticsearch/analysis/common/test1.json";
         return Settings.builder()
                 .loadFromStream(json, getClass().getResourceAsStream(json))
                 .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
@@ -109,7 +110,7 @@ public class CompoundAnalysisTests extends ESTestCase {
     }

     private Settings getYamlSettings() throws IOException {
-        String yaml = "/org/elasticsearch/index/analysis/test1.yml";
+        String yaml = "/org/elasticsearch/analysis/common/test1.yml";
         return Settings.builder()
                 .loadFromStream(yaml, getClass().getResourceAsStream(yaml))
                 .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
@@ -16,7 +16,7 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;

 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
@@ -26,6 +26,10 @@ import org.apache.lucene.analysis.snowball.SnowballFilter;
 import org.elasticsearch.Version;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.analysis.AnalysisTestsHelper;
+import org.elasticsearch.index.analysis.IndexAnalyzers;
+import org.elasticsearch.index.analysis.NamedAnalyzer;
+import org.elasticsearch.index.analysis.TokenFilterFactory;
 import org.elasticsearch.test.ESTestCase;
 import org.elasticsearch.test.ESTokenStreamTestCase;
 import org.elasticsearch.test.VersionUtils;
@@ -38,6 +42,9 @@ import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_VERSION_C
 import static org.hamcrest.Matchers.instanceOf;

 public class StemmerTokenFilterFactoryTests extends ESTokenStreamTestCase {
+
+    private static final CommonAnalysisPlugin PLUGIN = new CommonAnalysisPlugin();
+
     public void testEnglishFilterFactory() throws IOException {
         int iters = scaledRandomIntBetween(20, 100);
         for (int i = 0; i < iters; i++) {
@@ -51,7 +58,7 @@ public class StemmerTokenFilterFactoryTests extends ESTokenStreamTestCase {
                 .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
                 .build();

-            ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
+            ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, PLUGIN);
             TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_english");
             assertThat(tokenFilter, instanceOf(StemmerTokenFilterFactory.class));
             Tokenizer tokenizer = new WhitespaceTokenizer();
@@ -79,7 +86,7 @@ public class StemmerTokenFilterFactoryTests extends ESTokenStreamTestCase {
                 .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
                 .build();

-            ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
+            ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, PLUGIN);
             TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_porter2");
             assertThat(tokenFilter, instanceOf(StemmerTokenFilterFactory.class));
             Tokenizer tokenizer = new WhitespaceTokenizer();
@@ -392,3 +392,179 @@
 - match: { tokens.1.token: foob }
 - match: { tokens.2.token: fooba }
 - match: { tokens.3.token: foobar }
+
+---
+"kstem":
+    - do:
+        indices.create:
+          index: test
+          body:
+            settings:
+              analysis:
+                filter:
+                  my_kstem:
+                    type: kstem
+    - do:
+        indices.analyze:
+          index: test
+          body:
+            text: bricks
+            tokenizer: keyword
+            filter: [my_kstem]
+    - length: { tokens: 1 }
+    - match: { tokens.0.token: brick }
+
+    # use preconfigured token filter:
+    - do:
+        indices.analyze:
+          body:
+            text: bricks
+            tokenizer: keyword
+            filter: [kstem]
+    - length: { tokens: 1 }
+    - match: { tokens.0.token: brick }
+
+---
+"reverse":
+    - do:
+        indices.create:
+          index: test
+          body:
+            settings:
+              analysis:
+                filter:
+                  my_reverse:
+                    type: reverse
+    - do:
+        indices.analyze:
+          index: test
+          body:
+            text: foobar
+            tokenizer: keyword
+            filter: [my_reverse]
+    - length: { tokens: 1 }
+    - match: { tokens.0.token: raboof }
+
+    # use preconfigured token filter:
+    - do:
+        indices.analyze:
+          body:
+            text: foobar
+            tokenizer: keyword
+            filter: [reverse]
+    - length: { tokens: 1 }
+    - match: { tokens.0.token: raboof }
+
+---
+"elision":
+    - do:
+        indices.create:
+          index: test
+          body:
+            settings:
+              analysis:
+                filter:
+                  my_elision:
+                    type: elision
+                    articles: ["l", "m", "t", "qu", "n", "s", "j"]
+    - do:
+        indices.analyze:
+          index: test
+          body:
+            text: "l'avion"
+            tokenizer: keyword
+            filter: [my_elision]
+    - length: { tokens: 1 }
+    - match: { tokens.0.token: avion }
+
+---
+"stemmer":
+    - do:
+        indices.create:
+          index: test
+          body:
+            settings:
+              analysis:
+                filter:
+                  my_stemmer:
+                    type: stemmer
+                    language: dutch
+    - do:
+        indices.analyze:
+          index: test
+          body:
+            text: zoeken
+            tokenizer: keyword
+            filter: [my_stemmer]
+    - length: { tokens: 1 }
+    - match: { tokens.0.token: zoek }
+---
+"stemmer_override":
+    - do:
+        indices.create:
+          index: test
+          body:
+            settings:
+              analysis:
+                filter:
+                  my_stemmer:
+                    type: stemmer
+                    language: dutch
+                  my_stemmer_override:
+                    type: stemmer_override
+                    rules: ["zoeken => override"]
+    - do:
+        indices.analyze:
+          index: test
+          body:
+            text: zoeken
+            tokenizer: keyword
+            filter: [my_stemmer_override, my_stemmer]
+    - length: { tokens: 1 }
+    - match: { tokens.0.token: override }
+
+---
+"decompounder":
+    - do:
+        indices.create:
+          index: test
+          body:
+            settings:
+              analysis:
+                filter:
+                  my_decompounder:
+                    type: dictionary_decompounder
+                    word_list: [foo, bar]
+    - do:
+        indices.analyze:
+          index: test
+          body:
+            text: foobar
+            tokenizer: keyword
+            filter: [my_decompounder]
+    - length: { tokens: 3 }
+    - match: { tokens.0.token: foobar }
+    - match: { tokens.1.token: foo }
+    - match: { tokens.2.token: bar }
+
+---
+"truncate":
+    - do:
+        indices.create:
+          index: test
+          body:
+            settings:
+              analysis:
+                filter:
+                  my_truncate:
+                    type: truncate
+                    length: 3
+    - do:
+        indices.analyze:
+          index: test
+          body:
+            text: foobar
+            tokenizer: keyword
+            filter: [my_truncate]
+    - length: { tokens: 1 }
+    - match: { tokens.0.token: foo }
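The "# use preconfigured token filter" requests above call _analyze with bare names like kstem or reverse and no index at all; that works because ready-made instances are registered alongside the configurable factories. A hedged sketch of that second registration path (PreConfiguredTokenFilter is imported in the AnalysisModule diff above, but the method and arguments here are an assumption about this era of the codebase, not a quote from the commit):

import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.en.KStemFilter;
import org.apache.lucene.analysis.reverse.ReverseStringFilter;
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;

// Assumed sketch: pre-configured filters are addressable by bare name in
// _analyze and in analyzer definitions, with no per-index settings needed.
public class PreConfiguredSketch {
    public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
        List<PreConfiguredTokenFilter> filters = new ArrayList<>();
        // singleton(name, useFilterForMultitermQueries, create) wraps a plain
        // Lucene TokenFilter constructor as a shared, settings-free filter.
        filters.add(PreConfiguredTokenFilter.singleton("kstem", false, KStemFilter::new));
        filters.add(PreConfiguredTokenFilter.singleton("reverse", false, ReverseStringFilter::new));
        return filters;
    }
}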
@@ -19,6 +19,9 @@ setup:
             ngram:
                 tokenizer: standard
                 filter: [lowercase, ngram]
+            reverse:
+                tokenizer: standard
+                filter: [lowercase, reverse]
         filter:
             bigram:
                 type: shingle
@@ -43,6 +46,9 @@ setup:
                 ngram:
                     type: text
                     analyzer: ngram
+                reverse:
+                    type: text
+                    analyzer: reverse

 - do:
     bulk:
@@ -54,6 +60,40 @@ setup:
         { "body": "Xorr the God-Jewel" }
         { "index": {} }
         { "body": "Xorn" }
+        { "index": {} }
+        { "body": "Arthur, King of the Britons" }
+        { "index": {} }
+        { "body": "Sir Lancelot the Brave" }
+        { "index": {} }
+        { "body": "Patsy, Arthur's Servant" }
+        { "index": {} }
+        { "body": "Sir Robin the Not-Quite-So-Brave-as-Sir-Lancelot" }
+        { "index": {} }
+        { "body": "Sir Bedevere the Wise" }
+        { "index": {} }
+        { "body": "Sir Galahad the Pure" }
+        { "index": {} }
+        { "body": "Miss Islington, the Witch" }
+        { "index": {} }
+        { "body": "Zoot" }
+        { "index": {} }
+        { "body": "Leader of Robin's Minstrels" }
+        { "index": {} }
+        { "body": "Old Crone" }
+        { "index": {} }
+        { "body": "Frank, the Historian" }
+        { "index": {} }
+        { "body": "Frank's Wife" }
+        { "index": {} }
+        { "body": "Dr. Piglet" }
+        { "index": {} }
+        { "body": "Dr. Winston" }
+        { "index": {} }
+        { "body": "Sir Robin (Stand-in)" }
+        { "index": {} }
+        { "body": "Knight Who Says Ni" }
+        { "index": {} }
+        { "body": "Police sergeant who stops the film" }

---
"sorts by score":
@@ -156,3 +196,27 @@ setup:
         field: body.bigram
         analyzer: bigram
         force_unigrams: false
+
+---
+"reverse suggestions":
+    - do:
+        search:
+          size: 0
+          index: test
+          body:
+            suggest:
+              text: Artur, Ging of the Britons
+              test:
+                phrase:
+                  field: body.ngram
+                  force_unigrams: true
+                  max_errors: 0.5
+                  direct_generator:
+                    - field: body.reverse
+                      min_word_length: 1
+                      suggest_mode: always
+                      pre_filter: reverse
+                      post_filter: reverse
+
+    - match: {suggest.test.0.options.0.text: arthur king of the britons}
+
@@ -16,7 +16,7 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-package org.elasticsearch.index.analysis.filter1;
+package org.elasticsearch.index.analysis;

 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.TokenStream;
@@ -36,13 +36,11 @@ import org.elasticsearch.index.analysis.CzechStemTokenFilterFactory;
 import org.elasticsearch.index.analysis.DecimalDigitFilterFactory;
 import org.elasticsearch.index.analysis.DelimitedPayloadTokenFilterFactory;
 import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory;
-import org.elasticsearch.index.analysis.ElisionTokenFilterFactory;
 import org.elasticsearch.index.analysis.GermanNormalizationFilterFactory;
 import org.elasticsearch.index.analysis.GermanStemTokenFilterFactory;
 import org.elasticsearch.index.analysis.HindiNormalizationFilterFactory;
 import org.elasticsearch.index.analysis.HunspellTokenFilterFactory;
 import org.elasticsearch.index.analysis.IndicNormalizationFilterFactory;
-import org.elasticsearch.index.analysis.KStemTokenFilterFactory;
 import org.elasticsearch.index.analysis.KeepTypesFilterFactory;
 import org.elasticsearch.index.analysis.KeepWordFilterFactory;
 import org.elasticsearch.index.analysis.KeywordTokenizerFactory;
@@ -60,7 +58,6 @@ import org.elasticsearch.index.analysis.PersianNormalizationFilterFactory;
 import org.elasticsearch.index.analysis.PreConfiguredCharFilter;
 import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
 import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
-import org.elasticsearch.index.analysis.ReverseTokenFilterFactory;
 import org.elasticsearch.index.analysis.ScandinavianFoldingFilterFactory;
 import org.elasticsearch.index.analysis.ScandinavianNormalizationFilterFactory;
 import org.elasticsearch.index.analysis.SerbianNormalizationFilterFactory;
@@ -68,17 +65,12 @@ import org.elasticsearch.index.analysis.ShingleTokenFilterFactory;
 import org.elasticsearch.index.analysis.SoraniNormalizationFilterFactory;
 import org.elasticsearch.index.analysis.StandardTokenFilterFactory;
 import org.elasticsearch.index.analysis.StandardTokenizerFactory;
-import org.elasticsearch.index.analysis.StemmerOverrideTokenFilterFactory;
-import org.elasticsearch.index.analysis.StemmerTokenFilterFactory;
 import org.elasticsearch.index.analysis.StopTokenFilterFactory;
 import org.elasticsearch.index.analysis.SynonymGraphTokenFilterFactory;
 import org.elasticsearch.index.analysis.SynonymTokenFilterFactory;
 import org.elasticsearch.index.analysis.ThaiTokenizerFactory;
-import org.elasticsearch.index.analysis.TruncateTokenFilterFactory;
 import org.elasticsearch.index.analysis.UAX29URLEmailTokenizerFactory;
 import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory;
-import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory;
-import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory;
 import org.elasticsearch.plugins.AnalysisPlugin;
 import org.elasticsearch.test.ESTestCase;

@@ -147,7 +139,7 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
         .put("arabicstem", ArabicStemTokenFilterFactory.class)
         .put("asciifolding", MovedToAnalysisCommon.class)
         .put("brazilianstem", BrazilianStemTokenFilterFactory.class)
-        .put("bulgarianstem", StemmerTokenFilterFactory.class)
+        .put("bulgarianstem", MovedToAnalysisCommon.class)
         .put("cjkbigram", CJKBigramFilterFactory.class)
         .put("cjkwidth", CJKWidthFilterFactory.class)
         .put("classic", ClassicFilterFactory.class)
@@ -156,50 +148,50 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
         .put("czechstem", CzechStemTokenFilterFactory.class)
         .put("decimaldigit", DecimalDigitFilterFactory.class)
         .put("delimitedpayload", DelimitedPayloadTokenFilterFactory.class)
-        .put("dictionarycompoundword", DictionaryCompoundWordTokenFilterFactory.class)
+        .put("dictionarycompoundword", MovedToAnalysisCommon.class)
         .put("edgengram", MovedToAnalysisCommon.class)
-        .put("elision", ElisionTokenFilterFactory.class)
-        .put("englishminimalstem", StemmerTokenFilterFactory.class)
-        .put("englishpossessive", StemmerTokenFilterFactory.class)
-        .put("finnishlightstem", StemmerTokenFilterFactory.class)
-        .put("frenchlightstem", StemmerTokenFilterFactory.class)
-        .put("frenchminimalstem", StemmerTokenFilterFactory.class)
-        .put("galicianminimalstem", StemmerTokenFilterFactory.class)
-        .put("galicianstem", StemmerTokenFilterFactory.class)
+        .put("elision", MovedToAnalysisCommon.class)
+        .put("englishminimalstem", MovedToAnalysisCommon.class)
+        .put("englishpossessive", MovedToAnalysisCommon.class)
+        .put("finnishlightstem", MovedToAnalysisCommon.class)
+        .put("frenchlightstem", MovedToAnalysisCommon.class)
+        .put("frenchminimalstem", MovedToAnalysisCommon.class)
+        .put("galicianminimalstem", MovedToAnalysisCommon.class)
+        .put("galicianstem", MovedToAnalysisCommon.class)
         .put("germanstem", GermanStemTokenFilterFactory.class)
-        .put("germanlightstem", StemmerTokenFilterFactory.class)
-        .put("germanminimalstem", StemmerTokenFilterFactory.class)
+        .put("germanlightstem", MovedToAnalysisCommon.class)
+        .put("germanminimalstem", MovedToAnalysisCommon.class)
         .put("germannormalization", GermanNormalizationFilterFactory.class)
         .put("greeklowercase", MovedToAnalysisCommon.class)
-        .put("greekstem", StemmerTokenFilterFactory.class)
+        .put("greekstem", MovedToAnalysisCommon.class)
         .put("hindinormalization", HindiNormalizationFilterFactory.class)
-        .put("hindistem", StemmerTokenFilterFactory.class)
-        .put("hungarianlightstem", StemmerTokenFilterFactory.class)
+        .put("hindistem", MovedToAnalysisCommon.class)
+        .put("hungarianlightstem", MovedToAnalysisCommon.class)
         .put("hunspellstem", HunspellTokenFilterFactory.class)
-        .put("hyphenationcompoundword", HyphenationCompoundWordTokenFilterFactory.class)
+        .put("hyphenationcompoundword", MovedToAnalysisCommon.class)
         .put("indicnormalization", IndicNormalizationFilterFactory.class)
         .put("irishlowercase", MovedToAnalysisCommon.class)
-        .put("indonesianstem", StemmerTokenFilterFactory.class)
-        .put("italianlightstem", StemmerTokenFilterFactory.class)
+        .put("indonesianstem", MovedToAnalysisCommon.class)
+        .put("italianlightstem", MovedToAnalysisCommon.class)
         .put("keepword", KeepWordFilterFactory.class)
         .put("keywordmarker", MovedToAnalysisCommon.class)
-        .put("kstem", KStemTokenFilterFactory.class)
-        .put("latvianstem", StemmerTokenFilterFactory.class)
+        .put("kstem", MovedToAnalysisCommon.class)
+        .put("latvianstem", MovedToAnalysisCommon.class)
         .put("length", MovedToAnalysisCommon.class)
         .put("limittokencount", LimitTokenCountFilterFactory.class)
         .put("lowercase", MovedToAnalysisCommon.class)
         .put("ngram", MovedToAnalysisCommon.class)
-        .put("norwegianlightstem", StemmerTokenFilterFactory.class)
-        .put("norwegianminimalstem", StemmerTokenFilterFactory.class)
+        .put("norwegianlightstem", MovedToAnalysisCommon.class)
+        .put("norwegianminimalstem", MovedToAnalysisCommon.class)
         .put("patterncapturegroup", PatternCaptureGroupTokenFilterFactory.class)
         .put("patternreplace", PatternReplaceTokenFilterFactory.class)
         .put("persiannormalization", PersianNormalizationFilterFactory.class)
         .put("porterstem", MovedToAnalysisCommon.class)
-        .put("portuguesestem", StemmerTokenFilterFactory.class)
-        .put("portugueselightstem", StemmerTokenFilterFactory.class)
-        .put("portugueseminimalstem", StemmerTokenFilterFactory.class)
-        .put("reversestring", ReverseTokenFilterFactory.class)
-        .put("russianlightstem", StemmerTokenFilterFactory.class)
+        .put("portuguesestem", MovedToAnalysisCommon.class)
+        .put("portugueselightstem", MovedToAnalysisCommon.class)
+        .put("portugueseminimalstem", MovedToAnalysisCommon.class)
+        .put("reversestring", MovedToAnalysisCommon.class)
+        .put("russianlightstem", MovedToAnalysisCommon.class)
         .put("scandinavianfolding", ScandinavianFoldingFilterFactory.class)
         .put("scandinaviannormalization", ScandinavianNormalizationFilterFactory.class)
         .put("serbiannormalization", SerbianNormalizationFilterFactory.class)
@@ -207,16 +199,16 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
         .put("minhash", MinHashTokenFilterFactory.class)
         .put("snowballporter", MovedToAnalysisCommon.class)
         .put("soraninormalization", SoraniNormalizationFilterFactory.class)
-        .put("soranistem", StemmerTokenFilterFactory.class)
-        .put("spanishlightstem", StemmerTokenFilterFactory.class)
+        .put("soranistem", MovedToAnalysisCommon.class)
+        .put("spanishlightstem", MovedToAnalysisCommon.class)
         .put("standard", StandardTokenFilterFactory.class)
-        .put("stemmeroverride", StemmerOverrideTokenFilterFactory.class)
+        .put("stemmeroverride", MovedToAnalysisCommon.class)
         .put("stop", StopTokenFilterFactory.class)
-        .put("swedishlightstem", StemmerTokenFilterFactory.class)
+        .put("swedishlightstem", MovedToAnalysisCommon.class)
         .put("synonym", SynonymTokenFilterFactory.class)
         .put("synonymgraph", SynonymGraphTokenFilterFactory.class)
         .put("trim", MovedToAnalysisCommon.class)
-        .put("truncate", TruncateTokenFilterFactory.class)
+        .put("truncate", MovedToAnalysisCommon.class)
         .put("turkishlowercase", MovedToAnalysisCommon.class)
         .put("type", KeepTypesFilterFactory.class)
         .put("uppercase", MovedToAnalysisCommon.class)
@@ -0,0 +1,54 @@
+{
+    "index":{
+        "analysis":{
+            "tokenizer":{
+                "standard":{
+                    "type":"standard"
+                }
+            },
+            "filter":{
+                "stop":{
+                    "type":"stop",
+                    "stopwords":["test-stop"]
+                },
+                "stop2":{
+                    "type":"stop",
+                    "stopwords":["stop2-1", "stop2-2"]
+                },
+                "my":{
+                    "type":"myfilter"
+                },
+                "dict_dec":{
+                    "type":"dictionary_decompounder",
+                    "word_list":["donau", "dampf", "schiff", "spargel", "creme", "suppe"]
+                }
+            },
+            "analyzer":{
+                "standard":{
+                    "type":"standard",
+                    "stopwords":["test1", "test2", "test3"]
+                },
+                "custom1":{
+                    "tokenizer":"standard",
+                    "filter":["stop", "stop2"]
+                },
+                "custom4":{
+                    "tokenizer":"standard",
+                    "filter":["my"]
+                },
+                "custom6":{
+                    "tokenizer":"standard",
+                    "position_increment_gap": 256
+                },
+                "czechAnalyzerWithStemmer":{
+                    "tokenizer":"standard",
+                    "filter":["standard", "lowercase", "stop", "czech_stem"]
+                },
+                "decompoundingAnalyzer":{
+                    "tokenizer":"standard",
+                    "filter":["dict_dec"]
+                }
+            }
+        }
+    }
+}
@@ -0,0 +1,39 @@
+index :
+    analysis :
+        tokenizer :
+            standard :
+                type : standard
+        filter :
+            stop :
+                type : stop
+                stopwords : [test-stop]
+            stop2 :
+                type : stop
+                stopwords : [stop2-1, stop2-2]
+            my :
+                type : myfilter
+            dict_dec :
+                type : dictionary_decompounder
+                word_list : [donau, dampf, schiff, spargel, creme, suppe]
+        analyzer :
+            standard :
+                type : standard
+                stopwords : [test1, test2, test3]
+            custom1 :
+                tokenizer : standard
+                filter : [stop, stop2]
+            custom4 :
+                tokenizer : standard
+                filter : [my]
+            custom6 :
+                tokenizer : standard
+                position_increment_gap: 256
+            custom7 :
+                type : standard
+                version: 3.6
+            czechAnalyzerWithStemmer :
+                tokenizer : standard
+                filter : [standard, lowercase, stop, czech_stem]
+            decompoundingAnalyzer :
+                tokenizer : standard
+                filter : [dict_dec]