Move more token filters to analysis-common module

The following token filters were moved from core to the analysis-common module: stemmer, stemmer_override, kstem, dictionary_decompounder, hyphenation_decompounder, reverse, elision, and truncate.
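The move is transparent to users and to index configuration: every filter keeps its registration name and settings, it is simply contributed by the analysis-common module's CommonAnalysisPlugin instead of being wired up directly in AnalysisModule. As a rough sketch of the extension point involved (the plugin class name here is hypothetical; MyFilterTokenFilterFactory is the test fixture used elsewhere in this commit):

import java.util.HashMap;
import java.util.Map;
import org.elasticsearch.index.analysis.MyFilterTokenFilterFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;

public class MyAnalysisPlugin extends Plugin implements AnalysisPlugin {
    @Override
    public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
        // Names registered here become valid "type" values in index analysis
        // settings, exactly like the eight filters moved by this commit.
        Map<String, AnalysisProvider<TokenFilterFactory>> filters = new HashMap<>();
        filters.put("myfilter", MyFilterTokenFilterFactory::new);
        return filters;
    }
}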

Relates to #23658
Martijn van Groningen 2017-06-23 21:22:14 +02:00
parent 1583f81047
commit a34f5fa812
26 changed files with 475 additions and 155 deletions

View File

@@ -267,7 +267,6 @@
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]CustomAnalyzerProvider.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]ShingleTokenFilterFactory.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]StemmerOverrideTokenFilterFactory.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]compound[/\\]HyphenationCompoundWordTokenFilterFactory.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]cache[/\\]bitset[/\\]BitsetFilterCache.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]codec[/\\]PerFieldMappingPostingFormatCodec.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]engine[/\\]ElasticsearchConcurrentMergeScheduler.java" checks="LineLength" />

View File

@@ -17,7 +17,7 @@
* under the License.
*/
package org.elasticsearch.index.analysis.compound;
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.compound.CompoundWordTokenFilterBase;
@@ -38,7 +38,7 @@ public abstract class AbstractCompoundWordTokenFilterFactory extends AbstractTok
protected final boolean onlyLongestMatch;
protected final CharArraySet wordList;
public AbstractCompoundWordTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
protected AbstractCompoundWordTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);
minWordSize = settings.getAsInt("min_word_size", CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE);

View File

@@ -55,7 +55,6 @@ import org.elasticsearch.index.analysis.DelimitedPayloadTokenFilterFactory;
import org.elasticsearch.index.analysis.DutchAnalyzerProvider;
import org.elasticsearch.index.analysis.DutchStemTokenFilterFactory;
import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory;
import org.elasticsearch.index.analysis.ElisionTokenFilterFactory;
import org.elasticsearch.index.analysis.EnglishAnalyzerProvider;
import org.elasticsearch.index.analysis.FingerprintAnalyzerProvider;
import org.elasticsearch.index.analysis.FingerprintTokenFilterFactory;
@@ -75,7 +74,6 @@ import org.elasticsearch.index.analysis.IndicNormalizationFilterFactory;
import org.elasticsearch.index.analysis.IndonesianAnalyzerProvider;
import org.elasticsearch.index.analysis.IrishAnalyzerProvider;
import org.elasticsearch.index.analysis.ItalianAnalyzerProvider;
import org.elasticsearch.index.analysis.KStemTokenFilterFactory;
import org.elasticsearch.index.analysis.KeepTypesFilterFactory;
import org.elasticsearch.index.analysis.KeepWordFilterFactory;
import org.elasticsearch.index.analysis.KeywordAnalyzerProvider;
@@ -99,7 +97,6 @@ import org.elasticsearch.index.analysis.PortugueseAnalyzerProvider;
import org.elasticsearch.index.analysis.PreConfiguredCharFilter;
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
import org.elasticsearch.index.analysis.ReverseTokenFilterFactory;
import org.elasticsearch.index.analysis.RomanianAnalyzerProvider;
import org.elasticsearch.index.analysis.RussianAnalyzerProvider;
import org.elasticsearch.index.analysis.RussianStemTokenFilterFactory;
@@ -116,8 +113,6 @@ import org.elasticsearch.index.analysis.StandardAnalyzerProvider;
import org.elasticsearch.index.analysis.StandardHtmlStripAnalyzerProvider;
import org.elasticsearch.index.analysis.StandardTokenFilterFactory;
import org.elasticsearch.index.analysis.StandardTokenizerFactory;
import org.elasticsearch.index.analysis.StemmerOverrideTokenFilterFactory;
import org.elasticsearch.index.analysis.StemmerTokenFilterFactory;
import org.elasticsearch.index.analysis.StopAnalyzerProvider;
import org.elasticsearch.index.analysis.StopTokenFilterFactory;
import org.elasticsearch.index.analysis.SwedishAnalyzerProvider;
@@ -125,13 +120,10 @@ import org.elasticsearch.index.analysis.ThaiAnalyzerProvider;
import org.elasticsearch.index.analysis.ThaiTokenizerFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.index.analysis.TokenizerFactory;
import org.elasticsearch.index.analysis.TruncateTokenFilterFactory;
import org.elasticsearch.index.analysis.TurkishAnalyzerProvider;
import org.elasticsearch.index.analysis.UAX29URLEmailTokenizerFactory;
import org.elasticsearch.index.analysis.WhitespaceAnalyzerProvider;
import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory;
import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory;
import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory;
import org.elasticsearch.plugins.AnalysisPlugin;
import java.io.IOException;
@@ -201,23 +193,16 @@ public final class AnalysisModule {
hunspellService) {
NamedRegistry<AnalysisProvider<TokenFilterFactory>> tokenFilters = new NamedRegistry<>("token_filter");
tokenFilters.register("stop", StopTokenFilterFactory::new);
tokenFilters.register("reverse", ReverseTokenFilterFactory::new);
tokenFilters.register("kstem", KStemTokenFilterFactory::new);
tokenFilters.register("standard", StandardTokenFilterFactory::new);
tokenFilters.register("shingle", ShingleTokenFilterFactory::new);
tokenFilters.register("min_hash", MinHashTokenFilterFactory::new);
tokenFilters.register("truncate", requriesAnalysisSettings(TruncateTokenFilterFactory::new));
tokenFilters.register("limit", LimitTokenCountFilterFactory::new);
tokenFilters.register("common_grams", requriesAnalysisSettings(CommonGramsTokenFilterFactory::new));
tokenFilters.register("stemmer", StemmerTokenFilterFactory::new);
tokenFilters.register("delimited_payload_filter", DelimitedPayloadTokenFilterFactory::new);
tokenFilters.register("elision", ElisionTokenFilterFactory::new);
tokenFilters.register("keep", requriesAnalysisSettings(KeepWordFilterFactory::new));
tokenFilters.register("keep_types", requriesAnalysisSettings(KeepTypesFilterFactory::new));
tokenFilters.register("pattern_capture", requriesAnalysisSettings(PatternCaptureGroupTokenFilterFactory::new));
tokenFilters.register("pattern_replace", requriesAnalysisSettings(PatternReplaceTokenFilterFactory::new));
tokenFilters.register("dictionary_decompounder", requriesAnalysisSettings(DictionaryCompoundWordTokenFilterFactory::new));
tokenFilters.register("hyphenation_decompounder", requriesAnalysisSettings(HyphenationCompoundWordTokenFilterFactory::new));
tokenFilters.register("arabic_stem", ArabicStemTokenFilterFactory::new);
tokenFilters.register("brazilian_stem", BrazilianStemTokenFilterFactory::new);
tokenFilters.register("czech_stem", CzechStemTokenFilterFactory::new);
@@ -225,7 +210,6 @@ public final class AnalysisModule {
tokenFilters.register("french_stem", FrenchStemTokenFilterFactory::new);
tokenFilters.register("german_stem", GermanStemTokenFilterFactory::new);
tokenFilters.register("russian_stem", RussianStemTokenFilterFactory::new);
tokenFilters.register("stemmer_override", requriesAnalysisSettings(StemmerOverrideTokenFilterFactory::new));
tokenFilters.register("arabic_normalization", ArabicNormalizationFilterFactory::new);
tokenFilters.register("german_normalization", GermanNormalizationFilterFactory::new);
tokenFilters.register("hindi_normalization", HindiNormalizationFilterFactory::new);

View File

@@ -47,7 +47,7 @@ import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
import org.elasticsearch.index.analysis.StandardTokenizerFactory;
import org.elasticsearch.index.analysis.StopTokenFilterFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.index.analysis.filter1.MyFilterTokenFilterFactory;
import org.elasticsearch.index.analysis.MyFilterTokenFilterFactory;
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.test.ESTestCase;
@@ -196,18 +196,6 @@ public class AnalysisModuleTests extends ESTestCase {
// assertThat(czechstemmeranalyzer.tokenizerFactory(), instanceOf(StandardTokenizerFactory.class));
// assertThat(czechstemmeranalyzer.tokenFilters().length, equalTo(4));
// assertThat(czechstemmeranalyzer.tokenFilters()[3], instanceOf(CzechStemTokenFilterFactory.class));
//
// // check dictionary decompounder
// analyzer = analysisService.analyzer("decompoundingAnalyzer").analyzer();
// assertThat(analyzer, instanceOf(CustomAnalyzer.class));
// CustomAnalyzer dictionaryDecompounderAnalyze = (CustomAnalyzer) analyzer;
// assertThat(dictionaryDecompounderAnalyze.tokenizerFactory(), instanceOf(StandardTokenizerFactory.class));
// assertThat(dictionaryDecompounderAnalyze.tokenFilters().length, equalTo(1));
// assertThat(dictionaryDecompounderAnalyze.tokenFilters()[0], instanceOf(DictionaryCompoundWordTokenFilterFactory.class));
Set<?> wordList = Analysis.getWordSet(null, Version.CURRENT, settings, "index.analysis.filter.dict_dec.word_list");
MatcherAssert.assertThat(wordList.size(), equalTo(6));
// MatcherAssert.assertThat(wordList, hasItems("donau", "dampf", "schiff", "spargel", "creme", "suppe"));
}
public void testWordListPath() throws Exception {

View File

@@ -93,16 +93,16 @@ public class AnalyzeActionIT extends ESIntegTestCase {
assertThat(analyzeResponse.getTokens().size(), equalTo(1));
assertThat(analyzeResponse.getTokens().get(0).getTerm(), equalTo("this is a test"));
analyzeResponse = client().admin().indices().prepareAnalyze("THIS IS A TEST").setTokenizer("standard").addTokenFilter("lowercase").addTokenFilter("reverse").get();
analyzeResponse = client().admin().indices().prepareAnalyze("THIS IS A TEST").setTokenizer("standard").addTokenFilter("lowercase").get();
assertThat(analyzeResponse.getTokens().size(), equalTo(4));
AnalyzeResponse.AnalyzeToken token = analyzeResponse.getTokens().get(0);
assertThat(token.getTerm(), equalTo("siht"));
assertThat(token.getTerm(), equalTo("this"));
token = analyzeResponse.getTokens().get(1);
assertThat(token.getTerm(), equalTo("si"));
assertThat(token.getTerm(), equalTo("is"));
token = analyzeResponse.getTokens().get(2);
assertThat(token.getTerm(), equalTo("a"));
token = analyzeResponse.getTokens().get(3);
assertThat(token.getTerm(), equalTo("tset"));
assertThat(token.getTerm(), equalTo("test"));
analyzeResponse = client().admin().indices().prepareAnalyze("of course").setTokenizer("standard").addTokenFilter("stop").get();
assertThat(analyzeResponse.getTokens().size(), equalTo(1));
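With "reverse" gone from core, the integration test above keeps only the lowercase assertions; the reverse behavior itself is still covered by the analysis-common REST tests added later in this commit. Against a node that loads the module, a sketch of the original call would still pass:

AnalyzeResponse analyzeResponse = client().admin().indices()
        .prepareAnalyze("THIS IS A TEST")
        .setTokenizer("standard")
        .addTokenFilter("lowercase")
        .addTokenFilter("reverse") // now provided by analysis-common
        .get();
// every token comes back reversed, as the removed assertions expected:
assertThat(analyzeResponse.getTokens().get(0).getTerm(), equalTo("siht"));
assertThat(analyzeResponse.getTokens().get(3).getTerm(), equalTo("tset"));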

View File

@@ -445,8 +445,6 @@ public class SuggestSearchIT extends ESIntegTestCase {
public void testPrefixLength() throws IOException {
CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder()
.put(SETTING_NUMBER_OF_SHARDS, 1)
.put("index.analysis.analyzer.reverse.tokenizer", "standard")
.putArray("index.analysis.analyzer.reverse.filter", "lowercase", "reverse")
.put("index.analysis.analyzer.body.tokenizer", "standard")
.putArray("index.analysis.analyzer.body.filter", "lowercase")
.put("index.analysis.analyzer.bigram.tokenizer", "standard")
@@ -458,7 +456,6 @@ public class SuggestSearchIT extends ESIntegTestCase {
XContentBuilder mapping = XContentFactory.jsonBuilder().startObject().startObject("type1")
.startObject("properties")
.startObject("body").field("type", "text").field("analyzer", "body").endObject()
.startObject("body_reverse").field("type", "text").field("analyzer", "reverse").endObject()
.startObject("bigram").field("type", "text").field("analyzer", "bigram").endObject()
.endObject()
.endObject().endObject();
@@ -486,8 +483,6 @@ public class SuggestSearchIT extends ESIntegTestCase {
public void testBasicPhraseSuggest() throws IOException, URISyntaxException {
CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder()
.put(indexSettings())
.put("index.analysis.analyzer.reverse.tokenizer", "standard")
.putArray("index.analysis.analyzer.reverse.filter", "lowercase", "reverse")
.put("index.analysis.analyzer.body.tokenizer", "standard")
.putArray("index.analysis.analyzer.body.filter", "lowercase")
.put("index.analysis.analyzer.bigram.tokenizer", "standard")
@@ -503,10 +498,6 @@ public class SuggestSearchIT extends ESIntegTestCase {
field("type", "text").
field("analyzer", "body")
.endObject()
.startObject("body_reverse").
field("type", "text").
field("analyzer", "reverse")
.endObject()
.startObject("bigram").
field("type", "text").
field("analyzer", "bigram")
@@ -536,7 +527,7 @@ public class SuggestSearchIT extends ESIntegTestCase {
"Police sergeant who stops the film",
};
for (String line : strings) {
index("test", "type1", line, "body", line, "body_reverse", line, "bigram", line);
index("test", "type1", line, "body", line, "bigram", line);
}
refresh();
@@ -576,14 +567,6 @@ public class SuggestSearchIT extends ESIntegTestCase {
searchSuggest = searchSuggest( "Arthur, King of the Britons", "simple_phrase", phraseSuggest);
assertSuggestion(searchSuggest, 0, "simple_phrase", "arthur king of the britons");
//test reverse suggestions with pre & post filter
phraseSuggest
.addCandidateGenerator(candidateGenerator("body").minWordLength(1).suggestMode("always"))
.addCandidateGenerator(candidateGenerator("body_reverse").minWordLength(1).suggestMode("always").preFilter("reverse")
.postFilter("reverse"));
searchSuggest = searchSuggest( "Artur, Ging of the Britons", "simple_phrase", phraseSuggest);
assertSuggestion(searchSuggest, 0, "simple_phrase", "arthur king of the britons");
// set all mass to trigrams (not indexed)
phraseSuggest.clearCandidateGenerators()
.addCandidateGenerator(candidateGenerator("body").minWordLength(1).suggestMode("always"))
@@ -633,8 +616,6 @@ public class SuggestSearchIT extends ESIntegTestCase {
public void testSizeParam() throws IOException {
CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder()
.put(SETTING_NUMBER_OF_SHARDS, 1)
.put("index.analysis.analyzer.reverse.tokenizer", "standard")
.putArray("index.analysis.analyzer.reverse.filter", "lowercase", "reverse")
.put("index.analysis.analyzer.body.tokenizer", "standard")
.putArray("index.analysis.analyzer.body.filter", "lowercase")
.put("index.analysis.analyzer.bigram.tokenizer", "standard")
@@ -652,10 +633,6 @@ public class SuggestSearchIT extends ESIntegTestCase {
.field("type", "text")
.field("analyzer", "body")
.endObject()
.startObject("body_reverse")
.field("type", "text")
.field("analyzer", "reverse")
.endObject()
.startObject("bigram")
.field("type", "text")
.field("analyzer", "bigram")
@@ -667,9 +644,9 @@ public class SuggestSearchIT extends ESIntegTestCase {
ensureGreen();
String line = "xorr the god jewel";
index("test", "type1", "1", "body", line, "body_reverse", line, "bigram", line);
index("test", "type1", "1", "body", line, "bigram", line);
line = "I got it this time";
index("test", "type1", "2", "body", line, "body_reverse", line, "bigram", line);
index("test", "type1", "2", "body", line, "bigram", line);
refresh();
PhraseSuggestionBuilder phraseSuggestion = phraseSuggestion("bigram")

View File

@@ -17,10 +17,6 @@
},
"my":{
"type":"myfilter"
},
"dict_dec":{
"type":"dictionary_decompounder",
"word_list":["donau", "dampf", "schiff", "spargel", "creme", "suppe"]
}
},
"analyzer":{
@@ -43,10 +39,6 @@
"czechAnalyzerWithStemmer":{
"tokenizer":"standard",
"filter":["standard", "lowercase", "stop", "czech_stem"]
},
"decompoundingAnalyzer":{
"tokenizer":"standard",
"filter":["dict_dec"]
}
}
}

View File

@@ -12,9 +12,6 @@ index :
stopwords : [stop2-1, stop2-2]
my :
type : myfilter
dict_dec :
type : dictionary_decompounder
word_list : [donau, dampf, schiff, spargel, creme, suppe]
analyzer :
standard :
type : standard
@@ -34,6 +31,3 @@ index :
czechAnalyzerWithStemmer :
tokenizer : standard
filter : [standard, lowercase, stop, czech_stem]
decompoundingAnalyzer :
tokenizer : standard
filter : [dict_dec]

View File

@@ -107,6 +107,14 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
filters.put("ngram", NGramTokenFilterFactory::new);
filters.put("edgeNGram", EdgeNGramTokenFilterFactory::new);
filters.put("edge_ngram", EdgeNGramTokenFilterFactory::new);
filters.put("stemmer", StemmerTokenFilterFactory::new);
filters.put("stemmer_override", requriesAnalysisSettings(StemmerOverrideTokenFilterFactory::new));
filters.put("kstem", KStemTokenFilterFactory::new);
filters.put("dictionary_decompounder", requriesAnalysisSettings(DictionaryCompoundWordTokenFilterFactory::new));
filters.put("hyphenation_decompounder", requriesAnalysisSettings(HyphenationCompoundWordTokenFilterFactory::new));
filters.put("reverse", ReverseTokenFilterFactory::new);
filters.put("elision", ElisionTokenFilterFactory::new);
filters.put("truncate", requriesAnalysisSettings(TruncateTokenFilterFactory::new));
return filters;
}
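requriesAnalysisSettings (the misspelling is the method's actual name in the codebase) wraps a provider so the filter can only be built from explicit per-filter settings; that is why the settings-driven filters (stemmer_override, dictionary_decompounder, hyphenation_decompounder, truncate) are wrapped above while stemmer, kstem, reverse, and elision are not. A minimal sketch of the wrapper, mirroring the helper of the same name in AnalysisModule:

private static <T> AnalysisProvider<T> requriesAnalysisSettings(AnalysisProvider<T> provider) {
    return new AnalysisProvider<T>() {
        @Override
        public T get(IndexSettings indexSettings, Environment environment,
                String name, Settings settings) throws IOException {
            return provider.get(indexSettings, environment, name, settings);
        }
        @Override
        public boolean requiresAnalysisSettings() {
            return true; // refuse to build this filter without its own settings
        }
    };
}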

View File

@@ -17,7 +17,7 @@
* under the License.
*/
package org.elasticsearch.index.analysis.compound;
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
@@ -33,7 +33,7 @@ import org.elasticsearch.index.IndexSettings;
*/
public class DictionaryCompoundWordTokenFilterFactory extends AbstractCompoundWordTokenFilterFactory {
public DictionaryCompoundWordTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
DictionaryCompoundWordTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, env, name, settings);
}

View File

@@ -17,7 +17,7 @@
* under the License.
*/
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
@@ -25,12 +25,15 @@ import org.apache.lucene.analysis.util.ElisionFilter;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.Analysis;
import org.elasticsearch.index.analysis.MultiTermAwareComponent;
public class ElisionTokenFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {
private final CharArraySet articles;
public ElisionTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
ElisionTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);
this.articles = Analysis.parseArticles(env, indexSettings.getIndexVersionCreated(), settings);
}

View File

@@ -17,7 +17,7 @@
* under the License.
*/
package org.elasticsearch.index.analysis.compound;
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
@@ -27,6 +27,7 @@ import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.xml.sax.InputSource;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
@@ -39,7 +40,7 @@ public class HyphenationCompoundWordTokenFilterFactory extends AbstractCompoundW
private final HyphenationTree hyphenationTree;
public HyphenationCompoundWordTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
HyphenationCompoundWordTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, env, name, settings);
String hyphenationPatternsPath = settings.get("hyphenation_patterns_path", null);
@@ -50,7 +51,8 @@ public class HyphenationCompoundWordTokenFilterFactory extends AbstractCompoundW
Path hyphenationPatternsFile = env.configFile().resolve(hyphenationPatternsPath);
try {
hyphenationTree = HyphenationCompoundWordTokenFilter.getHyphenationTree(new InputSource(Files.newInputStream(hyphenationPatternsFile)));
InputStream in = Files.newInputStream(hyphenationPatternsFile);
hyphenationTree = HyphenationCompoundWordTokenFilter.getHyphenationTree(new InputSource(in));
} catch (Exception e) {
throw new IllegalArgumentException("Exception while reading hyphenation_patterns_path.", e);
}

View File

@@ -17,17 +17,18 @@
* under the License.
*/
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.en.KStemFilter;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
public class KStemTokenFilterFactory extends AbstractTokenFilterFactory {
public KStemTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
KStemTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings);
}

View File

@@ -17,17 +17,18 @@
* under the License.
*/
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.reverse.ReverseStringFilter;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
public class ReverseTokenFilterFactory extends AbstractTokenFilterFactory {
public ReverseTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
ReverseTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings);
}

View File

@@ -17,7 +17,7 @@
* under the License.
*/
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter;
@@ -26,6 +26,8 @@ import org.elasticsearch.common.Strings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.Analysis;
import java.io.IOException;
import java.util.List;
@@ -34,7 +36,7 @@ public class StemmerOverrideTokenFilterFactory extends AbstractTokenFilterFactor
private final StemmerOverrideMap overrideMap;
public StemmerOverrideTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) throws IOException {
StemmerOverrideTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) throws IOException {
super(indexSettings, name, settings);
List<String> rules = Analysis.getWordList(env, settings, "rules");
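The rules read here use the "token => replacement" syntax shown in the new REST test below; an overridden token is also marked as a keyword, so any stemmer later in the chain leaves it untouched. A minimal settings sketch (filter name hypothetical), following the pattern of this commit's tests:

Settings settings = Settings.builder()
        .put("index.analysis.filter.my_override.type", "stemmer_override")
        .putArray("index.analysis.filter.my_override.rules", "zoeken => override")
        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
        .build();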

View File

@@ -17,7 +17,7 @@
* under the License.
*/
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ar.ArabicStemFilter;
@@ -57,6 +57,7 @@ import org.elasticsearch.common.Strings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.tartarus.snowball.ext.ArmenianStemmer;
import org.tartarus.snowball.ext.BasqueStemmer;
import org.tartarus.snowball.ext.CatalanStemmer;
@@ -86,7 +87,7 @@ public class StemmerTokenFilterFactory extends AbstractTokenFilterFactory {
private String language;
public StemmerTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
StemmerTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings);
this.language = Strings.capitalize(settings.get("language", settings.get("name", "porter")));
}

View File

@@ -17,19 +17,20 @@
* under the License.
*/
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
public class TruncateTokenFilterFactory extends AbstractTokenFilterFactory {
private final int length;
public TruncateTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
TruncateTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings);
this.length = settings.getAsInt("length", -1);
if (length <= 0) {

View File

@@ -26,6 +26,7 @@ import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory;
import org.apache.lucene.analysis.reverse.ReverseStringFilterFactory;
import org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory;
import org.elasticsearch.index.analysis.HtmlStripCharFilterFactory;
import org.elasticsearch.index.analysis.SynonymTokenFilterFactory;
import org.elasticsearch.indices.analysis.AnalysisFactoryTestCase;
import java.util.List;
@@ -67,6 +68,39 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
filters.put("uppercase", UpperCaseTokenFilterFactory.class);
filters.put("ngram", NGramTokenFilterFactory.class);
filters.put("edgengram", EdgeNGramTokenFilterFactory.class);
filters.put("bulgarianstem", StemmerTokenFilterFactory.class);
filters.put("englishminimalstem", StemmerTokenFilterFactory.class);
filters.put("englishpossessive", StemmerTokenFilterFactory.class);
filters.put("finnishlightstem", StemmerTokenFilterFactory.class);
filters.put("frenchlightstem", StemmerTokenFilterFactory.class);
filters.put("frenchminimalstem", StemmerTokenFilterFactory.class);
filters.put("galicianminimalstem", StemmerTokenFilterFactory.class);
filters.put("galicianstem", StemmerTokenFilterFactory.class);
filters.put("germanlightstem", StemmerTokenFilterFactory.class);
filters.put("germanminimalstem", StemmerTokenFilterFactory.class);
filters.put("greekstem", StemmerTokenFilterFactory.class);
filters.put("hindistem", StemmerTokenFilterFactory.class);
filters.put("hungarianlightstem", StemmerTokenFilterFactory.class);
filters.put("indonesianstem", StemmerTokenFilterFactory.class);
filters.put("italianlightstem", StemmerTokenFilterFactory.class);
filters.put("latvianstem", StemmerTokenFilterFactory.class);
filters.put("norwegianlightstem", StemmerTokenFilterFactory.class);
filters.put("norwegianminimalstem", StemmerTokenFilterFactory.class);
filters.put("portuguesestem", StemmerTokenFilterFactory.class);
filters.put("portugueselightstem", StemmerTokenFilterFactory.class);
filters.put("portugueseminimalstem", StemmerTokenFilterFactory.class);
filters.put("russianlightstem", StemmerTokenFilterFactory.class);
filters.put("soranistem", StemmerTokenFilterFactory.class);
filters.put("spanishlightstem", StemmerTokenFilterFactory.class);
filters.put("swedishlightstem", StemmerTokenFilterFactory.class);
filters.put("stemmeroverride", StemmerOverrideTokenFilterFactory.class);
filters.put("kstem", KStemTokenFilterFactory.class);
filters.put("synonym", SynonymTokenFilterFactory.class);
filters.put("dictionarycompoundword", DictionaryCompoundWordTokenFilterFactory.class);
filters.put("hyphenationcompoundword", HyphenationCompoundWordTokenFilterFactory.class);
filters.put("reversestring", ReverseTokenFilterFactory.class);
filters.put("elision", ElisionTokenFilterFactory.class);
filters.put("truncate", TruncateTokenFilterFactory.class);
return filters;
}

View File

@@ -17,7 +17,7 @@
* under the License.
*/
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
@@ -29,8 +29,9 @@ import org.elasticsearch.common.lucene.all.AllTokenStream;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory;
import org.elasticsearch.index.analysis.filter1.MyFilterTokenFilterFactory;
import org.elasticsearch.index.analysis.IndexAnalyzers;
import org.elasticsearch.index.analysis.MyFilterTokenFilterFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.indices.analysis.AnalysisModule;
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
import org.elasticsearch.plugins.AnalysisPlugin;
@@ -40,10 +41,10 @@ import org.hamcrest.MatcherAssert;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import static java.util.Collections.singletonList;
import static java.util.Collections.singletonMap;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.hasItems;
@@ -53,12 +54,7 @@ public class CompoundAnalysisTests extends ESTestCase {
public void testDefaultsCompoundAnalysis() throws Exception {
Settings settings = getJsonSettings();
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings);
AnalysisModule analysisModule = new AnalysisModule(new Environment(settings), singletonList(new AnalysisPlugin() {
@Override
public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
return singletonMap("myfilter", MyFilterTokenFilterFactory::new);
}
}));
AnalysisModule analysisModule = createAnalysisModule(settings);
TokenFilterFactory filterFactory = analysisModule.getAnalysisRegistry().buildTokenFilterFactories(idxSettings).get("dict_dec");
MatcherAssert.assertThat(filterFactory, instanceOf(DictionaryCompoundWordTokenFilterFactory.class));
}
@@ -75,12 +71,7 @@
private List<String> analyze(Settings settings, String analyzerName, String text) throws IOException {
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings);
AnalysisModule analysisModule = new AnalysisModule(new Environment(settings), singletonList(new AnalysisPlugin() {
@Override
public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
return singletonMap("myfilter", MyFilterTokenFilterFactory::new);
}
}));
AnalysisModule analysisModule = createAnalysisModule(settings);
IndexAnalyzers indexAnalyzers = analysisModule.getAnalysisRegistry().build(idxSettings);
Analyzer analyzer = indexAnalyzers.get(analyzerName).analyzer();
@@ -99,8 +90,18 @@
return terms;
}
private AnalysisModule createAnalysisModule(Settings settings) throws IOException {
CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin();
return new AnalysisModule(new Environment(settings), Arrays.asList(commonAnalysisPlugin, new AnalysisPlugin() {
@Override
public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
return singletonMap("myfilter", MyFilterTokenFilterFactory::new);
}
}));
}
private Settings getJsonSettings() throws IOException {
String json = "/org/elasticsearch/index/analysis/test1.json";
String json = "/org/elasticsearch/analysis/common/test1.json";
return Settings.builder()
.loadFromStream(json, getClass().getResourceAsStream(json))
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
@@ -109,7 +110,7 @@
}
private Settings getYamlSettings() throws IOException {
String yaml = "/org/elasticsearch/index/analysis/test1.yml";
String yaml = "/org/elasticsearch/analysis/common/test1.yml";
return Settings.builder()
.loadFromStream(yaml, getClass().getResourceAsStream(yaml))
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)

View File

@@ -16,7 +16,7 @@
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
@@ -26,6 +26,10 @@ import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.elasticsearch.Version;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.analysis.AnalysisTestsHelper;
import org.elasticsearch.index.analysis.IndexAnalyzers;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.ESTokenStreamTestCase;
import org.elasticsearch.test.VersionUtils;
@@ -38,6 +42,9 @@ import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_VERSION_C
import static org.hamcrest.Matchers.instanceOf;
public class StemmerTokenFilterFactoryTests extends ESTokenStreamTestCase {
private static final CommonAnalysisPlugin PLUGIN = new CommonAnalysisPlugin();
public void testEnglishFilterFactory() throws IOException {
int iters = scaledRandomIntBetween(20, 100);
for (int i = 0; i < iters; i++) {
@@ -51,7 +58,7 @@ public class StemmerTokenFilterFactoryTests extends ESTokenStreamTestCase {
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.build();
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, PLUGIN);
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_english");
assertThat(tokenFilter, instanceOf(StemmerTokenFilterFactory.class));
Tokenizer tokenizer = new WhitespaceTokenizer();
@@ -79,7 +86,7 @@ public class StemmerTokenFilterFactoryTests extends ESTokenStreamTestCase {
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.build();
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, PLUGIN);
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_porter2");
assertThat(tokenFilter, instanceOf(StemmerTokenFilterFactory.class));
Tokenizer tokenizer = new WhitespaceTokenizer();

View File

@@ -392,3 +392,179 @@
- match: { tokens.1.token: foob }
- match: { tokens.2.token: fooba }
- match: { tokens.3.token: foobar }
---
"kstem":
- do:
indices.create:
index: test
body:
settings:
analysis:
filter:
my_kstem:
type: kstem
- do:
indices.analyze:
index: test
body:
text: bricks
tokenizer: keyword
filter: [my_kstem]
- length: { tokens: 1 }
- match: { tokens.0.token: brick }
# use preconfigured token filter:
- do:
indices.analyze:
body:
text: bricks
tokenizer: keyword
filter: [kstem]
- length: { tokens: 1 }
- match: { tokens.0.token: brick }
---
"reverse":
- do:
indices.create:
index: test
body:
settings:
analysis:
filter:
my_reverse:
type: reverse
- do:
indices.analyze:
index: test
body:
text: foobar
tokenizer: keyword
filter: [my_reverse]
- length: { tokens: 1 }
- match: { tokens.0.token: raboof }
# use preconfigured token filter:
- do:
indices.analyze:
body:
text: foobar
tokenizer: keyword
filter: [reverse]
- length: { tokens: 1 }
- match: { tokens.0.token: raboof }
---
"elision":
- do:
indices.create:
index: test
body:
settings:
analysis:
filter:
my_elision:
type: elision
articles: ["l", "m", "t", "qu", "n", "s", "j"]
- do:
indices.analyze:
index: test
body:
text: "l'avion"
tokenizer: keyword
filter: [my_elision]
- length: { tokens: 1 }
- match: { tokens.0.token: avion }
---
"stemmer":
- do:
indices.create:
index: test
body:
settings:
analysis:
filter:
my_stemmer:
type: stemmer
language: dutch
- do:
indices.analyze:
index: test
body:
text: zoeken
tokenizer: keyword
filter: [my_stemmer]
- length: { tokens: 1 }
- match: { tokens.0.token: zoek }
---
"stemmer_override":
- do:
indices.create:
index: test
body:
settings:
analysis:
filter:
my_stemmer:
type: stemmer
language: dutch
my_stemmer_override:
type: stemmer_override
rules: ["zoeken => override"]
- do:
indices.analyze:
index: test
body:
text: zoeken
tokenizer: keyword
filter: [my_stemmer_override, my_stemmer]
- length: { tokens: 1 }
- match: { tokens.0.token: override }
---
"decompounder":
- do:
indices.create:
index: test
body:
settings:
analysis:
filter:
my_decompounder:
type: dictionary_decompounder
word_list: [foo, bar]
- do:
indices.analyze:
index: test
body:
text: foobar
tokenizer: keyword
filter: [my_decompounder]
- length: { tokens: 3 }
- match: { tokens.0.token: foobar }
- match: { tokens.1.token: foo }
- match: { tokens.2.token: bar }
---
"truncate":
- do:
indices.create:
index: test
body:
settings:
analysis:
filter:
my_truncate:
type: truncate
length: 3
- do:
indices.analyze:
index: test
body:
text: foobar
tokenizer: keyword
filter: [my_truncate]
- length: { tokens: 1 }
- match: { tokens.0.token: foo }
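The same checks can be driven through the Java API as well; for instance, for the custom truncate filter defined in the last test above (a sketch, assuming the "test" index from the YAML setup exists):

AnalyzeResponse response = client().admin().indices()
        .prepareAnalyze("test", "foobar") // analyze against the index so my_truncate resolves
        .setTokenizer("keyword")
        .addTokenFilter("my_truncate")
        .get();
assertThat(response.getTokens().size(), equalTo(1));
assertThat(response.getTokens().get(0).getTerm(), equalTo("foo")); // truncated to length 3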

View File

@@ -19,6 +19,9 @@ setup:
ngram:
tokenizer: standard
filter: [lowercase, ngram]
reverse:
tokenizer: standard
filter: [lowercase, reverse]
filter:
bigram:
type: shingle
@@ -43,6 +46,9 @@ setup:
ngram:
type: text
analyzer: ngram
reverse:
type: text
analyzer: reverse
- do:
bulk:
@@ -54,6 +60,40 @@ setup:
{ "body": "Xorr the God-Jewel" }
{ "index": {} }
{ "body": "Xorn" }
{ "index": {} }
{ "body": "Arthur, King of the Britons" }
{ "index": {} }
{ "body": "Sir Lancelot the Brave" }
{ "index": {} }
{ "body": "Patsy, Arthur's Servant" }
{ "index": {} }
{ "body": "Sir Robin the Not-Quite-So-Brave-as-Sir-Lancelot" }
{ "index": {} }
{ "body": "Sir Bedevere the Wise" }
{ "index": {} }
{ "body": "Sir Galahad the Pure" }
{ "index": {} }
{ "body": "Miss Islington, the Witch" }
{ "index": {} }
{ "body": "Zoot" }
{ "index": {} }
{ "body": "Leader of Robin's Minstrels" }
{ "index": {} }
{ "body": "Old Crone" }
{ "index": {} }
{ "body": "Frank, the Historian" }
{ "index": {} }
{ "body": "Frank's Wife" }
{ "index": {} }
{ "body": "Dr. Piglet" }
{ "index": {} }
{ "body": "Dr. Winston" }
{ "index": {} }
{ "body": "Sir Robin (Stand-in)" }
{ "index": {} }
{ "body": "Knight Who Says Ni" }
{ "index": {} }
{ "body": "Police sergeant who stops the film" }
---
"sorts by score":
@@ -156,3 +196,27 @@ setup:
field: body.bigram
analyzer: bigram
force_unigrams: false
---
"reverse suggestions":
- do:
search:
size: 0
index: test
body:
suggest:
text: Artur, Ging of the Britons
test:
phrase:
field: body.ngram
force_unigrams: true
max_errors: 0.5
direct_generator:
- field: body.reverse
min_word_length: 1
suggest_mode: always
pre_filter: reverse
post_filter: reverse
- match: {suggest.test.0.options.0.text: arthur king of the britons}
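This YAML test replaces the reverse pre/post filter scenario deleted from SuggestSearchIT above: reversing both the indexed tokens and the input terms lets the direct generator correct errors at the beginning of a word, which forward candidate generation handles poorly. In Java API terms the generator pair looked like this (candidateGenerator is the SuggestSearchIT helper):

phraseSuggest
        .addCandidateGenerator(candidateGenerator("body").minWordLength(1).suggestMode("always"))
        .addCandidateGenerator(candidateGenerator("body_reverse").minWordLength(1).suggestMode("always")
                .preFilter("reverse")    // reverse the misspelled terms before candidate lookup
                .postFilter("reverse")); // reverse the candidates back before scoring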

View File

@@ -16,7 +16,7 @@
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis.filter1;
package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;

View File

@@ -36,13 +36,11 @@ import org.elasticsearch.index.analysis.CzechStemTokenFilterFactory;
import org.elasticsearch.index.analysis.DecimalDigitFilterFactory;
import org.elasticsearch.index.analysis.DelimitedPayloadTokenFilterFactory;
import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory;
import org.elasticsearch.index.analysis.ElisionTokenFilterFactory;
import org.elasticsearch.index.analysis.GermanNormalizationFilterFactory;
import org.elasticsearch.index.analysis.GermanStemTokenFilterFactory;
import org.elasticsearch.index.analysis.HindiNormalizationFilterFactory;
import org.elasticsearch.index.analysis.HunspellTokenFilterFactory;
import org.elasticsearch.index.analysis.IndicNormalizationFilterFactory;
import org.elasticsearch.index.analysis.KStemTokenFilterFactory;
import org.elasticsearch.index.analysis.KeepTypesFilterFactory;
import org.elasticsearch.index.analysis.KeepWordFilterFactory;
import org.elasticsearch.index.analysis.KeywordTokenizerFactory;
@@ -60,7 +58,6 @@ import org.elasticsearch.index.analysis.PersianNormalizationFilterFactory;
import org.elasticsearch.index.analysis.PreConfiguredCharFilter;
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
import org.elasticsearch.index.analysis.ReverseTokenFilterFactory;
import org.elasticsearch.index.analysis.ScandinavianFoldingFilterFactory;
import org.elasticsearch.index.analysis.ScandinavianNormalizationFilterFactory;
import org.elasticsearch.index.analysis.SerbianNormalizationFilterFactory;
@@ -68,17 +65,12 @@ import org.elasticsearch.index.analysis.ShingleTokenFilterFactory;
import org.elasticsearch.index.analysis.SoraniNormalizationFilterFactory;
import org.elasticsearch.index.analysis.StandardTokenFilterFactory;
import org.elasticsearch.index.analysis.StandardTokenizerFactory;
import org.elasticsearch.index.analysis.StemmerOverrideTokenFilterFactory;
import org.elasticsearch.index.analysis.StemmerTokenFilterFactory;
import org.elasticsearch.index.analysis.StopTokenFilterFactory;
import org.elasticsearch.index.analysis.SynonymGraphTokenFilterFactory;
import org.elasticsearch.index.analysis.SynonymTokenFilterFactory;
import org.elasticsearch.index.analysis.ThaiTokenizerFactory;
import org.elasticsearch.index.analysis.TruncateTokenFilterFactory;
import org.elasticsearch.index.analysis.UAX29URLEmailTokenizerFactory;
import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory;
import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory;
import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.test.ESTestCase;
@@ -147,7 +139,7 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
.put("arabicstem", ArabicStemTokenFilterFactory.class)
.put("asciifolding", MovedToAnalysisCommon.class)
.put("brazilianstem", BrazilianStemTokenFilterFactory.class)
.put("bulgarianstem", StemmerTokenFilterFactory.class)
.put("bulgarianstem", MovedToAnalysisCommon.class)
.put("cjkbigram", CJKBigramFilterFactory.class)
.put("cjkwidth", CJKWidthFilterFactory.class)
.put("classic", ClassicFilterFactory.class)
@@ -156,50 +148,50 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
.put("czechstem", CzechStemTokenFilterFactory.class)
.put("decimaldigit", DecimalDigitFilterFactory.class)
.put("delimitedpayload", DelimitedPayloadTokenFilterFactory.class)
.put("dictionarycompoundword", DictionaryCompoundWordTokenFilterFactory.class)
.put("dictionarycompoundword", MovedToAnalysisCommon.class)
.put("edgengram", MovedToAnalysisCommon.class)
.put("elision", ElisionTokenFilterFactory.class)
.put("englishminimalstem", StemmerTokenFilterFactory.class)
.put("englishpossessive", StemmerTokenFilterFactory.class)
.put("finnishlightstem", StemmerTokenFilterFactory.class)
.put("frenchlightstem", StemmerTokenFilterFactory.class)
.put("frenchminimalstem", StemmerTokenFilterFactory.class)
.put("galicianminimalstem", StemmerTokenFilterFactory.class)
.put("galicianstem", StemmerTokenFilterFactory.class)
.put("elision", MovedToAnalysisCommon.class)
.put("englishminimalstem", MovedToAnalysisCommon.class)
.put("englishpossessive", MovedToAnalysisCommon.class)
.put("finnishlightstem", MovedToAnalysisCommon.class)
.put("frenchlightstem", MovedToAnalysisCommon.class)
.put("frenchminimalstem", MovedToAnalysisCommon.class)
.put("galicianminimalstem", MovedToAnalysisCommon.class)
.put("galicianstem", MovedToAnalysisCommon.class)
.put("germanstem", GermanStemTokenFilterFactory.class)
.put("germanlightstem", StemmerTokenFilterFactory.class)
.put("germanminimalstem", StemmerTokenFilterFactory.class)
.put("germanlightstem", MovedToAnalysisCommon.class)
.put("germanminimalstem", MovedToAnalysisCommon.class)
.put("germannormalization", GermanNormalizationFilterFactory.class)
.put("greeklowercase", MovedToAnalysisCommon.class)
.put("greekstem", StemmerTokenFilterFactory.class)
.put("greekstem", MovedToAnalysisCommon.class)
.put("hindinormalization", HindiNormalizationFilterFactory.class)
.put("hindistem", StemmerTokenFilterFactory.class)
.put("hungarianlightstem", StemmerTokenFilterFactory.class)
.put("hindistem", MovedToAnalysisCommon.class)
.put("hungarianlightstem", MovedToAnalysisCommon.class)
.put("hunspellstem", HunspellTokenFilterFactory.class)
.put("hyphenationcompoundword", HyphenationCompoundWordTokenFilterFactory.class)
.put("hyphenationcompoundword", MovedToAnalysisCommon.class)
.put("indicnormalization", IndicNormalizationFilterFactory.class)
.put("irishlowercase", MovedToAnalysisCommon.class)
.put("indonesianstem", StemmerTokenFilterFactory.class)
.put("italianlightstem", StemmerTokenFilterFactory.class)
.put("indonesianstem", MovedToAnalysisCommon.class)
.put("italianlightstem", MovedToAnalysisCommon.class)
.put("keepword", KeepWordFilterFactory.class)
.put("keywordmarker", MovedToAnalysisCommon.class)
.put("kstem", KStemTokenFilterFactory.class)
.put("latvianstem", StemmerTokenFilterFactory.class)
.put("kstem", MovedToAnalysisCommon.class)
.put("latvianstem", MovedToAnalysisCommon.class)
.put("length", MovedToAnalysisCommon.class)
.put("limittokencount", LimitTokenCountFilterFactory.class)
.put("lowercase", MovedToAnalysisCommon.class)
.put("ngram", MovedToAnalysisCommon.class)
.put("norwegianlightstem", StemmerTokenFilterFactory.class)
.put("norwegianminimalstem", StemmerTokenFilterFactory.class)
.put("norwegianlightstem", MovedToAnalysisCommon.class)
.put("norwegianminimalstem", MovedToAnalysisCommon.class)
.put("patterncapturegroup", PatternCaptureGroupTokenFilterFactory.class)
.put("patternreplace", PatternReplaceTokenFilterFactory.class)
.put("persiannormalization", PersianNormalizationFilterFactory.class)
.put("porterstem", MovedToAnalysisCommon.class)
.put("portuguesestem", StemmerTokenFilterFactory.class)
.put("portugueselightstem", StemmerTokenFilterFactory.class)
.put("portugueseminimalstem", StemmerTokenFilterFactory.class)
.put("reversestring", ReverseTokenFilterFactory.class)
.put("russianlightstem", StemmerTokenFilterFactory.class)
.put("portuguesestem", MovedToAnalysisCommon.class)
.put("portugueselightstem", MovedToAnalysisCommon.class)
.put("portugueseminimalstem", MovedToAnalysisCommon.class)
.put("reversestring", MovedToAnalysisCommon.class)
.put("russianlightstem", MovedToAnalysisCommon.class)
.put("scandinavianfolding", ScandinavianFoldingFilterFactory.class)
.put("scandinaviannormalization", ScandinavianNormalizationFilterFactory.class)
.put("serbiannormalization", SerbianNormalizationFilterFactory.class)
@@ -207,16 +199,16 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
.put("minhash", MinHashTokenFilterFactory.class)
.put("snowballporter", MovedToAnalysisCommon.class)
.put("soraninormalization", SoraniNormalizationFilterFactory.class)
.put("soranistem", StemmerTokenFilterFactory.class)
.put("spanishlightstem", StemmerTokenFilterFactory.class)
.put("soranistem", MovedToAnalysisCommon.class)
.put("spanishlightstem", MovedToAnalysisCommon.class)
.put("standard", StandardTokenFilterFactory.class)
.put("stemmeroverride", StemmerOverrideTokenFilterFactory.class)
.put("stemmeroverride", MovedToAnalysisCommon.class)
.put("stop", StopTokenFilterFactory.class)
.put("swedishlightstem", StemmerTokenFilterFactory.class)
.put("swedishlightstem", MovedToAnalysisCommon.class)
.put("synonym", SynonymTokenFilterFactory.class)
.put("synonymgraph", SynonymGraphTokenFilterFactory.class)
.put("trim", MovedToAnalysisCommon.class)
.put("truncate", TruncateTokenFilterFactory.class)
.put("truncate", MovedToAnalysisCommon.class)
.put("turkishlowercase", MovedToAnalysisCommon.class)
.put("type", KeepTypesFilterFactory.class)
.put("uppercase", MovedToAnalysisCommon.class)

View File

@@ -0,0 +1,54 @@
{
"index":{
"analysis":{
"tokenizer":{
"standard":{
"type":"standard"
}
},
"filter":{
"stop":{
"type":"stop",
"stopwords":["test-stop"]
},
"stop2":{
"type":"stop",
"stopwords":["stop2-1", "stop2-2"]
},
"my":{
"type":"myfilter"
},
"dict_dec":{
"type":"dictionary_decompounder",
"word_list":["donau", "dampf", "schiff", "spargel", "creme", "suppe"]
}
},
"analyzer":{
"standard":{
"type":"standard",
"stopwords":["test1", "test2", "test3"]
},
"custom1":{
"tokenizer":"standard",
"filter":["stop", "stop2"]
},
"custom4":{
"tokenizer":"standard",
"filter":["my"]
},
"custom6":{
"tokenizer":"standard",
"position_increment_gap": 256
},
"czechAnalyzerWithStemmer":{
"tokenizer":"standard",
"filter":["standard", "lowercase", "stop", "czech_stem"]
},
"decompoundingAnalyzer":{
"tokenizer":"standard",
"filter":["dict_dec"]
}
}
}
}
}

View File

@@ -0,0 +1,39 @@
index :
analysis :
tokenizer :
standard :
type : standard
filter :
stop :
type : stop
stopwords : [test-stop]
stop2 :
type : stop
stopwords : [stop2-1, stop2-2]
my :
type : myfilter
dict_dec :
type : dictionary_decompounder
word_list : [donau, dampf, schiff, spargel, creme, suppe]
analyzer :
standard :
type : standard
stopwords : [test1, test2, test3]
custom1 :
tokenizer : standard
filter : [stop, stop2]
custom4 :
tokenizer : standard
filter : [my]
custom6 :
tokenizer : standard
position_increment_gap: 256
custom7 :
type : standard
version: 3.6
czechAnalyzerWithStemmer :
tokenizer : standard
filter : [standard, lowercase, stop, czech_stem]
decompoundingAnalyzer :
tokenizer : standard
filter : [dict_dec]