Moved the `keyword_marker`, `trim`, `snowball`, and `porter_stem` token filter factories from core to the analysis-common module.

Relates to #23658
Martijn van Groningen 2017-05-30 00:54:46 +02:00
parent a089dc9dcd
commit 258be2b135
11 changed files with 101 additions and 54 deletions
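
In outline, the commit swaps hard-coded registrations in AnalysisModule for plugin-provided ones: CommonAnalysisPlugin now contributes the four factories through AnalysisPlugin#getTokenFilters. Below is a minimal sketch of that extension point, condensed from the diffs that follow. The MyAnalysisPlugin name is hypothetical, the factory classes are assumed importable as in the diffs, and the sketch assumes the oddly spelled requriesAnalysisSettings helper is the static method on AnalysisPlugin that the unqualified call in CommonAnalysisPlugin suggests:

    import java.util.Map;
    import java.util.TreeMap;

    import org.elasticsearch.index.analysis.TokenFilterFactory;
    import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
    import org.elasticsearch.plugins.AnalysisPlugin;
    import org.elasticsearch.plugins.Plugin;

    import static org.elasticsearch.plugins.AnalysisPlugin.requriesAnalysisSettings;

    // Hypothetical plugin illustrating the registration pattern used by
    // CommonAnalysisPlugin in this commit.
    public class MyAnalysisPlugin extends Plugin implements AnalysisPlugin {
        @Override
        public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
            Map<String, AnalysisProvider<TokenFilterFactory>> filters = new TreeMap<>();
            // A factory that needs no settings registers as a bare constructor reference.
            filters.put("trim", TrimTokenFilterFactory::new);
            // A factory that reads settings is wrapped so that using the filter
            // name without any configuration fails up front.
            filters.put("keyword_marker", requriesAnalysisSettings(KeywordMarkerTokenFilterFactory::new));
            return filters;
        }
    }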

File: AnalysisModule.java (core)

@@ -81,7 +81,6 @@ import org.elasticsearch.index.analysis.KStemTokenFilterFactory;
import org.elasticsearch.index.analysis.KeepTypesFilterFactory;
import org.elasticsearch.index.analysis.KeepWordFilterFactory;
import org.elasticsearch.index.analysis.KeywordAnalyzerProvider;
-import org.elasticsearch.index.analysis.KeywordMarkerTokenFilterFactory;
import org.elasticsearch.index.analysis.KeywordTokenizerFactory;
import org.elasticsearch.index.analysis.LatvianAnalyzerProvider;
import org.elasticsearch.index.analysis.LengthTokenFilterFactory;
@@ -101,7 +100,6 @@ import org.elasticsearch.index.analysis.PatternReplaceTokenFilterFactory;
import org.elasticsearch.index.analysis.PatternTokenizerFactory;
import org.elasticsearch.index.analysis.PersianAnalyzerProvider;
import org.elasticsearch.index.analysis.PersianNormalizationFilterFactory;
-import org.elasticsearch.index.analysis.PorterStemTokenFilterFactory;
import org.elasticsearch.index.analysis.PortugueseAnalyzerProvider;
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
@@ -115,7 +113,6 @@ import org.elasticsearch.index.analysis.SerbianNormalizationFilterFactory;
import org.elasticsearch.index.analysis.ShingleTokenFilterFactory;
import org.elasticsearch.index.analysis.SimpleAnalyzerProvider;
import org.elasticsearch.index.analysis.SnowballAnalyzerProvider;
-import org.elasticsearch.index.analysis.SnowballTokenFilterFactory;
import org.elasticsearch.index.analysis.SoraniAnalyzerProvider;
import org.elasticsearch.index.analysis.SoraniNormalizationFilterFactory;
import org.elasticsearch.index.analysis.SpanishAnalyzerProvider;
@@ -132,7 +129,6 @@ import org.elasticsearch.index.analysis.ThaiAnalyzerProvider;
import org.elasticsearch.index.analysis.ThaiTokenizerFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.index.analysis.TokenizerFactory;
-import org.elasticsearch.index.analysis.TrimTokenFilterFactory;
import org.elasticsearch.index.analysis.TruncateTokenFilterFactory;
import org.elasticsearch.index.analysis.TurkishAnalyzerProvider;
import org.elasticsearch.index.analysis.UAX29URLEmailTokenizerFactory;
@@ -212,7 +208,6 @@ public final class AnalysisModule {
tokenFilters.register("length", LengthTokenFilterFactory::new);
tokenFilters.register("lowercase", LowerCaseTokenFilterFactory::new);
tokenFilters.register("uppercase", UpperCaseTokenFilterFactory::new);
tokenFilters.register("porter_stem", PorterStemTokenFilterFactory::new);
tokenFilters.register("kstem", KStemTokenFilterFactory::new);
tokenFilters.register("standard", StandardTokenFilterFactory::new);
tokenFilters.register("nGram", NGramTokenFilterFactory::new);
@@ -223,10 +218,8 @@
tokenFilters.register("min_hash", MinHashTokenFilterFactory::new);
tokenFilters.register("unique", UniqueTokenFilterFactory::new);
tokenFilters.register("truncate", requriesAnalysisSettings(TruncateTokenFilterFactory::new));
tokenFilters.register("trim", TrimTokenFilterFactory::new);
tokenFilters.register("limit", LimitTokenCountFilterFactory::new);
tokenFilters.register("common_grams", requriesAnalysisSettings(CommonGramsTokenFilterFactory::new));
tokenFilters.register("snowball", SnowballTokenFilterFactory::new);
tokenFilters.register("stemmer", StemmerTokenFilterFactory::new);
tokenFilters.register("delimited_payload_filter", DelimitedPayloadTokenFilterFactory::new);
tokenFilters.register("elision", ElisionTokenFilterFactory::new);
@@ -244,7 +237,6 @@
tokenFilters.register("french_stem", FrenchStemTokenFilterFactory::new);
tokenFilters.register("german_stem", GermanStemTokenFilterFactory::new);
tokenFilters.register("russian_stem", RussianStemTokenFilterFactory::new);
tokenFilters.register("keyword_marker", requriesAnalysisSettings(KeywordMarkerTokenFilterFactory::new));
tokenFilters.register("stemmer_override", requriesAnalysisSettings(StemmerOverrideTokenFilterFactory::new));
tokenFilters.register("arabic_normalization", ArabicNormalizationFilterFactory::new);
tokenFilters.register("german_normalization", GermanNormalizationFilterFactory::new);

File: AnalyzeActionIT.java (core integration tests)

@@ -275,38 +275,17 @@ public class AnalyzeActionIT extends ESIntegTestCase {
assertThat(analyzeResponse.detail().tokenfilters()[0].getTokens().length, equalTo(1));
assertThat(analyzeResponse.detail().tokenfilters()[0].getTokens()[0].getTerm(), equalTo("\nthis is a test\n"));
//check other attributes
analyzeResponse = client().admin().indices().prepareAnalyze("This is troubled")
-.setExplain(true).setTokenizer("standard").addTokenFilter("snowball").get();
+.setExplain(true).setTokenizer("standard").addTokenFilter("lowercase").get();
assertThat(analyzeResponse.detail().tokenfilters().length, equalTo(1));
-assertThat(analyzeResponse.detail().tokenfilters()[0].getName(), equalTo("snowball"));
+assertThat(analyzeResponse.detail().tokenfilters()[0].getName(), equalTo("lowercase"));
assertThat(analyzeResponse.detail().tokenfilters()[0].getTokens().length, equalTo(3));
-assertThat(analyzeResponse.detail().tokenfilters()[0].getTokens()[2].getTerm(), equalTo("troubl"));
+assertThat(analyzeResponse.detail().tokenfilters()[0].getTokens()[2].getTerm(), equalTo("troubled"));
String[] expectedAttributesKey = {
"bytes",
"positionLength",
"keyword"};
assertThat(analyzeResponse.detail().tokenfilters()[0].getTokens()[2].getAttributes().size(), equalTo(expectedAttributesKey.length));
Object extendedAttribute;
for (String key : expectedAttributesKey) {
extendedAttribute = analyzeResponse.detail().tokenfilters()[0].getTokens()[2].getAttributes().get(key);
assertThat(extendedAttribute, notNullValue());
}
}
public void testDetailAnalyzeSpecifyAttributes() throws Exception {
AnalyzeResponse analyzeResponse = client().admin().indices().prepareAnalyze("This is troubled")
.setExplain(true).setTokenizer("standard").addTokenFilter("snowball").setAttributes("keyword").get();
assertThat(analyzeResponse.detail().tokenfilters().length, equalTo(1));
assertThat(analyzeResponse.detail().tokenfilters()[0].getName(), equalTo("snowball"));
assertThat(analyzeResponse.detail().tokenfilters()[0].getTokens().length, equalTo(3));
assertThat(analyzeResponse.detail().tokenfilters()[0].getTokens()[2].getTerm(), equalTo("troubl"));
String[] expectedAttributesKey = {
"keyword"};
"positionLength"};
assertThat(analyzeResponse.detail().tokenfilters()[0].getTokens()[2].getAttributes().size(), equalTo(expectedAttributesKey.length));
Object extendedAttribute;

File: CommonAnalysisPlugin.java (analysis-common)

@@ -89,6 +89,10 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
Map<String, AnalysisProvider<TokenFilterFactory>> filters = new TreeMap<>();
filters.put("asciifolding", ASCIIFoldingTokenFilterFactory::new);
filters.put("keyword_marker", requriesAnalysisSettings(KeywordMarkerTokenFilterFactory::new));
filters.put("porter_stem", PorterStemTokenFilterFactory::new);
filters.put("snowball", SnowballTokenFilterFactory::new);
filters.put("trim", TrimTokenFilterFactory::new);
filters.put("word_delimiter", WordDelimiterTokenFilterFactory::new);
filters.put("word_delimiter_graph", WordDelimiterGraphTokenFilterFactory::new);
return filters;

File: KeywordMarkerTokenFilterFactory.java (moved into analysis-common)

@@ -17,7 +17,7 @@
* under the License.
*/
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
@@ -26,6 +26,8 @@ import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
+import org.elasticsearch.index.analysis.Analysis;
import java.util.Set;
import java.util.regex.Pattern;
@@ -50,7 +52,7 @@ public class KeywordMarkerTokenFilterFactory extends AbstractTokenFilterFactory
private final CharArraySet keywordLookup;
private final Pattern keywordPattern;
-public KeywordMarkerTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+KeywordMarkerTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);
boolean ignoreCase =

File: PorterStemTokenFilterFactory.java (moved into analysis-common)

@@ -17,17 +17,18 @@
* under the License.
*/
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
public class PorterStemTokenFilterFactory extends AbstractTokenFilterFactory {
-public PorterStemTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+PorterStemTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings);
}

File: SnowballTokenFilterFactory.java (moved into analysis-common)

@@ -16,7 +16,7 @@
* specific language governing permissions and limitations
* under the License.
*/
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.snowball.SnowballFilter;
@@ -24,6 +24,7 @@ import org.elasticsearch.common.Strings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
/**
* Real work actually done here by Sebastian on the Elasticsearch mailing list
@@ -33,7 +34,7 @@ public class SnowballTokenFilterFactory extends AbstractTokenFilterFactory {
private String language;
-public SnowballTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+SnowballTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings);
this.language = Strings.capitalize(settings.get("language", settings.get("name", "English")));
}

File: TrimTokenFilterFactory.java (moved into analysis-common)

@@ -17,19 +17,20 @@
* under the License.
*/
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.TrimFilter;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
public class TrimTokenFilterFactory extends AbstractTokenFilterFactory {
private static final String UPDATE_OFFSETS_KEY = "update_offsets";
-public TrimTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+TrimTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);
if (settings.get(UPDATE_OFFSETS_KEY) != null) {
throw new IllegalArgumentException(UPDATE_OFFSETS_KEY + " is not supported anymore. Please fix your analysis chain");
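
The excerpt above cuts off before the factory's create method. Given the TrimFilter import, a plausible sketch of the remainder of the class, offered as an illustration under that assumption rather than the commit's verbatim code:

    @Override
    public TokenStream create(TokenStream tokenStream) {
        // Delegate to Lucene's TrimFilter, which strips leading and trailing
        // whitespace from each token; offsets are left untouched, which is why
        // the update_offsets option is rejected above.
        return new TrimFilter(tokenStream);
    }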

File: CommonAnalysisFactoryTests.java (analysis-common tests)

@@ -49,6 +49,10 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
protected Map<String, Class<?>> getTokenFilters() {
Map<String, Class<?>> filters = new TreeMap<>(super.getTokenFilters());
filters.put("asciifolding", ASCIIFoldingTokenFilterFactory.class);
filters.put("keywordmarker", KeywordMarkerTokenFilterFactory.class);
filters.put("porterstem", PorterStemTokenFilterFactory.class);
filters.put("snowballporter", SnowballTokenFilterFactory.class);
filters.put("trim", TrimTokenFilterFactory.class);
filters.put("worddelimiter", WordDelimiterTokenFilterFactory.class);
filters.put("worddelimitergraph", WordDelimiterGraphTokenFilterFactory.class);
return filters;
@@ -120,7 +124,7 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
@Override
protected Map<String, Class<?>> getPreConfiguredTokenizers() {
Map<String, Class<?>> filters = new TreeMap<>(super.getPreConfiguredTokenFilters());
return filters;
}

File: KeywordMarkerFilterFactoryTests.java (moved into analysis-common tests)

@@ -17,7 +17,7 @@
* under the License.
*/
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
@@ -25,6 +25,9 @@ import org.apache.lucene.analysis.miscellaneous.PatternKeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.analysis.AnalysisTestsHelper;
+import org.elasticsearch.index.analysis.NamedAnalyzer;
+import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.test.ESTestCase.TestAnalysis;
import org.elasticsearch.test.ESTokenStreamTestCase;
@@ -49,7 +52,7 @@ public class KeywordMarkerFilterFactoryTests extends ESTokenStreamTestCase {
.put("index.analysis.analyzer.my_keyword.filter", "my_keyword, porter_stem")
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.build();
-TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
+TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_keyword");
assertThat(tokenFilter, instanceOf(KeywordMarkerTokenFilterFactory.class));
TokenStream filter = tokenFilter.create(new WhitespaceTokenizer());
@@ -72,7 +75,7 @@ public class KeywordMarkerFilterFactoryTests extends ESTokenStreamTestCase {
.put("index.analysis.analyzer.my_keyword.filter", "my_keyword, porter_stem")
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.build();
-TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
+TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_keyword");
assertThat(tokenFilter, instanceOf(KeywordMarkerTokenFilterFactory.class));
TokenStream filter = tokenFilter.create(new WhitespaceTokenizer());
@@ -96,7 +99,7 @@ public class KeywordMarkerFilterFactoryTests extends ESTokenStreamTestCase {
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.build();
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
-() -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings));
+() -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin()));
assertEquals("cannot specify both `keywords_pattern` and `keywords` or `keywords_path`",
e.getMessage());
}
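
The hunks above show the test-side consequence of the move: AnalysisTestsHelper must now be handed the plugin, because the factories are gone from core's built-in registry. Condensed into one fragment with illustrative values (the keywords setting follows the options named in the error message above):

    Settings settings = Settings.builder()
        .put("index.analysis.filter.my_keyword.type", "keyword_marker")
        .put("index.analysis.filter.my_keyword.keywords", "troubled")
        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
        .build();
    // Without the plugin argument, "keyword_marker" can no longer be resolved.
    TestAnalysis analysis =
        AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_keyword");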

File: analysis-common token filter REST tests (YAML)

@@ -33,6 +33,70 @@
- length: { tokens: 1 }
- match: { tokens.0.token: foo bar! }
+---
+"porterstem":
+    - do:
+        indices.analyze:
+          body:
+            text: This is troubled
+            tokenizer: standard
+            filter: [porter_stem]
+    - length: { tokens: 3 }
+    - match: { tokens.2.token: troubl }
+    - match: { tokens.2.position: 2 }
+---
+"keywordmarker":
+    - do:
+        indices.analyze:
+          body:
+            text: This is troubled
+            tokenizer: standard
+            filter:
+              - type: keyword_marker
+                keywords: troubled
+              - type: porter_stem
+    - length: { tokens: 3 }
+    - match: { tokens.2.token: troubled }
+    - match: { tokens.2.position: 2 }
+---
+"snowball":
+    - do:
+        indices.analyze:
+          body:
+            text: This is troubled
+            tokenizer: standard
+            filter: [snowball]
+    - length: { tokens: 3 }
+    - match: { tokens.2.token: troubl }
+    - match: { tokens.2.position: 2 }
+    - do:
+        indices.analyze:
+          body:
+            explain: true
+            text: This is troubled
+            tokenizer: standard
+            filter: [snowball]
+    - length: { detail.tokenfilters.0.tokens: 3 }
+    - match: { detail.tokenfilters.0.tokens.2.token: troubl }
+    - match: { detail.tokenfilters.0.tokens.2.position: 2 }
+    - is_true: detail.tokenfilters.0.tokens.2.bytes
+    - match: { detail.tokenfilters.0.tokens.2.positionLength: 1 }
+    - match: { detail.tokenfilters.0.tokens.2.keyword: false }
+---
+"trim":
+    - do:
+        indices.analyze:
+          body:
+            text: Foo Bar !
+            tokenizer: keyword
+            filter: [trim]
+    - length: { tokens: 1 }
+    - match: { tokens.0.token: Foo Bar ! }
---
"word_delimiter":
- do:

File: AnalysisFactoryTestCase.java (test framework)

@@ -48,7 +48,6 @@ import org.elasticsearch.index.analysis.IndicNormalizationFilterFactory;
import org.elasticsearch.index.analysis.KStemTokenFilterFactory;
import org.elasticsearch.index.analysis.KeepTypesFilterFactory;
import org.elasticsearch.index.analysis.KeepWordFilterFactory;
-import org.elasticsearch.index.analysis.KeywordMarkerTokenFilterFactory;
import org.elasticsearch.index.analysis.KeywordTokenizerFactory;
import org.elasticsearch.index.analysis.LengthTokenFilterFactory;
import org.elasticsearch.index.analysis.LetterTokenizerFactory;
@@ -64,7 +63,6 @@ import org.elasticsearch.index.analysis.PatternCaptureGroupTokenFilterFactory;
import org.elasticsearch.index.analysis.PatternReplaceTokenFilterFactory;
import org.elasticsearch.index.analysis.PatternTokenizerFactory;
import org.elasticsearch.index.analysis.PersianNormalizationFilterFactory;
-import org.elasticsearch.index.analysis.PorterStemTokenFilterFactory;
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
import org.elasticsearch.index.analysis.ReverseTokenFilterFactory;
@@ -72,7 +70,6 @@ import org.elasticsearch.index.analysis.ScandinavianFoldingFilterFactory;
import org.elasticsearch.index.analysis.ScandinavianNormalizationFilterFactory;
import org.elasticsearch.index.analysis.SerbianNormalizationFilterFactory;
import org.elasticsearch.index.analysis.ShingleTokenFilterFactory;
-import org.elasticsearch.index.analysis.SnowballTokenFilterFactory;
import org.elasticsearch.index.analysis.SoraniNormalizationFilterFactory;
import org.elasticsearch.index.analysis.StandardTokenFilterFactory;
import org.elasticsearch.index.analysis.StandardTokenizerFactory;
@@ -82,7 +79,6 @@ import org.elasticsearch.index.analysis.StopTokenFilterFactory;
import org.elasticsearch.index.analysis.SynonymGraphTokenFilterFactory;
import org.elasticsearch.index.analysis.SynonymTokenFilterFactory;
import org.elasticsearch.index.analysis.ThaiTokenizerFactory;
-import org.elasticsearch.index.analysis.TrimTokenFilterFactory;
import org.elasticsearch.index.analysis.TruncateTokenFilterFactory;
import org.elasticsearch.index.analysis.UAX29URLEmailTokenizerFactory;
import org.elasticsearch.index.analysis.UpperCaseTokenFilterFactory;
@@ -193,7 +189,7 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
.put("indonesianstem", StemmerTokenFilterFactory.class)
.put("italianlightstem", StemmerTokenFilterFactory.class)
.put("keepword", KeepWordFilterFactory.class)
.put("keywordmarker", KeywordMarkerTokenFilterFactory.class)
.put("keywordmarker", MovedToAnalysisCommon.class)
.put("kstem", KStemTokenFilterFactory.class)
.put("latvianstem", StemmerTokenFilterFactory.class)
.put("length", LengthTokenFilterFactory.class)
@@ -205,7 +201,7 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
.put("patterncapturegroup", PatternCaptureGroupTokenFilterFactory.class)
.put("patternreplace", PatternReplaceTokenFilterFactory.class)
.put("persiannormalization", PersianNormalizationFilterFactory.class)
.put("porterstem", PorterStemTokenFilterFactory.class)
.put("porterstem", MovedToAnalysisCommon.class)
.put("portuguesestem", StemmerTokenFilterFactory.class)
.put("portugueselightstem", StemmerTokenFilterFactory.class)
.put("portugueseminimalstem", StemmerTokenFilterFactory.class)
@@ -216,7 +212,7 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
.put("serbiannormalization", SerbianNormalizationFilterFactory.class)
.put("shingle", ShingleTokenFilterFactory.class)
.put("minhash", MinHashTokenFilterFactory.class)
.put("snowballporter", SnowballTokenFilterFactory.class)
.put("snowballporter", MovedToAnalysisCommon.class)
.put("soraninormalization", SoraniNormalizationFilterFactory.class)
.put("soranistem", StemmerTokenFilterFactory.class)
.put("spanishlightstem", StemmerTokenFilterFactory.class)
@@ -226,7 +222,7 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
.put("swedishlightstem", StemmerTokenFilterFactory.class)
.put("synonym", SynonymTokenFilterFactory.class)
.put("synonymgraph", SynonymGraphTokenFilterFactory.class)
.put("trim", TrimTokenFilterFactory.class)
.put("trim", MovedToAnalysisCommon.class)
.put("truncate", TruncateTokenFilterFactory.class)
.put("turkishlowercase", LowerCaseTokenFilterFactory.class)
.put("type", KeepTypesFilterFactory.class)