Move more token filters to analysis-common module

The following token filters were moved: arabic_stem, brazilian_stem, czech_stem, dutch_stem, french_stem, german_stem and russian_stem.

Relates to #23658
This commit is contained in:
Martijn van Groningen 2017-08-03 15:00:56 +02:00
parent 7e3cd6a019
commit 1146a35870
No known key found for this signature in database
GPG Key ID: AB236F4FCF2AF12A
15 changed files with 256 additions and 51 deletions

View File

@ -30,11 +30,9 @@ import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AnalysisRegistry;
import org.elasticsearch.index.analysis.AnalyzerProvider;
import org.elasticsearch.index.analysis.ArabicAnalyzerProvider;
import org.elasticsearch.index.analysis.ArabicStemTokenFilterFactory;
import org.elasticsearch.index.analysis.ArmenianAnalyzerProvider;
import org.elasticsearch.index.analysis.BasqueAnalyzerProvider;
import org.elasticsearch.index.analysis.BrazilianAnalyzerProvider;
import org.elasticsearch.index.analysis.BrazilianStemTokenFilterFactory;
import org.elasticsearch.index.analysis.BulgarianAnalyzerProvider;
import org.elasticsearch.index.analysis.CatalanAnalyzerProvider;
import org.elasticsearch.index.analysis.CharFilterFactory;
@ -42,19 +40,15 @@ import org.elasticsearch.index.analysis.ChineseAnalyzerProvider;
import org.elasticsearch.index.analysis.CjkAnalyzerProvider;
import org.elasticsearch.index.analysis.ClassicTokenizerFactory;
import org.elasticsearch.index.analysis.CzechAnalyzerProvider;
import org.elasticsearch.index.analysis.CzechStemTokenFilterFactory;
import org.elasticsearch.index.analysis.DanishAnalyzerProvider;
import org.elasticsearch.index.analysis.DutchAnalyzerProvider;
import org.elasticsearch.index.analysis.DutchStemTokenFilterFactory;
import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory;
import org.elasticsearch.index.analysis.EnglishAnalyzerProvider;
import org.elasticsearch.index.analysis.FingerprintAnalyzerProvider;
import org.elasticsearch.index.analysis.FinnishAnalyzerProvider;
import org.elasticsearch.index.analysis.FrenchAnalyzerProvider;
import org.elasticsearch.index.analysis.FrenchStemTokenFilterFactory;
import org.elasticsearch.index.analysis.GalicianAnalyzerProvider;
import org.elasticsearch.index.analysis.GermanAnalyzerProvider;
import org.elasticsearch.index.analysis.GermanStemTokenFilterFactory;
import org.elasticsearch.index.analysis.GreekAnalyzerProvider;
import org.elasticsearch.index.analysis.HindiAnalyzerProvider;
import org.elasticsearch.index.analysis.HungarianAnalyzerProvider;
@ -80,7 +74,6 @@ import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
import org.elasticsearch.index.analysis.RomanianAnalyzerProvider;
import org.elasticsearch.index.analysis.RussianAnalyzerProvider;
import org.elasticsearch.index.analysis.RussianStemTokenFilterFactory;
import org.elasticsearch.index.analysis.ShingleTokenFilterFactory;
import org.elasticsearch.index.analysis.SimpleAnalyzerProvider;
import org.elasticsearch.index.analysis.SnowballAnalyzerProvider;
@ -172,14 +165,6 @@ public final class AnalysisModule {
tokenFilters.register("stop", StopTokenFilterFactory::new);
tokenFilters.register("standard", StandardTokenFilterFactory::new);
tokenFilters.register("shingle", ShingleTokenFilterFactory::new);
tokenFilters.register("arabic_stem", ArabicStemTokenFilterFactory::new);
tokenFilters.register("brazilian_stem", BrazilianStemTokenFilterFactory::new);
tokenFilters.register("czech_stem", CzechStemTokenFilterFactory::new);
tokenFilters.register("dutch_stem", DutchStemTokenFilterFactory::new);
tokenFilters.register("french_stem", FrenchStemTokenFilterFactory::new);
tokenFilters.register("german_stem", GermanStemTokenFilterFactory::new);
tokenFilters.register("russian_stem", RussianStemTokenFilterFactory::new);
tokenFilters.register("hunspell", requriesAnalysisSettings((indexSettings, env, name, settings) -> new HunspellTokenFilterFactory
(indexSettings, name, settings, hunspellService)));

View File

@ -188,14 +188,6 @@ public class AnalysisModuleTests extends ESTestCase {
assertThat(analyzer, instanceOf(CustomAnalyzer.class));
CustomAnalyzer custom4 = (CustomAnalyzer) analyzer;
assertThat(custom4.tokenFilters()[0], instanceOf(MyFilterTokenFilterFactory.class));
// // verify Czech stemmer
// analyzer = analysisService.analyzer("czechAnalyzerWithStemmer").analyzer();
// assertThat(analyzer, instanceOf(CustomAnalyzer.class));
// CustomAnalyzer czechstemmeranalyzer = (CustomAnalyzer) analyzer;
// assertThat(czechstemmeranalyzer.tokenizerFactory(), instanceOf(StandardTokenizerFactory.class));
// assertThat(czechstemmeranalyzer.tokenFilters().length, equalTo(4));
// assertThat(czechstemmeranalyzer.tokenFilters()[3], instanceOf(CzechStemTokenFilterFactory.class));
}
public void testWordListPath() throws Exception {

View File

@ -35,10 +35,6 @@
"custom6":{
"tokenizer":"standard",
"position_increment_gap": 256
},
"czechAnalyzerWithStemmer":{
"tokenizer":"standard",
"filter":["standard", "lowercase", "stop", "czech_stem"]
}
}
}

View File

@ -28,6 +28,3 @@ index :
custom7 :
type : standard
version: 3.6
czechAnalyzerWithStemmer :
tokenizer : standard
filter : [standard, lowercase, stop, czech_stem]

View File

@ -17,17 +17,18 @@
* under the License.
*/
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ar.ArabicStemFilter;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
public class ArabicStemTokenFilterFactory extends AbstractTokenFilterFactory {
public ArabicStemTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
ArabicStemTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings);
}

View File

@ -17,7 +17,7 @@
* under the License.
*/
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.br.BrazilianStemFilter;
@ -26,12 +26,14 @@ import org.apache.lucene.analysis.CharArraySet;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.Analysis;
public class BrazilianStemTokenFilterFactory extends AbstractTokenFilterFactory {
private final CharArraySet exclusions;
public BrazilianStemTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
BrazilianStemTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings);
this.exclusions = Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET);
}

View File

@ -92,20 +92,26 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
Map<String, AnalysisProvider<TokenFilterFactory>> filters = new TreeMap<>();
filters.put("apostrophe", ApostropheFilterFactory::new);
filters.put("arabic_normalization", ArabicNormalizationFilterFactory::new);
filters.put("arabic_stem", ArabicStemTokenFilterFactory::new);
filters.put("asciifolding", ASCIIFoldingTokenFilterFactory::new);
filters.put("brazilian_stem", BrazilianStemTokenFilterFactory::new);
filters.put("cjk_bigram", CJKBigramFilterFactory::new);
filters.put("cjk_width", CJKWidthFilterFactory::new);
filters.put("classic", ClassicFilterFactory::new);
filters.put("czech_stem", CzechStemTokenFilterFactory::new);
filters.put("common_grams", requriesAnalysisSettings(CommonGramsTokenFilterFactory::new));
filters.put("decimal_digit", DecimalDigitFilterFactory::new);
filters.put("delimited_payload_filter", DelimitedPayloadTokenFilterFactory::new);
filters.put("dictionary_decompounder", requriesAnalysisSettings(DictionaryCompoundWordTokenFilterFactory::new));
filters.put("dutch_stem", DutchStemTokenFilterFactory::new);
filters.put("edge_ngram", EdgeNGramTokenFilterFactory::new);
filters.put("edgeNGram", EdgeNGramTokenFilterFactory::new);
filters.put("elision", ElisionTokenFilterFactory::new);
filters.put("fingerprint", FingerprintTokenFilterFactory::new);
filters.put("flatten_graph", FlattenGraphTokenFilterFactory::new);
filters.put("french_stem", FrenchStemTokenFilterFactory::new);
filters.put("german_normalization", GermanNormalizationFilterFactory::new);
filters.put("german_stem", GermanStemTokenFilterFactory::new);
filters.put("hindi_normalization", HindiNormalizationFilterFactory::new);
filters.put("hyphenation_decompounder", requriesAnalysisSettings(HyphenationCompoundWordTokenFilterFactory::new));
filters.put("indic_normalization", IndicNormalizationFilterFactory::new);
@ -124,6 +130,7 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
filters.put("persian_normalization", PersianNormalizationFilterFactory::new);
filters.put("porter_stem", PorterStemTokenFilterFactory::new);
filters.put("reverse", ReverseTokenFilterFactory::new);
filters.put("russian_stem", RussianStemTokenFilterFactory::new);
filters.put("scandinavian_folding", ScandinavianFoldingFilterFactory::new);
filters.put("scandinavian_normalization", ScandinavianNormalizationFilterFactory::new);
filters.put("serbian_normalization", SerbianNormalizationFilterFactory::new);

View File

@ -16,17 +16,18 @@
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cz.CzechStemFilter;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
public class CzechStemTokenFilterFactory extends AbstractTokenFilterFactory {
public CzechStemTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
CzechStemTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings);
}

View File

@ -17,7 +17,7 @@
* under the License.
*/
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
@ -26,13 +26,15 @@ import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.Analysis;
import org.tartarus.snowball.ext.DutchStemmer;
public class DutchStemTokenFilterFactory extends AbstractTokenFilterFactory {
private final CharArraySet exclusions;
public DutchStemTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
DutchStemTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings);
this.exclusions = Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET);
}

View File

@ -17,7 +17,7 @@
* under the License.
*/
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
@ -26,13 +26,15 @@ import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.Analysis;
import org.tartarus.snowball.ext.FrenchStemmer;
public class FrenchStemTokenFilterFactory extends AbstractTokenFilterFactory {
private final CharArraySet exclusions;
public FrenchStemTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
FrenchStemTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings);
this.exclusions = Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET);
}

View File

@ -17,7 +17,7 @@
* under the License.
*/
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
@ -26,12 +26,14 @@ import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.Analysis;
public class GermanStemTokenFilterFactory extends AbstractTokenFilterFactory {
private final CharArraySet exclusions;
public GermanStemTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
GermanStemTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings);
this.exclusions = Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET);
}

View File

@ -17,13 +17,14 @@
* under the License.
*/
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
public class RussianStemTokenFilterFactory extends AbstractTokenFilterFactory {

View File

@ -124,6 +124,10 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
filters.put("fingerprint", FingerprintTokenFilterFactory.class);
filters.put("minhash", MinHashTokenFilterFactory.class);
filters.put("scandinavianfolding", ScandinavianFoldingFilterFactory.class);
filters.put("arabicstem", ArabicStemTokenFilterFactory.class);
filters.put("brazilianstem", BrazilianStemTokenFilterFactory.class);
filters.put("czechstem", CzechStemTokenFilterFactory.class);
filters.put("germanstem", GermanStemTokenFilterFactory.class);
return filters;
}

View File

@ -1258,3 +1258,220 @@
filter: [scandinavian_folding]
- length: { tokens: 1 }
- match: { tokens.0.token: raksmorgas }
---
# Exercises the arabic_stem token filter twice: once through a custom filter
# defined in the settings of index `test`, and once by its built-in name in an
# analyze request that names no index. Both calls must yield one stemmed token.
"arabic_stem":
    - do:
        indices.create:
          index: test
          body:
            settings:
              analysis:
                filter:
                  my_arabic_stem:
                    type: arabic_stem
    # The keyword tokenizer emits the whole text as one token, so the single
    # output token reflects the stem filter alone.
    - do:
        indices.analyze:
          index: test
          body:
            text: الحسن
            tokenizer: keyword
            filter: [my_arabic_stem]
    - length: { tokens: 1 }
    - match: { tokens.0.token: حسن }

    # Test pre-configured token filter too:
    - do:
        indices.analyze:
          body:
            text: الحسن
            tokenizer: keyword
            filter: [arabic_stem]
    - length: { tokens: 1 }
    - match: { tokens.0.token: حسن }
---
# Exercises the brazilian_stem token filter twice: once through a custom filter
# defined in the settings of index `test`, and once by its built-in name in an
# analyze request that names no index. Both calls must yield one stemmed token.
"brazilian_stem":
    - do:
        indices.create:
          index: test
          body:
            settings:
              analysis:
                filter:
                  my_brazilian_stem:
                    type: brazilian_stem
    # The keyword tokenizer emits the whole text as one token, so the single
    # output token reflects the stem filter alone.
    - do:
        indices.analyze:
          index: test
          body:
            text: Brasília
            tokenizer: keyword
            filter: [my_brazilian_stem]
    - length: { tokens: 1 }
    - match: { tokens.0.token: brasil }

    # Test pre-configured token filter too:
    - do:
        indices.analyze:
          body:
            text: Brasília
            tokenizer: keyword
            filter: [brazilian_stem]
    - length: { tokens: 1 }
    - match: { tokens.0.token: brasil }
---
# Exercises the czech_stem token filter twice: once through a custom filter
# defined in the settings of index `test`, and once by its built-in name in an
# analyze request that names no index. Both calls must yield one stemmed token.
"czech_stem":
    - do:
        indices.create:
          index: test
          body:
            settings:
              analysis:
                filter:
                  my_czech_stem:
                    type: czech_stem
    # The keyword tokenizer emits the whole text as one token, so the single
    # output token reflects the stem filter alone.
    - do:
        indices.analyze:
          index: test
          body:
            text: angličtí
            tokenizer: keyword
            filter: [my_czech_stem]
    - length: { tokens: 1 }
    - match: { tokens.0.token: anglick }

    # Test pre-configured token filter too:
    - do:
        indices.analyze:
          body:
            text: angličtí
            tokenizer: keyword
            filter: [czech_stem]
    - length: { tokens: 1 }
    - match: { tokens.0.token: anglick }
---
# Exercises the dutch_stem token filter twice: once through a custom filter
# defined in the settings of index `test`, and once by its built-in name in an
# analyze request that names no index. Both calls must yield one stemmed token.
"dutch_stem":
    - do:
        indices.create:
          index: test
          body:
            settings:
              analysis:
                filter:
                  my_dutch_stem:
                    type: dutch_stem
    # The keyword tokenizer emits the whole text as one token, so the single
    # output token reflects the stem filter alone.
    - do:
        indices.analyze:
          index: test
          body:
            text: ophouden
            tokenizer: keyword
            filter: [my_dutch_stem]
    - length: { tokens: 1 }
    - match: { tokens.0.token: ophoud }

    # Test pre-configured token filter too:
    - do:
        indices.analyze:
          body:
            text: ophouden
            tokenizer: keyword
            filter: [dutch_stem]
    - length: { tokens: 1 }
    - match: { tokens.0.token: ophoud }
---
# Exercises the french_stem token filter twice: once through a custom filter
# defined in the settings of index `test`, and once by its built-in name in an
# analyze request that names no index. Both calls must yield one stemmed token.
"french_stem":
    - do:
        indices.create:
          index: test
          body:
            settings:
              analysis:
                filter:
                  my_french_stem:
                    type: french_stem
    # The keyword tokenizer emits the whole text as one token, so the single
    # output token reflects the stem filter alone.
    - do:
        indices.analyze:
          index: test
          body:
            text: chevaux
            tokenizer: keyword
            filter: [my_french_stem]
    - length: { tokens: 1 }
    - match: { tokens.0.token: cheval }

    # Test pre-configured token filter too:
    - do:
        indices.analyze:
          body:
            text: chevaux
            tokenizer: keyword
            filter: [french_stem]
    - length: { tokens: 1 }
    - match: { tokens.0.token: cheval }
---
# Exercises the german_stem token filter twice: once through a custom filter
# defined in the settings of index `test`, and once by its built-in name in an
# analyze request that names no index. Both calls must yield one stemmed token.
"german_stem":
    - do:
        indices.create:
          index: test
          body:
            settings:
              analysis:
                filter:
                  my_german_stem:
                    type: german_stem
    # The keyword tokenizer emits the whole text as one token, so the single
    # output token reflects the stem filter alone.
    - do:
        indices.analyze:
          index: test
          body:
            text: abschließen
            tokenizer: keyword
            filter: [my_german_stem]
    - length: { tokens: 1 }
    - match: { tokens.0.token: abschliess }

    # Test pre-configured token filter too:
    - do:
        indices.analyze:
          body:
            text: abschließen
            tokenizer: keyword
            filter: [german_stem]
    - length: { tokens: 1 }
    - match: { tokens.0.token: abschliess }
---
# Exercises the russian_stem token filter twice: once through a custom filter
# defined in the settings of index `test`, and once by its built-in name in an
# analyze request that names no index. Both calls must yield one stemmed token.
"russian_stem":
    - do:
        indices.create:
          index: test
          body:
            settings:
              analysis:
                filter:
                  my_russian_stem:
                    type: russian_stem
    # The keyword tokenizer emits the whole text as one token, so the single
    # output token reflects the stem filter alone.
    - do:
        indices.analyze:
          index: test
          body:
            text: журналы
            tokenizer: keyword
            filter: [my_russian_stem]
    - length: { tokens: 1 }
    - match: { tokens.0.token: журнал }

    # Test pre-configured token filter too:
    - do:
        indices.analyze:
          body:
            text: журналы
            tokenizer: keyword
            filter: [russian_stem]
    - length: { tokens: 1 }
    - match: { tokens.0.token: журнал }

View File

@ -23,12 +23,8 @@ import org.apache.lucene.analysis.util.CharFilterFactory;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.elasticsearch.common.collect.MapBuilder;
import org.elasticsearch.index.analysis.ArabicStemTokenFilterFactory;
import org.elasticsearch.index.analysis.BrazilianStemTokenFilterFactory;
import org.elasticsearch.index.analysis.ClassicTokenizerFactory;
import org.elasticsearch.index.analysis.CzechStemTokenFilterFactory;
import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory;
import org.elasticsearch.index.analysis.GermanStemTokenFilterFactory;
import org.elasticsearch.index.analysis.HunspellTokenFilterFactory;
import org.elasticsearch.index.analysis.KeywordTokenizerFactory;
import org.elasticsearch.index.analysis.LetterTokenizerFactory;
@ -114,16 +110,16 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
// exposed in ES
.put("apostrophe", MovedToAnalysisCommon.class)
.put("arabicnormalization", MovedToAnalysisCommon.class)
.put("arabicstem", ArabicStemTokenFilterFactory.class)
.put("arabicstem", MovedToAnalysisCommon.class)
.put("asciifolding", MovedToAnalysisCommon.class)
.put("brazilianstem", BrazilianStemTokenFilterFactory.class)
.put("brazilianstem", MovedToAnalysisCommon.class)
.put("bulgarianstem", MovedToAnalysisCommon.class)
.put("cjkbigram", MovedToAnalysisCommon.class)
.put("cjkwidth", MovedToAnalysisCommon.class)
.put("classic", MovedToAnalysisCommon.class)
.put("commongrams", MovedToAnalysisCommon.class)
.put("commongramsquery", MovedToAnalysisCommon.class)
.put("czechstem", CzechStemTokenFilterFactory.class)
.put("czechstem", MovedToAnalysisCommon.class)
.put("decimaldigit", MovedToAnalysisCommon.class)
.put("delimitedpayload", MovedToAnalysisCommon.class)
.put("dictionarycompoundword", MovedToAnalysisCommon.class)
@ -136,7 +132,7 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
.put("frenchminimalstem", MovedToAnalysisCommon.class)
.put("galicianminimalstem", MovedToAnalysisCommon.class)
.put("galicianstem", MovedToAnalysisCommon.class)
.put("germanstem", GermanStemTokenFilterFactory.class)
.put("germanstem", MovedToAnalysisCommon.class)
.put("germanlightstem", MovedToAnalysisCommon.class)
.put("germanminimalstem", MovedToAnalysisCommon.class)
.put("germannormalization", MovedToAnalysisCommon.class)