diff --git a/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc index 6526f378476..5c5a19813dd 100644 --- a/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc @@ -1,59 +1,8 @@ [[analysis-stemmer-tokenfilter]] === Stemmer Token Filter -A filter that stems words (similar to `snowball`, but with more -options). The `language`/`name` parameter controls the stemmer with the -following available values: - -http://lucene.apache.org/core/4_3_0/analyzers-common/index.html?org%2Fapache%2Flucene%2Fanalysis%2Far%2FArabicStemmer.html[arabic], -http://snowball.tartarus.org/algorithms/armenian/stemmer.html[armenian], -http://snowball.tartarus.org/algorithms/basque/stemmer.html[basque], -http://lucene.apache.org/core/4_3_0/analyzers-common/index.html?org%2Fapache%2Flucene%2Fanalysis%2Fbr%2FBrazilianStemmer.html[brazilian], -http://members.unine.ch/jacques.savoy/Papers/BUIR.pdf[bulgarian], -http://snowball.tartarus.org/algorithms/catalan/stemmer.html[catalan], -http://portal.acm.org/citation.cfm?id=1598600[czech], -http://snowball.tartarus.org/algorithms/danish/stemmer.html[danish], -http://snowball.tartarus.org/algorithms/dutch/stemmer.html[dutch], -http://snowball.tartarus.org/algorithms/english/stemmer.html[english], -http://snowball.tartarus.org/algorithms/finnish/stemmer.html[finnish], -http://snowball.tartarus.org/algorithms/french/stemmer.html[french], -http://snowball.tartarus.org/algorithms/german/stemmer.html[german], -http://snowball.tartarus.org/algorithms/german2/stemmer.html[german2], -http://sais.se/mthprize/2007/ntais2007.pdf[greek], -http://snowball.tartarus.org/algorithms/hungarian/stemmer.html[hungarian], -http://snowball.tartarus.org/algorithms/italian/stemmer.html[italian], -http://snowball.tartarus.org/algorithms/kraaij_pohlmann/stemmer.html[kp], 
-http://ciir.cs.umass.edu/pubfiles/ir-35.pdf[kstem], -http://snowball.tartarus.org/algorithms/lovins/stemmer.html[lovins], -http://lucene.apache.org/core/4_3_0/analyzers-common/index.html?org%2Fapache%2Flucene%2Fanalysis%2Flv%2FLatvianStemmer.html[latvian], -http://snowball.tartarus.org/algorithms/norwegian/stemmer.html[norwegian], -http://lucene.apache.org/core/4_3_0/analyzers-common/index.html?org%2Fapache%2Flucene%2Fanalysis%2Fno%2FNorwegianMinimalStemFilter.html[minimal_norwegian], -http://snowball.tartarus.org/algorithms/porter/stemmer.html[porter], -http://snowball.tartarus.org/algorithms/portuguese/stemmer.html[portuguese], -http://snowball.tartarus.org/algorithms/romanian/stemmer.html[romanian], -http://snowball.tartarus.org/algorithms/russian/stemmer.html[russian], -http://snowball.tartarus.org/algorithms/spanish/stemmer.html[spanish], -http://snowball.tartarus.org/algorithms/swedish/stemmer.html[swedish], -http://snowball.tartarus.org/algorithms/turkish/stemmer.html[turkish], -http://www.medialab.tfe.umu.se/courses/mdm0506a/material/fulltext_ID%3D10049387%26PLACEBO%3DIE.pdf[minimal_english], -http://lucene.apache.org/core/4_3_0/analyzers-common/index.html?org%2Fapache%2Flucene%2Fanalysis%2Fen%2FEnglishPossessiveFilter.html[possessive_english], -http://clef.isti.cnr.it/2003/WN_web/22.pdf[light_finnish], -http://dl.acm.org/citation.cfm?id=1141523[light_french], -http://dl.acm.org/citation.cfm?id=318984[minimal_french], -http://dl.acm.org/citation.cfm?id=1141523[light_german], -http://members.unine.ch/jacques.savoy/clef/morpho.pdf[minimal_german], -http://computing.open.ac.uk/Sites/EACLSouthAsia/Papers/p6-Ramanathan.pdf[hindi], -http://dl.acm.org/citation.cfm?id=1141523&dl=ACM&coll=DL&CFID=179095584&CFTOKEN=80067181[light_hungarian], -http://www.illc.uva.nl/Publications/ResearchReports/MoL-2003-02.text.pdf[indonesian], -http://www.ercim.eu/publication/ws-proceedings/CLEF2/savoy.pdf[light_italian], 
-http://dl.acm.org/citation.cfm?id=1141523&dl=ACM&coll=DL&CFID=179095584&CFTOKEN=80067181[light_portuguese], -http://www.inf.ufrgs.br/\~buriol/papers/Orengo_CLEF07.pdf[minimal_portuguese], -http://www.inf.ufrgs.br/\~viviane/rslp/index.htm[portuguese], -http://doc.rero.ch/lm.php?url=1000%2C43%2C4%2C20091209094227-CA%2FDolamic_Ljiljana_-_Indexing_and_Searching_Strategies_for_the_Russian_20091209.pdf[light_russian], -http://www.ercim.eu/publication/ws-proceedings/CLEF2/savoy.pdf[light_spanish], -http://clef.isti.cnr.it/2003/WN_web/22.pdf[light_swedish]. - -For example: +A filter that provides access to (almost) all of the available stemming token +filters through a single unified interface. For example: [source,js] -------------------------------------------------- @@ -76,3 +25,134 @@ For example: } } -------------------------------------------------- + +The `language`/`name` parameter controls the stemmer with the following +available values (the preferred filters are marked in *bold*): + +[horizontal] +Arabic:: + +http://lucene.apache.org/core/4_3_0/analyzers-common/index.html?org%2Fapache%2Flucene%2Fanalysis%2Far%2FArabicStemmer.html[*`arabic`*] + +Armenian:: + +http://snowball.tartarus.org/algorithms/armenian/stemmer.html[*`armenian`*] + +Basque:: + +http://snowball.tartarus.org/algorithms/basque/stemmer.html[*`basque`*] + +Brazilian Portuguese:: + +http://lucene.apache.org/core/4_3_0/analyzers-common/index.html?org%2Fapache%2Flucene%2Fanalysis%2Fbr%2FBrazilianStemmer.html[*`brazilian`*] + +Bulgarian:: + +http://members.unine.ch/jacques.savoy/Papers/BUIR.pdf[*`bulgarian`*] + +Catalan:: + +http://snowball.tartarus.org/algorithms/catalan/stemmer.html[*`catalan`*] + +Czech:: + +http://portal.acm.org/citation.cfm?id=1598600[*`czech`*] + +Danish:: + +http://snowball.tartarus.org/algorithms/danish/stemmer.html[*`danish`*] + +Dutch:: + +http://snowball.tartarus.org/algorithms/dutch/stemmer.html[*`dutch`*], 
+http://snowball.tartarus.org/algorithms/kraaij_pohlmann/stemmer.html[`dutch_kp`] coming[1.3.0,Renamed from `kp`] + +English:: + +http://snowball.tartarus.org/algorithms/porter/stemmer.html[*`english`*] coming[1.3.0,Returns the <<analysis-porterstem-tokenfilter,`porter_stem` token filter>> instead of the <<analysis-snowball-tokenfilter,`english` Snowball token filter>>], +http://ciir.cs.umass.edu/pubfiles/ir-35.pdf[`light_english`] coming[1.3.0,Returns the <<analysis-kstem-tokenfilter,`kstem` token filter>>], +http://www.medialab.tfe.umu.se/courses/mdm0506a/material/fulltext_ID%3D10049387%26PLACEBO%3DIE.pdf[`minimal_english`], +http://lucene.apache.org/core/4_3_0/analyzers-common/index.html?org%2Fapache%2Flucene%2Fanalysis%2Fen%2FEnglishPossessiveFilter.html[`possessive_english`], +http://snowball.tartarus.org/algorithms/english/stemmer.html[`porter2`] coming[1.3.0,Returns the <<analysis-snowball-tokenfilter,`english` Snowball token filter>> instead of the <<analysis-snowball-tokenfilter,`porter` Snowball token filter>>], +http://snowball.tartarus.org/algorithms/lovins/stemmer.html[`lovins`] + +Finnish:: + +http://snowball.tartarus.org/algorithms/finnish/stemmer.html[*`finnish`*], +http://clef.isti.cnr.it/2003/WN_web/22.pdf[`light_finnish`] + +French:: + +http://snowball.tartarus.org/algorithms/french/stemmer.html[`french`], +http://dl.acm.org/citation.cfm?id=1141523[*`light_french`*], +http://dl.acm.org/citation.cfm?id=318984[`minimal_french`] + +German:: + +http://snowball.tartarus.org/algorithms/german/stemmer.html[`german`], +http://snowball.tartarus.org/algorithms/german2/stemmer.html[`german2`], +http://dl.acm.org/citation.cfm?id=1141523[*`light_german`*], +http://members.unine.ch/jacques.savoy/clef/morpho.pdf[`minimal_german`] + +Greek:: + +http://sais.se/mthprize/2007/ntais2007.pdf[*`greek`*] + +Hindi:: + +http://computing.open.ac.uk/Sites/EACLSouthAsia/Papers/p6-Ramanathan.pdf[*`hindi`*] + +Hungarian:: + +http://snowball.tartarus.org/algorithms/hungarian/stemmer.html[*`hungarian`*], +http://dl.acm.org/citation.cfm?id=1141523&dl=ACM&coll=DL&CFID=179095584&CFTOKEN=80067181[`light_hungarian`] + +Indonesian:: + +http://www.illc.uva.nl/Publications/ResearchReports/MoL-2003-02.text.pdf[*`indonesian`*] + +Italian:: + 
+http://snowball.tartarus.org/algorithms/italian/stemmer.html[`italian`], +http://www.ercim.eu/publication/ws-proceedings/CLEF2/savoy.pdf[*`light_italian`*] + +Latvian:: + +http://lucene.apache.org/core/4_3_0/analyzers-common/index.html?org%2Fapache%2Flucene%2Fanalysis%2Flv%2FLatvianStemmer.html[*`latvian`*] + +Norwegian:: + +http://snowball.tartarus.org/algorithms/norwegian/stemmer.html[*`norwegian`*], +http://lucene.apache.org/core/4_3_0/analyzers-common/index.html?org%2Fapache%2Flucene%2Fanalysis%2Fno%2FNorwegianMinimalStemFilter.html[`minimal_norwegian`] + +Portuguese:: + +http://snowball.tartarus.org/algorithms/portuguese/stemmer.html[`portuguese`], +http://dl.acm.org/citation.cfm?id=1141523&dl=ACM&coll=DL&CFID=179095584&CFTOKEN=80067181[*`light_portuguese`*], +http://www.inf.ufrgs.br/\~buriol/papers/Orengo_CLEF07.pdf[`minimal_portuguese`], +http://www.inf.ufrgs.br/\~viviane/rslp/index.htm[`portuguese_rslp`] coming[1.3.0] + + +Romanian:: + +http://snowball.tartarus.org/algorithms/romanian/stemmer.html[*`romanian`*] + +Russian:: + +http://snowball.tartarus.org/algorithms/russian/stemmer.html[*`russian`*], +http://doc.rero.ch/lm.php?url=1000%2C43%2C4%2C20091209094227-CA%2FDolamic_Ljiljana_-_Indexing_and_Searching_Strategies_for_the_Russian_20091209.pdf[`light_russian`] + +Spanish:: + +http://snowball.tartarus.org/algorithms/spanish/stemmer.html[`spanish`], +http://www.ercim.eu/publication/ws-proceedings/CLEF2/savoy.pdf[*`light_spanish`*] + +Swedish:: + +http://snowball.tartarus.org/algorithms/swedish/stemmer.html[*`swedish`*], +http://clef.isti.cnr.it/2003/WN_web/22.pdf[`light_swedish`] + +Turkish:: + +http://snowball.tartarus.org/algorithms/turkish/stemmer.html[*`turkish`*] + diff --git a/src/main/java/org/elasticsearch/index/analysis/StemmerTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/StemmerTokenFilterFactory.java index 9045ed256c1..a260e985204 100644 --- 
a/src/main/java/org/elasticsearch/index/analysis/StemmerTokenFilterFactory.java +++ b/src/main/java/org/elasticsearch/index/analysis/StemmerTokenFilterFactory.java @@ -47,6 +47,8 @@ import org.apache.lucene.analysis.pt.PortugueseStemFilter; import org.apache.lucene.analysis.ru.RussianLightStemFilter; import org.apache.lucene.analysis.snowball.SnowballFilter; import org.apache.lucene.analysis.sv.SwedishLightStemFilter; +import org.elasticsearch.Version; +import org.elasticsearch.cluster.metadata.IndexMetaData; import org.elasticsearch.common.Strings; import org.elasticsearch.common.inject.Inject; import org.elasticsearch.common.inject.assistedinject.Assisted; @@ -69,6 +71,8 @@ public class StemmerTokenFilterFactory extends AbstractTokenFilterFactory { @Override public TokenStream create(TokenStream tokenStream) { + final Version indexVersion = indexSettings.getAsVersion(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT); + if ("arabic".equalsIgnoreCase(language)) { return new ArabicStemFilter(tokenStream); } else if ("armenian".equalsIgnoreCase(language)) { @@ -85,90 +89,129 @@ public class StemmerTokenFilterFactory extends AbstractTokenFilterFactory { return new CzechStemFilter(tokenStream); } else if ("danish".equalsIgnoreCase(language)) { return new SnowballFilter(tokenStream, new DanishStemmer()); + + // Dutch stemmers } else if ("dutch".equalsIgnoreCase(language)) { return new SnowballFilter(tokenStream, new DutchStemmer()); - } else if ("english".equalsIgnoreCase(language)) { - return new SnowballFilter(tokenStream, new EnglishStemmer()); - } else if ("finnish".equalsIgnoreCase(language)) { - return new SnowballFilter(tokenStream, new FinnishStemmer()); - } else if ("french".equalsIgnoreCase(language)) { - return new SnowballFilter(tokenStream, new FrenchStemmer()); - } else if ("german".equalsIgnoreCase(language)) { - return new SnowballFilter(tokenStream, new GermanStemmer()); - } else if ("german2".equalsIgnoreCase(language)) { - return new 
SnowballFilter(tokenStream, new German2Stemmer()); - } else if ("hungarian".equalsIgnoreCase(language)) { - return new SnowballFilter(tokenStream, new HungarianStemmer()); - } else if ("italian".equalsIgnoreCase(language)) { - return new SnowballFilter(tokenStream, new ItalianStemmer()); - } else if ("kp".equalsIgnoreCase(language)) { + } else if ("dutch_kp".equalsIgnoreCase(language) || "dutchKp".equalsIgnoreCase(language) || "kp".equalsIgnoreCase(language)) { return new SnowballFilter(tokenStream, new KpStemmer()); - } else if ("kstem".equalsIgnoreCase(language)) { + + // English stemmers + } else if ("english".equalsIgnoreCase(language)) { + if (indexVersion.onOrAfter(Version.V_1_3_0)) { + return new PorterStemFilter(tokenStream); + } else { + return new SnowballFilter(tokenStream, new EnglishStemmer()); + } + } else if ("light_english".equalsIgnoreCase(language) || "lightEnglish".equalsIgnoreCase(language) + || "kstem".equalsIgnoreCase(language)) { return new KStemFilter(tokenStream); } else if ("lovins".equalsIgnoreCase(language)) { return new SnowballFilter(tokenStream, new LovinsStemmer()); - } else if ("latvian".equalsIgnoreCase(language)) { - return new LatvianStemFilter(tokenStream); - } else if ("norwegian".equalsIgnoreCase(language)) { - return new SnowballFilter(tokenStream, new NorwegianStemmer()); - } else if ("minimal_norwegian".equalsIgnoreCase(language) || "minimalNorwegian".equals(language)) { - return new NorwegianMinimalStemFilter(tokenStream); } else if ("porter".equalsIgnoreCase(language)) { return new PorterStemFilter(tokenStream); } else if ("porter2".equalsIgnoreCase(language)) { - return new SnowballFilter(tokenStream, new PorterStemmer()); - } else if ("portuguese".equalsIgnoreCase(language)) { - return new SnowballFilter(tokenStream, new PortugueseStemmer()); - } else if ("romanian".equalsIgnoreCase(language)) { - return new SnowballFilter(tokenStream, new RomanianStemmer()); - } else if ("russian".equalsIgnoreCase(language)) { - return 
new SnowballFilter(tokenStream, new RussianStemmer()); - } else if ("spanish".equalsIgnoreCase(language)) { - return new SnowballFilter(tokenStream, new SpanishStemmer()); - } else if ("swedish".equalsIgnoreCase(language)) { - return new SnowballFilter(tokenStream, new SwedishStemmer()); - } else if ("turkish".equalsIgnoreCase(language)) { - return new SnowballFilter(tokenStream, new TurkishStemmer()); + if (indexVersion.onOrAfter(Version.V_1_3_0)) { + return new SnowballFilter(tokenStream, new EnglishStemmer()); + } else { + return new SnowballFilter(tokenStream, new PorterStemmer()); + } } else if ("minimal_english".equalsIgnoreCase(language) || "minimalEnglish".equalsIgnoreCase(language)) { return new EnglishMinimalStemFilter(tokenStream); } else if ("possessive_english".equalsIgnoreCase(language) || "possessiveEnglish".equalsIgnoreCase(language)) { return new EnglishPossessiveFilter(version, tokenStream); + + // Finnish stemmers + } else if ("finnish".equalsIgnoreCase(language)) { + return new SnowballFilter(tokenStream, new FinnishStemmer()); } else if ("light_finish".equalsIgnoreCase(language) || "lightFinish".equalsIgnoreCase(language)) { // leaving this for backward compatibility return new FinnishLightStemFilter(tokenStream); } else if ("light_finnish".equalsIgnoreCase(language) || "lightFinnish".equalsIgnoreCase(language)) { return new FinnishLightStemFilter(tokenStream); + + // French stemmers + } else if ("french".equalsIgnoreCase(language)) { + return new SnowballFilter(tokenStream, new FrenchStemmer()); } else if ("light_french".equalsIgnoreCase(language) || "lightFrench".equalsIgnoreCase(language)) { return new FrenchLightStemFilter(tokenStream); } else if ("minimal_french".equalsIgnoreCase(language) || "minimalFrench".equalsIgnoreCase(language)) { return new FrenchMinimalStemFilter(tokenStream); + + // German stemmers + } else if ("german".equalsIgnoreCase(language)) { + return new SnowballFilter(tokenStream, new GermanStemmer()); + } else if 
("german2".equalsIgnoreCase(language)) { + return new SnowballFilter(tokenStream, new German2Stemmer()); } else if ("light_german".equalsIgnoreCase(language) || "lightGerman".equalsIgnoreCase(language)) { return new GermanLightStemFilter(tokenStream); } else if ("minimal_german".equalsIgnoreCase(language) || "minimalGerman".equalsIgnoreCase(language)) { return new GermanMinimalStemFilter(tokenStream); + + } else if ("greek".equalsIgnoreCase(language)) { + return new GreekStemFilter(tokenStream); } else if ("hindi".equalsIgnoreCase(language)) { return new HindiStemFilter(tokenStream); + + // Hungarian stemmers + } else if ("hungarian".equalsIgnoreCase(language)) { + return new SnowballFilter(tokenStream, new HungarianStemmer()); } else if ("light_hungarian".equalsIgnoreCase(language) || "lightHungarian".equalsIgnoreCase(language)) { return new HungarianLightStemFilter(tokenStream); + } else if ("indonesian".equalsIgnoreCase(language)) { return new IndonesianStemFilter(tokenStream); + + // Italian stemmers + } else if ("italian".equalsIgnoreCase(language)) { + return new SnowballFilter(tokenStream, new ItalianStemmer()); } else if ("light_italian".equalsIgnoreCase(language) || "lightItalian".equalsIgnoreCase(language)) { return new ItalianLightStemFilter(tokenStream); + + } else if ("latvian".equalsIgnoreCase(language)) { + return new LatvianStemFilter(tokenStream); + + // Norwegian stemmers + } else if ("norwegian".equalsIgnoreCase(language)) { + return new SnowballFilter(tokenStream, new NorwegianStemmer()); + } else if ("minimal_norwegian".equalsIgnoreCase(language) || "minimalNorwegian".equals(language)) { + return new NorwegianMinimalStemFilter(tokenStream); + + // Portuguese stemmers + } else if ("portuguese".equalsIgnoreCase(language)) { + return new SnowballFilter(tokenStream, new PortugueseStemmer()); } else if ("light_portuguese".equalsIgnoreCase(language) || "lightPortuguese".equalsIgnoreCase(language)) { return new PortugueseLightStemFilter(tokenStream); 
} else if ("minimal_portuguese".equalsIgnoreCase(language) || "minimalPortuguese".equalsIgnoreCase(language)) { return new PortugueseMinimalStemFilter(tokenStream); - } else if ("portuguese".equalsIgnoreCase(language)) { + } else if ("portuguese_rslp".equalsIgnoreCase(language)) { return new PortugueseStemFilter(tokenStream); + + } else if ("romanian".equalsIgnoreCase(language)) { + return new SnowballFilter(tokenStream, new RomanianStemmer()); + + // Russian stemmers + } else if ("russian".equalsIgnoreCase(language)) { + return new SnowballFilter(tokenStream, new RussianStemmer()); } else if ("light_russian".equalsIgnoreCase(language) || "lightRussian".equalsIgnoreCase(language)) { return new RussianLightStemFilter(tokenStream); + + // Spanish stemmers + } else if ("spanish".equalsIgnoreCase(language)) { + return new SnowballFilter(tokenStream, new SpanishStemmer()); } else if ("light_spanish".equalsIgnoreCase(language) || "lightSpanish".equalsIgnoreCase(language)) { return new SpanishLightStemFilter(tokenStream); + + // Swedish stemmers + } else if ("swedish".equalsIgnoreCase(language)) { + return new SnowballFilter(tokenStream, new SwedishStemmer()); } else if ("light_swedish".equalsIgnoreCase(language) || "lightSwedish".equalsIgnoreCase(language)) { return new SwedishLightStemFilter(tokenStream); - } else if ("greek".equalsIgnoreCase(language)) { - return new GreekStemFilter(tokenStream); + + } else if ("turkish".equalsIgnoreCase(language)) { + return new SnowballFilter(tokenStream, new TurkishStemmer()); } + return new SnowballFilter(tokenStream, language); } diff --git a/src/test/java/org/elasticsearch/index/analysis/StemmerTokenFilterFactoryTests.java b/src/test/java/org/elasticsearch/index/analysis/StemmerTokenFilterFactoryTests.java new file mode 100644 index 00000000000..1ceaea7cef1 --- /dev/null +++ b/src/test/java/org/elasticsearch/index/analysis/StemmerTokenFilterFactoryTests.java @@ -0,0 +1,105 @@ +/* + * Licensed to Elasticsearch under one or more 
contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.apache.lucene.analysis.en.PorterStemFilter; +import org.apache.lucene.analysis.snowball.SnowballFilter; +import org.elasticsearch.Version; +import org.elasticsearch.common.settings.ImmutableSettings; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.test.ElasticsearchTestCase; +import org.elasticsearch.test.ElasticsearchTokenStreamTestCase; +import org.junit.Test; + +import java.io.IOException; +import java.io.StringReader; + +import static com.carrotsearch.randomizedtesting.RandomizedTest.scaledRandomIntBetween; +import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_VERSION_CREATED; +import static org.hamcrest.Matchers.instanceOf; + +/** + * Backwards-compatibility tests for {@link StemmerTokenFilterFactory}: for randomly chosen index-created versions they check which stemmer the {@code english} and {@code porter2} language names resolve to before and from {@code Version.V_1_3_0}. + */ +public class StemmerTokenFilterFactoryTests extends ElasticsearchTokenStreamTestCase { + /** Checks the {@code english} language name: a {@link PorterStemFilter} is expected on or after 1.3.0, a {@link SnowballFilter} before it. */ + @Test + public void testEnglishBackwardsCompatibility() throws IOException { + int iters = scaledRandomIntBetween(20, 100); + for (int i = 0; i < iters; i++) { + + Version v = ElasticsearchTestCase.randomVersion(random()); + Settings settings = ImmutableSettings.settingsBuilder() + 
.put("index.analysis.filter.my_english.type", "stemmer") + .put("index.analysis.filter.my_english.language", "english") + .put("index.analysis.analyzer.my_english.tokenizer","whitespace") + .put("index.analysis.analyzer.my_english.filter","my_english") + .put(SETTING_VERSION_CREATED,v) + .build(); + + AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings); + TokenFilterFactory tokenFilter = analysisService.tokenFilter("my_english"); + assertThat(tokenFilter, instanceOf(StemmerTokenFilterFactory.class)); + TokenStream create = tokenFilter.create(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("foo bar"))); + NamedAnalyzer analyzer = analysisService.analyzer("my_english"); + + if (v.onOrAfter(Version.V_1_3_0)) { /* index created on or after 1.3.0: the Porter stem filter is expected */ + assertThat(create, instanceOf(PorterStemFilter.class)); + assertAnalyzesTo(analyzer, "consolingly", new String[]{"consolingli"}); + } else { /* index created before 1.3.0: a Snowball filter is expected, giving a shorter stem */ + assertThat(create, instanceOf(SnowballFilter.class)); + assertAnalyzesTo(analyzer, "consolingly", new String[]{"consol"}); + } + } + + } + /** Checks the {@code porter2} language name: always a {@link SnowballFilter}, but the expected stem differs before and from 1.3.0. */ + @Test + public void testPorter2BackwardsCompatibility() throws IOException { + int iters = scaledRandomIntBetween(20, 100); + for (int i = 0; i < iters; i++) { + + Version v = ElasticsearchTestCase.randomVersion(random()); + Settings settings = ImmutableSettings.settingsBuilder() + .put("index.analysis.filter.my_porter2.type", "stemmer") + .put("index.analysis.filter.my_porter2.language", "porter2") + .put("index.analysis.analyzer.my_porter2.tokenizer","whitespace") + .put("index.analysis.analyzer.my_porter2.filter","my_porter2") + .put(SETTING_VERSION_CREATED,v) + .build(); + + AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings); + TokenFilterFactory tokenFilter = analysisService.tokenFilter("my_porter2"); + assertThat(tokenFilter, instanceOf(StemmerTokenFilterFactory.class)); + TokenStream create = tokenFilter.create(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new 
StringReader("foo bar"))); + NamedAnalyzer analyzer = analysisService.analyzer("my_porter2"); + assertThat(create, instanceOf(SnowballFilter.class)); /* the filter class is the same for every version; only the produced stem changes */ + + if (v.onOrAfter(Version.V_1_3_0)) { /* index created on or after 1.3.0 */ + assertAnalyzesTo(analyzer, "possibly", new String[]{"possibl"}); + } else { /* index created before 1.3.0 */ + assertAnalyzesTo(analyzer, "possibly", new String[]{"possibli"}); + } + } + + } + +}