The StemmerTokenFilter had a number of issues:
* `english` returned the slow snowball English stemmer
* `porter2` returned the snowball Porter stemmer (v1)
* `portuguese` was used twice, preventing the second version from working

Changes:

* `english` now returns the fast PorterStemmer (for indices created from v1.3.0 onwards)
* `porter2` now returns the snowball English stemmer (for indices created from v1.3.0 onwards)
* `light_english` now returns the `kstem` stemmer (`kstem` still works)
* `portuguese_rslp` returns the RSLP-based PortugueseStemFilter (previously unreachable behind the duplicate `portuguese` name)
* `dutch_kp` is a synonym for `kp`

Tests and docs updated

Fixes #6345
Fixes #6213
Fixes #6330
This commit is contained in:
parent
c25de57d5d
commit
673ef3db3f
|
@ -1,59 +1,8 @@
|
||||||
[[analysis-stemmer-tokenfilter]]
|
[[analysis-stemmer-tokenfilter]]
|
||||||
=== Stemmer Token Filter
|
=== Stemmer Token Filter
|
||||||
|
|
||||||
A filter that stems words (similar to `snowball`, but with more
|
A filter that provides access to (almost) all of the available stemming token
|
||||||
options). The `language`/`name` parameter controls the stemmer with the
|
filters through a single unified interface. For example:
|
||||||
following available values:
|
|
||||||
|
|
||||||
http://lucene.apache.org/core/4_3_0/analyzers-common/index.html?org%2Fapache%2Flucene%2Fanalysis%2Far%2FArabicStemmer.html[arabic],
|
|
||||||
http://snowball.tartarus.org/algorithms/armenian/stemmer.html[armenian],
|
|
||||||
http://snowball.tartarus.org/algorithms/basque/stemmer.html[basque],
|
|
||||||
http://lucene.apache.org/core/4_3_0/analyzers-common/index.html?org%2Fapache%2Flucene%2Fanalysis%2Fbr%2FBrazilianStemmer.html[brazilian],
|
|
||||||
http://members.unine.ch/jacques.savoy/Papers/BUIR.pdf[bulgarian],
|
|
||||||
http://snowball.tartarus.org/algorithms/catalan/stemmer.html[catalan],
|
|
||||||
http://portal.acm.org/citation.cfm?id=1598600[czech],
|
|
||||||
http://snowball.tartarus.org/algorithms/danish/stemmer.html[danish],
|
|
||||||
http://snowball.tartarus.org/algorithms/dutch/stemmer.html[dutch],
|
|
||||||
http://snowball.tartarus.org/algorithms/english/stemmer.html[english],
|
|
||||||
http://snowball.tartarus.org/algorithms/finnish/stemmer.html[finnish],
|
|
||||||
http://snowball.tartarus.org/algorithms/french/stemmer.html[french],
|
|
||||||
http://snowball.tartarus.org/algorithms/german/stemmer.html[german],
|
|
||||||
http://snowball.tartarus.org/algorithms/german2/stemmer.html[german2],
|
|
||||||
http://sais.se/mthprize/2007/ntais2007.pdf[greek],
|
|
||||||
http://snowball.tartarus.org/algorithms/hungarian/stemmer.html[hungarian],
|
|
||||||
http://snowball.tartarus.org/algorithms/italian/stemmer.html[italian],
|
|
||||||
http://snowball.tartarus.org/algorithms/kraaij_pohlmann/stemmer.html[kp],
|
|
||||||
http://ciir.cs.umass.edu/pubfiles/ir-35.pdf[kstem],
|
|
||||||
http://snowball.tartarus.org/algorithms/lovins/stemmer.html[lovins],
|
|
||||||
http://lucene.apache.org/core/4_3_0/analyzers-common/index.html?org%2Fapache%2Flucene%2Fanalysis%2Flv%2FLatvianStemmer.html[latvian],
|
|
||||||
http://snowball.tartarus.org/algorithms/norwegian/stemmer.html[norwegian],
|
|
||||||
http://lucene.apache.org/core/4_3_0/analyzers-common/index.html?org%2Fapache%2Flucene%2Fanalysis%2Fno%2FNorwegianMinimalStemFilter.html[minimal_norwegian],
|
|
||||||
http://snowball.tartarus.org/algorithms/porter/stemmer.html[porter],
|
|
||||||
http://snowball.tartarus.org/algorithms/portuguese/stemmer.html[portuguese],
|
|
||||||
http://snowball.tartarus.org/algorithms/romanian/stemmer.html[romanian],
|
|
||||||
http://snowball.tartarus.org/algorithms/russian/stemmer.html[russian],
|
|
||||||
http://snowball.tartarus.org/algorithms/spanish/stemmer.html[spanish],
|
|
||||||
http://snowball.tartarus.org/algorithms/swedish/stemmer.html[swedish],
|
|
||||||
http://snowball.tartarus.org/algorithms/turkish/stemmer.html[turkish],
|
|
||||||
http://www.medialab.tfe.umu.se/courses/mdm0506a/material/fulltext_ID%3D10049387%26PLACEBO%3DIE.pdf[minimal_english],
|
|
||||||
http://lucene.apache.org/core/4_3_0/analyzers-common/index.html?org%2Fapache%2Flucene%2Fanalysis%2Fen%2FEnglishPossessiveFilter.html[possessive_english],
|
|
||||||
http://clef.isti.cnr.it/2003/WN_web/22.pdf[light_finnish],
|
|
||||||
http://dl.acm.org/citation.cfm?id=1141523[light_french],
|
|
||||||
http://dl.acm.org/citation.cfm?id=318984[minimal_french],
|
|
||||||
http://dl.acm.org/citation.cfm?id=1141523[light_german],
|
|
||||||
http://members.unine.ch/jacques.savoy/clef/morpho.pdf[minimal_german],
|
|
||||||
http://computing.open.ac.uk/Sites/EACLSouthAsia/Papers/p6-Ramanathan.pdf[hindi],
|
|
||||||
http://dl.acm.org/citation.cfm?id=1141523&dl=ACM&coll=DL&CFID=179095584&CFTOKEN=80067181[light_hungarian],
|
|
||||||
http://www.illc.uva.nl/Publications/ResearchReports/MoL-2003-02.text.pdf[indonesian],
|
|
||||||
http://www.ercim.eu/publication/ws-proceedings/CLEF2/savoy.pdf[light_italian],
|
|
||||||
http://dl.acm.org/citation.cfm?id=1141523&dl=ACM&coll=DL&CFID=179095584&CFTOKEN=80067181[light_portuguese],
|
|
||||||
http://www.inf.ufrgs.br/\~buriol/papers/Orengo_CLEF07.pdf[minimal_portuguese],
|
|
||||||
http://www.inf.ufrgs.br/\~viviane/rslp/index.htm[portuguese],
|
|
||||||
http://doc.rero.ch/lm.php?url=1000%2C43%2C4%2C20091209094227-CA%2FDolamic_Ljiljana_-_Indexing_and_Searching_Strategies_for_the_Russian_20091209.pdf[light_russian],
|
|
||||||
http://www.ercim.eu/publication/ws-proceedings/CLEF2/savoy.pdf[light_spanish],
|
|
||||||
http://clef.isti.cnr.it/2003/WN_web/22.pdf[light_swedish].
|
|
||||||
|
|
||||||
For example:
|
|
||||||
|
|
||||||
[source,js]
|
[source,js]
|
||||||
--------------------------------------------------
|
--------------------------------------------------
|
||||||
|
@ -76,3 +25,134 @@ For example:
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
--------------------------------------------------
|
--------------------------------------------------
|
||||||
|
|
||||||
|
The `language`/`name` parameter controls the stemmer with the following
|
||||||
|
available values (the preferred filters are marked in *bold*):
|
||||||
|
|
||||||
|
[horizontal]
|
||||||
|
Arabic::
|
||||||
|
|
||||||
|
http://lucene.apache.org/core/4_3_0/analyzers-common/index.html?org%2Fapache%2Flucene%2Fanalysis%2Far%2FArabicStemmer.html[*`arabic`*]
|
||||||
|
|
||||||
|
Armenian::
|
||||||
|
|
||||||
|
http://snowball.tartarus.org/algorithms/armenian/stemmer.html[*`armenian`*]
|
||||||
|
|
||||||
|
Basque::
|
||||||
|
|
||||||
|
http://snowball.tartarus.org/algorithms/basque/stemmer.html[*`basque`*]
|
||||||
|
|
||||||
|
Brazilian Portuguese::
|
||||||
|
|
||||||
|
http://lucene.apache.org/core/4_3_0/analyzers-common/index.html?org%2Fapache%2Flucene%2Fanalysis%2Fbr%2FBrazilianStemmer.html[*`brazilian`*]
|
||||||
|
|
||||||
|
Bulgarian::
|
||||||
|
|
||||||
|
http://members.unine.ch/jacques.savoy/Papers/BUIR.pdf[*`bulgarian`*]
|
||||||
|
|
||||||
|
Catalan::
|
||||||
|
|
||||||
|
http://snowball.tartarus.org/algorithms/catalan/stemmer.html[*`catalan`*]
|
||||||
|
|
||||||
|
Czech::
|
||||||
|
|
||||||
|
http://portal.acm.org/citation.cfm?id=1598600[*`czech`*]
|
||||||
|
|
||||||
|
Danish::
|
||||||
|
|
||||||
|
http://snowball.tartarus.org/algorithms/danish/stemmer.html[*`danish`*]
|
||||||
|
|
||||||
|
Dutch::
|
||||||
|
|
||||||
|
http://snowball.tartarus.org/algorithms/dutch/stemmer.html[*`dutch`*],
|
||||||
|
http://snowball.tartarus.org/algorithms/kraaij_pohlmann/stemmer.html[`dutch_kp`] coming[1.3.0,Renamed from `kp` (`kp` still works)]
|
||||||
|
|
||||||
|
English::
|
||||||
|
|
||||||
|
http://snowball.tartarus.org/algorithms/porter/stemmer.html[*`english`*] coming[1.3.0,Returns the <<analysis-porterstem-tokenfilter,`porter_stem` token filter>> instead of the <<analysis-snowball-tokenfilter,`english` Snowball token filter>>],
|
||||||
|
http://ciir.cs.umass.edu/pubfiles/ir-35.pdf[`light_english`] coming[1.3.0,Returns the <<analysis-kstem-tokenfilter,`kstem` token filter>>],
|
||||||
|
http://www.medialab.tfe.umu.se/courses/mdm0506a/material/fulltext_ID%3D10049387%26PLACEBO%3DIE.pdf[`minimal_english`],
|
||||||
|
http://lucene.apache.org/core/4_3_0/analyzers-common/index.html?org%2Fapache%2Flucene%2Fanalysis%2Fen%2FEnglishPossessiveFilter.html[`possessive_english`],
|
||||||
|
http://snowball.tartarus.org/algorithms/english/stemmer.html[`porter2`] coming[1.3.0,Returns the <<analysis-snowball-tokenfilter,`english` Snowball token filter>> instead of the <<analysis-snowball-tokenfilter,`porter` Snowball token filter>>],
|
||||||
|
http://snowball.tartarus.org/algorithms/lovins/stemmer.html[`lovins`]
|
||||||
|
|
||||||
|
Finnish::
|
||||||
|
|
||||||
|
http://snowball.tartarus.org/algorithms/finnish/stemmer.html[*`finnish`*],
|
||||||
|
http://clef.isti.cnr.it/2003/WN_web/22.pdf[`light_finnish`]
|
||||||
|
|
||||||
|
French::
|
||||||
|
|
||||||
|
http://snowball.tartarus.org/algorithms/french/stemmer.html[`french`],
|
||||||
|
http://dl.acm.org/citation.cfm?id=1141523[*`light_french`*],
|
||||||
|
http://dl.acm.org/citation.cfm?id=318984[`minimal_french`]
|
||||||
|
|
||||||
|
German::
|
||||||
|
|
||||||
|
http://snowball.tartarus.org/algorithms/german/stemmer.html[`german`],
|
||||||
|
http://snowball.tartarus.org/algorithms/german2/stemmer.html[`german2`],
|
||||||
|
http://dl.acm.org/citation.cfm?id=1141523[*`light_german`*],
|
||||||
|
http://members.unine.ch/jacques.savoy/clef/morpho.pdf[`minimal_german`]
|
||||||
|
|
||||||
|
Greek::
|
||||||
|
|
||||||
|
http://sais.se/mthprize/2007/ntais2007.pdf[*`greek`*]
|
||||||
|
|
||||||
|
Hindi::
|
||||||
|
|
||||||
|
http://computing.open.ac.uk/Sites/EACLSouthAsia/Papers/p6-Ramanathan.pdf[*`hindi`*]
|
||||||
|
|
||||||
|
Hungarian::
|
||||||
|
|
||||||
|
http://snowball.tartarus.org/algorithms/hungarian/stemmer.html[*`hungarian`*],
|
||||||
|
http://dl.acm.org/citation.cfm?id=1141523&dl=ACM&coll=DL&CFID=179095584&CFTOKEN=80067181[`light_hungarian`]
|
||||||
|
|
||||||
|
Indonesian::
|
||||||
|
|
||||||
|
http://www.illc.uva.nl/Publications/ResearchReports/MoL-2003-02.text.pdf[*`indonesian`*]
|
||||||
|
|
||||||
|
Italian::
|
||||||
|
|
||||||
|
http://snowball.tartarus.org/algorithms/italian/stemmer.html[`italian`],
|
||||||
|
http://www.ercim.eu/publication/ws-proceedings/CLEF2/savoy.pdf[*`light_italian`*]
|
||||||
|
|
||||||
|
Latvian::
|
||||||
|
|
||||||
|
http://lucene.apache.org/core/4_3_0/analyzers-common/index.html?org%2Fapache%2Flucene%2Fanalysis%2Flv%2FLatvianStemmer.html[*`latvian`*]
|
||||||
|
|
||||||
|
Norwegian::
|
||||||
|
|
||||||
|
http://snowball.tartarus.org/algorithms/norwegian/stemmer.html[*`norwegian`*],
|
||||||
|
http://lucene.apache.org/core/4_3_0/analyzers-common/index.html?org%2Fapache%2Flucene%2Fanalysis%2Fno%2FNorwegianMinimalStemFilter.html[`minimal_norwegian`]
|
||||||
|
|
||||||
|
Portuguese::
|
||||||
|
|
||||||
|
http://snowball.tartarus.org/algorithms/portuguese/stemmer.html[`portuguese`],
|
||||||
|
http://dl.acm.org/citation.cfm?id=1141523&dl=ACM&coll=DL&CFID=179095584&CFTOKEN=80067181[*`light_portuguese`*],
|
||||||
|
http://www.inf.ufrgs.br/\~buriol/papers/Orengo_CLEF07.pdf[`minimal_portuguese`],
|
||||||
|
http://www.inf.ufrgs.br/\~viviane/rslp/index.htm[`portuguese_rslp`] coming[1.3.0]
|
||||||
|
|
||||||
|
|
||||||
|
Romanian::
|
||||||
|
|
||||||
|
http://snowball.tartarus.org/algorithms/romanian/stemmer.html[*`romanian`*]
|
||||||
|
|
||||||
|
Russian::
|
||||||
|
|
||||||
|
http://snowball.tartarus.org/algorithms/russian/stemmer.html[*`russian`*],
|
||||||
|
http://doc.rero.ch/lm.php?url=1000%2C43%2C4%2C20091209094227-CA%2FDolamic_Ljiljana_-_Indexing_and_Searching_Strategies_for_the_Russian_20091209.pdf[`light_russian`]
|
||||||
|
|
||||||
|
Spanish::
|
||||||
|
|
||||||
|
http://snowball.tartarus.org/algorithms/spanish/stemmer.html[`spanish`],
|
||||||
|
http://www.ercim.eu/publication/ws-proceedings/CLEF2/savoy.pdf[*`light_spanish`*]
|
||||||
|
|
||||||
|
Swedish::
|
||||||
|
|
||||||
|
http://snowball.tartarus.org/algorithms/swedish/stemmer.html[*`swedish`*],
|
||||||
|
http://clef.isti.cnr.it/2003/WN_web/22.pdf[`light_swedish`]
|
||||||
|
|
||||||
|
Turkish::
|
||||||
|
|
||||||
|
http://snowball.tartarus.org/algorithms/turkish/stemmer.html[*`turkish`*]
|
||||||
|
|
||||||
|
|
|
@ -47,6 +47,8 @@ import org.apache.lucene.analysis.pt.PortugueseStemFilter;
|
||||||
import org.apache.lucene.analysis.ru.RussianLightStemFilter;
|
import org.apache.lucene.analysis.ru.RussianLightStemFilter;
|
||||||
import org.apache.lucene.analysis.snowball.SnowballFilter;
|
import org.apache.lucene.analysis.snowball.SnowballFilter;
|
||||||
import org.apache.lucene.analysis.sv.SwedishLightStemFilter;
|
import org.apache.lucene.analysis.sv.SwedishLightStemFilter;
|
||||||
|
import org.elasticsearch.Version;
|
||||||
|
import org.elasticsearch.cluster.metadata.IndexMetaData;
|
||||||
import org.elasticsearch.common.Strings;
|
import org.elasticsearch.common.Strings;
|
||||||
import org.elasticsearch.common.inject.Inject;
|
import org.elasticsearch.common.inject.Inject;
|
||||||
import org.elasticsearch.common.inject.assistedinject.Assisted;
|
import org.elasticsearch.common.inject.assistedinject.Assisted;
|
||||||
|
@ -69,6 +71,8 @@ public class StemmerTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public TokenStream create(TokenStream tokenStream) {
|
public TokenStream create(TokenStream tokenStream) {
|
||||||
|
final Version indexVersion = indexSettings.getAsVersion(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT);
|
||||||
|
|
||||||
if ("arabic".equalsIgnoreCase(language)) {
|
if ("arabic".equalsIgnoreCase(language)) {
|
||||||
return new ArabicStemFilter(tokenStream);
|
return new ArabicStemFilter(tokenStream);
|
||||||
} else if ("armenian".equalsIgnoreCase(language)) {
|
} else if ("armenian".equalsIgnoreCase(language)) {
|
||||||
|
@ -85,90 +89,129 @@ public class StemmerTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||||
return new CzechStemFilter(tokenStream);
|
return new CzechStemFilter(tokenStream);
|
||||||
} else if ("danish".equalsIgnoreCase(language)) {
|
} else if ("danish".equalsIgnoreCase(language)) {
|
||||||
return new SnowballFilter(tokenStream, new DanishStemmer());
|
return new SnowballFilter(tokenStream, new DanishStemmer());
|
||||||
|
|
||||||
|
// Dutch stemmers
|
||||||
} else if ("dutch".equalsIgnoreCase(language)) {
|
} else if ("dutch".equalsIgnoreCase(language)) {
|
||||||
return new SnowballFilter(tokenStream, new DutchStemmer());
|
return new SnowballFilter(tokenStream, new DutchStemmer());
|
||||||
} else if ("english".equalsIgnoreCase(language)) {
|
} else if ("dutch_kp".equalsIgnoreCase(language) || "dutchKp".equalsIgnoreCase(language) || "kp".equalsIgnoreCase(language)) {
|
||||||
return new SnowballFilter(tokenStream, new EnglishStemmer());
|
|
||||||
} else if ("finnish".equalsIgnoreCase(language)) {
|
|
||||||
return new SnowballFilter(tokenStream, new FinnishStemmer());
|
|
||||||
} else if ("french".equalsIgnoreCase(language)) {
|
|
||||||
return new SnowballFilter(tokenStream, new FrenchStemmer());
|
|
||||||
} else if ("german".equalsIgnoreCase(language)) {
|
|
||||||
return new SnowballFilter(tokenStream, new GermanStemmer());
|
|
||||||
} else if ("german2".equalsIgnoreCase(language)) {
|
|
||||||
return new SnowballFilter(tokenStream, new German2Stemmer());
|
|
||||||
} else if ("hungarian".equalsIgnoreCase(language)) {
|
|
||||||
return new SnowballFilter(tokenStream, new HungarianStemmer());
|
|
||||||
} else if ("italian".equalsIgnoreCase(language)) {
|
|
||||||
return new SnowballFilter(tokenStream, new ItalianStemmer());
|
|
||||||
} else if ("kp".equalsIgnoreCase(language)) {
|
|
||||||
return new SnowballFilter(tokenStream, new KpStemmer());
|
return new SnowballFilter(tokenStream, new KpStemmer());
|
||||||
} else if ("kstem".equalsIgnoreCase(language)) {
|
|
||||||
|
// English stemmers
|
||||||
|
} else if ("english".equalsIgnoreCase(language)) {
|
||||||
|
if (indexVersion.onOrAfter(Version.V_1_3_0)) {
|
||||||
|
return new PorterStemFilter(tokenStream);
|
||||||
|
} else {
|
||||||
|
return new SnowballFilter(tokenStream, new EnglishStemmer());
|
||||||
|
}
|
||||||
|
} else if ("light_english".equalsIgnoreCase(language) || "lightEnglish".equalsIgnoreCase(language)
|
||||||
|
|| "kstem".equalsIgnoreCase(language)) {
|
||||||
return new KStemFilter(tokenStream);
|
return new KStemFilter(tokenStream);
|
||||||
} else if ("lovins".equalsIgnoreCase(language)) {
|
} else if ("lovins".equalsIgnoreCase(language)) {
|
||||||
return new SnowballFilter(tokenStream, new LovinsStemmer());
|
return new SnowballFilter(tokenStream, new LovinsStemmer());
|
||||||
} else if ("latvian".equalsIgnoreCase(language)) {
|
|
||||||
return new LatvianStemFilter(tokenStream);
|
|
||||||
} else if ("norwegian".equalsIgnoreCase(language)) {
|
|
||||||
return new SnowballFilter(tokenStream, new NorwegianStemmer());
|
|
||||||
} else if ("minimal_norwegian".equalsIgnoreCase(language) || "minimalNorwegian".equals(language)) {
|
|
||||||
return new NorwegianMinimalStemFilter(tokenStream);
|
|
||||||
} else if ("porter".equalsIgnoreCase(language)) {
|
} else if ("porter".equalsIgnoreCase(language)) {
|
||||||
return new PorterStemFilter(tokenStream);
|
return new PorterStemFilter(tokenStream);
|
||||||
} else if ("porter2".equalsIgnoreCase(language)) {
|
} else if ("porter2".equalsIgnoreCase(language)) {
|
||||||
|
if (indexVersion.onOrAfter(Version.V_1_3_0)) {
|
||||||
|
return new SnowballFilter(tokenStream, new EnglishStemmer());
|
||||||
|
} else {
|
||||||
return new SnowballFilter(tokenStream, new PorterStemmer());
|
return new SnowballFilter(tokenStream, new PorterStemmer());
|
||||||
} else if ("portuguese".equalsIgnoreCase(language)) {
|
}
|
||||||
return new SnowballFilter(tokenStream, new PortugueseStemmer());
|
|
||||||
} else if ("romanian".equalsIgnoreCase(language)) {
|
|
||||||
return new SnowballFilter(tokenStream, new RomanianStemmer());
|
|
||||||
} else if ("russian".equalsIgnoreCase(language)) {
|
|
||||||
return new SnowballFilter(tokenStream, new RussianStemmer());
|
|
||||||
} else if ("spanish".equalsIgnoreCase(language)) {
|
|
||||||
return new SnowballFilter(tokenStream, new SpanishStemmer());
|
|
||||||
} else if ("swedish".equalsIgnoreCase(language)) {
|
|
||||||
return new SnowballFilter(tokenStream, new SwedishStemmer());
|
|
||||||
} else if ("turkish".equalsIgnoreCase(language)) {
|
|
||||||
return new SnowballFilter(tokenStream, new TurkishStemmer());
|
|
||||||
} else if ("minimal_english".equalsIgnoreCase(language) || "minimalEnglish".equalsIgnoreCase(language)) {
|
} else if ("minimal_english".equalsIgnoreCase(language) || "minimalEnglish".equalsIgnoreCase(language)) {
|
||||||
return new EnglishMinimalStemFilter(tokenStream);
|
return new EnglishMinimalStemFilter(tokenStream);
|
||||||
} else if ("possessive_english".equalsIgnoreCase(language) || "possessiveEnglish".equalsIgnoreCase(language)) {
|
} else if ("possessive_english".equalsIgnoreCase(language) || "possessiveEnglish".equalsIgnoreCase(language)) {
|
||||||
return new EnglishPossessiveFilter(version, tokenStream);
|
return new EnglishPossessiveFilter(version, tokenStream);
|
||||||
|
|
||||||
|
// Finnish stemmers
|
||||||
|
} else if ("finnish".equalsIgnoreCase(language)) {
|
||||||
|
return new SnowballFilter(tokenStream, new FinnishStemmer());
|
||||||
} else if ("light_finish".equalsIgnoreCase(language) || "lightFinish".equalsIgnoreCase(language)) {
|
} else if ("light_finish".equalsIgnoreCase(language) || "lightFinish".equalsIgnoreCase(language)) {
|
||||||
// leaving this for backward compatibility
|
// leaving this for backward compatibility
|
||||||
return new FinnishLightStemFilter(tokenStream);
|
return new FinnishLightStemFilter(tokenStream);
|
||||||
} else if ("light_finnish".equalsIgnoreCase(language) || "lightFinnish".equalsIgnoreCase(language)) {
|
} else if ("light_finnish".equalsIgnoreCase(language) || "lightFinnish".equalsIgnoreCase(language)) {
|
||||||
return new FinnishLightStemFilter(tokenStream);
|
return new FinnishLightStemFilter(tokenStream);
|
||||||
|
|
||||||
|
// French stemmers
|
||||||
|
} else if ("french".equalsIgnoreCase(language)) {
|
||||||
|
return new SnowballFilter(tokenStream, new FrenchStemmer());
|
||||||
} else if ("light_french".equalsIgnoreCase(language) || "lightFrench".equalsIgnoreCase(language)) {
|
} else if ("light_french".equalsIgnoreCase(language) || "lightFrench".equalsIgnoreCase(language)) {
|
||||||
return new FrenchLightStemFilter(tokenStream);
|
return new FrenchLightStemFilter(tokenStream);
|
||||||
} else if ("minimal_french".equalsIgnoreCase(language) || "minimalFrench".equalsIgnoreCase(language)) {
|
} else if ("minimal_french".equalsIgnoreCase(language) || "minimalFrench".equalsIgnoreCase(language)) {
|
||||||
return new FrenchMinimalStemFilter(tokenStream);
|
return new FrenchMinimalStemFilter(tokenStream);
|
||||||
|
|
||||||
|
// German stemmers
|
||||||
|
} else if ("german".equalsIgnoreCase(language)) {
|
||||||
|
return new SnowballFilter(tokenStream, new GermanStemmer());
|
||||||
|
} else if ("german2".equalsIgnoreCase(language)) {
|
||||||
|
return new SnowballFilter(tokenStream, new German2Stemmer());
|
||||||
} else if ("light_german".equalsIgnoreCase(language) || "lightGerman".equalsIgnoreCase(language)) {
|
} else if ("light_german".equalsIgnoreCase(language) || "lightGerman".equalsIgnoreCase(language)) {
|
||||||
return new GermanLightStemFilter(tokenStream);
|
return new GermanLightStemFilter(tokenStream);
|
||||||
} else if ("minimal_german".equalsIgnoreCase(language) || "minimalGerman".equalsIgnoreCase(language)) {
|
} else if ("minimal_german".equalsIgnoreCase(language) || "minimalGerman".equalsIgnoreCase(language)) {
|
||||||
return new GermanMinimalStemFilter(tokenStream);
|
return new GermanMinimalStemFilter(tokenStream);
|
||||||
|
|
||||||
|
} else if ("greek".equalsIgnoreCase(language)) {
|
||||||
|
return new GreekStemFilter(tokenStream);
|
||||||
} else if ("hindi".equalsIgnoreCase(language)) {
|
} else if ("hindi".equalsIgnoreCase(language)) {
|
||||||
return new HindiStemFilter(tokenStream);
|
return new HindiStemFilter(tokenStream);
|
||||||
|
|
||||||
|
// Hungarian stemmers
|
||||||
|
} else if ("hungarian".equalsIgnoreCase(language)) {
|
||||||
|
return new SnowballFilter(tokenStream, new HungarianStemmer());
|
||||||
} else if ("light_hungarian".equalsIgnoreCase(language) || "lightHungarian".equalsIgnoreCase(language)) {
|
} else if ("light_hungarian".equalsIgnoreCase(language) || "lightHungarian".equalsIgnoreCase(language)) {
|
||||||
return new HungarianLightStemFilter(tokenStream);
|
return new HungarianLightStemFilter(tokenStream);
|
||||||
|
|
||||||
} else if ("indonesian".equalsIgnoreCase(language)) {
|
} else if ("indonesian".equalsIgnoreCase(language)) {
|
||||||
return new IndonesianStemFilter(tokenStream);
|
return new IndonesianStemFilter(tokenStream);
|
||||||
|
|
||||||
|
// Italian stemmers
|
||||||
|
} else if ("italian".equalsIgnoreCase(language)) {
|
||||||
|
return new SnowballFilter(tokenStream, new ItalianStemmer());
|
||||||
} else if ("light_italian".equalsIgnoreCase(language) || "lightItalian".equalsIgnoreCase(language)) {
|
} else if ("light_italian".equalsIgnoreCase(language) || "lightItalian".equalsIgnoreCase(language)) {
|
||||||
return new ItalianLightStemFilter(tokenStream);
|
return new ItalianLightStemFilter(tokenStream);
|
||||||
|
|
||||||
|
} else if ("latvian".equalsIgnoreCase(language)) {
|
||||||
|
return new LatvianStemFilter(tokenStream);
|
||||||
|
|
||||||
|
// Norwegian stemmers
|
||||||
|
} else if ("norwegian".equalsIgnoreCase(language)) {
|
||||||
|
return new SnowballFilter(tokenStream, new NorwegianStemmer());
|
||||||
|
} else if ("minimal_norwegian".equalsIgnoreCase(language) || "minimalNorwegian".equals(language)) {
|
||||||
|
return new NorwegianMinimalStemFilter(tokenStream);
|
||||||
|
|
||||||
|
// Portuguese stemmers
|
||||||
|
} else if ("portuguese".equalsIgnoreCase(language)) {
|
||||||
|
return new SnowballFilter(tokenStream, new PortugueseStemmer());
|
||||||
} else if ("light_portuguese".equalsIgnoreCase(language) || "lightPortuguese".equalsIgnoreCase(language)) {
|
} else if ("light_portuguese".equalsIgnoreCase(language) || "lightPortuguese".equalsIgnoreCase(language)) {
|
||||||
return new PortugueseLightStemFilter(tokenStream);
|
return new PortugueseLightStemFilter(tokenStream);
|
||||||
} else if ("minimal_portuguese".equalsIgnoreCase(language) || "minimalPortuguese".equalsIgnoreCase(language)) {
|
} else if ("minimal_portuguese".equalsIgnoreCase(language) || "minimalPortuguese".equalsIgnoreCase(language)) {
|
||||||
return new PortugueseMinimalStemFilter(tokenStream);
|
return new PortugueseMinimalStemFilter(tokenStream);
|
||||||
} else if ("portuguese".equalsIgnoreCase(language)) {
|
} else if ("portuguese_rslp".equalsIgnoreCase(language)) {
|
||||||
return new PortugueseStemFilter(tokenStream);
|
return new PortugueseStemFilter(tokenStream);
|
||||||
|
|
||||||
|
} else if ("romanian".equalsIgnoreCase(language)) {
|
||||||
|
return new SnowballFilter(tokenStream, new RomanianStemmer());
|
||||||
|
|
||||||
|
// Russian stemmers
|
||||||
|
} else if ("russian".equalsIgnoreCase(language)) {
|
||||||
|
return new SnowballFilter(tokenStream, new RussianStemmer());
|
||||||
} else if ("light_russian".equalsIgnoreCase(language) || "lightRussian".equalsIgnoreCase(language)) {
|
} else if ("light_russian".equalsIgnoreCase(language) || "lightRussian".equalsIgnoreCase(language)) {
|
||||||
return new RussianLightStemFilter(tokenStream);
|
return new RussianLightStemFilter(tokenStream);
|
||||||
|
|
||||||
|
// Spanish stemmers
|
||||||
|
} else if ("spanish".equalsIgnoreCase(language)) {
|
||||||
|
return new SnowballFilter(tokenStream, new SpanishStemmer());
|
||||||
} else if ("light_spanish".equalsIgnoreCase(language) || "lightSpanish".equalsIgnoreCase(language)) {
|
} else if ("light_spanish".equalsIgnoreCase(language) || "lightSpanish".equalsIgnoreCase(language)) {
|
||||||
return new SpanishLightStemFilter(tokenStream);
|
return new SpanishLightStemFilter(tokenStream);
|
||||||
|
|
||||||
|
// Swedish stemmers
|
||||||
|
} else if ("swedish".equalsIgnoreCase(language)) {
|
||||||
|
return new SnowballFilter(tokenStream, new SwedishStemmer());
|
||||||
} else if ("light_swedish".equalsIgnoreCase(language) || "lightSwedish".equalsIgnoreCase(language)) {
|
} else if ("light_swedish".equalsIgnoreCase(language) || "lightSwedish".equalsIgnoreCase(language)) {
|
||||||
return new SwedishLightStemFilter(tokenStream);
|
return new SwedishLightStemFilter(tokenStream);
|
||||||
} else if ("greek".equalsIgnoreCase(language)) {
|
|
||||||
return new GreekStemFilter(tokenStream);
|
} else if ("turkish".equalsIgnoreCase(language)) {
|
||||||
|
return new SnowballFilter(tokenStream, new TurkishStemmer());
|
||||||
}
|
}
|
||||||
|
|
||||||
return new SnowballFilter(tokenStream, language);
|
return new SnowballFilter(tokenStream, language);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,105 @@
|
||||||
|
/*
|
||||||
|
* Licensed to Elasticsearch under one or more contributor
|
||||||
|
* license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright
|
||||||
|
* ownership. Elasticsearch licenses this file to you under
|
||||||
|
* the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
* not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing,
|
||||||
|
* software distributed under the License is distributed on an
|
||||||
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
* KIND, either express or implied. See the License for the
|
||||||
|
* specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*/
|
||||||
|
package org.elasticsearch.index.analysis;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
|
import org.apache.lucene.analysis.en.PorterStemFilter;
|
||||||
|
import org.apache.lucene.analysis.snowball.SnowballFilter;
|
||||||
|
import org.elasticsearch.Version;
|
||||||
|
import org.elasticsearch.common.settings.ImmutableSettings;
|
||||||
|
import org.elasticsearch.common.settings.Settings;
|
||||||
|
import org.elasticsearch.test.ElasticsearchTestCase;
|
||||||
|
import org.elasticsearch.test.ElasticsearchTokenStreamTestCase;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.StringReader;
|
||||||
|
|
||||||
|
import static com.carrotsearch.randomizedtesting.RandomizedTest.scaledRandomIntBetween;
|
||||||
|
import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_VERSION_CREATED;
|
||||||
|
import static org.hamcrest.Matchers.instanceOf;
|
||||||
|
|
||||||
|
/**
|
||||||
|
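* Verifies that the `stemmer` token filter resolves the `english` and `porter2` languages to the stemmer matching the version the index was created with (behaviour changes at 1.3.0).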
|
||||||
|
*/
|
||||||
|
public class StemmerTokenFilterFactoryTests extends ElasticsearchTokenStreamTestCase {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testEnglishBackwardsCompatibility() throws IOException {
|
||||||
|
int iters = scaledRandomIntBetween(20, 100);
|
||||||
|
for (int i = 0; i < iters; i++) {
|
||||||
|
|
||||||
|
Version v = ElasticsearchTestCase.randomVersion(random());
|
||||||
|
Settings settings = ImmutableSettings.settingsBuilder()
|
||||||
|
.put("index.analysis.filter.my_english.type", "stemmer")
|
||||||
|
.put("index.analysis.filter.my_english.language", "english")
|
||||||
|
.put("index.analysis.analyzer.my_english.tokenizer","whitespace")
|
||||||
|
.put("index.analysis.analyzer.my_english.filter","my_english")
|
||||||
|
.put(SETTING_VERSION_CREATED,v)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
|
||||||
|
TokenFilterFactory tokenFilter = analysisService.tokenFilter("my_english");
|
||||||
|
assertThat(tokenFilter, instanceOf(StemmerTokenFilterFactory.class));
|
||||||
|
TokenStream create = tokenFilter.create(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("foo bar")));
|
||||||
|
NamedAnalyzer analyzer = analysisService.analyzer("my_english");
|
||||||
|
|
||||||
|
if (v.onOrAfter(Version.V_1_3_0)) {
|
||||||
|
assertThat(create, instanceOf(PorterStemFilter.class));
|
||||||
|
assertAnalyzesTo(analyzer, "consolingly", new String[]{"consolingli"});
|
||||||
|
} else {
|
||||||
|
assertThat(create, instanceOf(SnowballFilter.class));
|
||||||
|
assertAnalyzesTo(analyzer, "consolingly", new String[]{"consol"});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testPorter2BackwardsCompatibility() throws IOException {
|
||||||
|
int iters = scaledRandomIntBetween(20, 100);
|
||||||
|
for (int i = 0; i < iters; i++) {
|
||||||
|
|
||||||
|
Version v = ElasticsearchTestCase.randomVersion(random());
|
||||||
|
Settings settings = ImmutableSettings.settingsBuilder()
|
||||||
|
.put("index.analysis.filter.my_porter2.type", "stemmer")
|
||||||
|
.put("index.analysis.filter.my_porter2.language", "porter2")
|
||||||
|
.put("index.analysis.analyzer.my_porter2.tokenizer","whitespace")
|
||||||
|
.put("index.analysis.analyzer.my_porter2.filter","my_porter2")
|
||||||
|
.put(SETTING_VERSION_CREATED,v)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
|
||||||
|
TokenFilterFactory tokenFilter = analysisService.tokenFilter("my_porter2");
|
||||||
|
assertThat(tokenFilter, instanceOf(StemmerTokenFilterFactory.class));
|
||||||
|
TokenStream create = tokenFilter.create(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("foo bar")));
|
||||||
|
NamedAnalyzer analyzer = analysisService.analyzer("my_porter2");
|
||||||
|
assertThat(create, instanceOf(SnowballFilter.class));
|
||||||
|
|
||||||
|
if (v.onOrAfter(Version.V_1_3_0)) {
|
||||||
|
assertAnalyzesTo(analyzer, "possibly", new String[]{"possibl"});
|
||||||
|
} else {
|
||||||
|
assertAnalyzesTo(analyzer, "possibly", new String[]{"possibli"});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
Loading…
Reference in New Issue