From b9a09c2b06a82c7690977be767a5a5cbb420e2e3 Mon Sep 17 00:00:00 2001
From: Robert Muir
Date: Wed, 2 Jul 2014 14:59:18 -0400
Subject: [PATCH] Analysis: Add additional Analyzers, Tokenizers, and TokenFilters from Lucene

Add `irish` analyzer
Add `sorani` analyzer (Kurdish)

Add `classic` tokenizer: specific to English text and tries to recognize
hostnames, companies, acronyms, etc.
Add `thai` tokenizer: segments Thai text into words.

Add `classic` tokenfilter: cleans up acronyms and possessives from the
classic tokenizer
Add `apostrophe` tokenfilter: removes text after an apostrophe, including
the apostrophe itself
Add `german_normalization` tokenfilter: umlaut/sharp S normalization
Add `hindi_normalization` tokenfilter: accounts for Hindi spelling differences
Add `indic_normalization` tokenfilter: accounts for different Unicode
representations in Indian languages
Add `sorani_normalization` tokenfilter: normalizes Kurdish text
Add `scandinavian_normalization` tokenfilter: normalizes Norwegian, Danish,
and Swedish text
Add `scandinavian_folding` tokenfilter: a much more aggressive form of
`scandinavian_normalization`

Add additional languages to the stemmer tokenfilter: `galician`,
`minimal_galician`, `irish`, `sorani`, `light_nynorsk`, `minimal_nynorsk`

Add access to the default Thai stopword set "_thai_"

Fix some bugs and broken links in the documentation.

Closes #5935
---
 .../analysis/analyzers/lang-analyzer.asciidoc | 238 ++++++++++++++++--
 docs/reference/analysis/tokenfilters.asciidoc |   4 +
 .../apostrophe-tokenfilter.asciidoc           |   7 +
 .../tokenfilters/classic-tokenfilter.asciidoc |  11 +
 .../lowercase-tokenfilter.asciidoc            |   2 +-
 .../normalization-tokenfilter.asciidoc        |  37 ++-
 .../tokenfilters/stemmer-tokenfilter.asciidoc |  32 ++-
 docs/reference/analysis/tokenizers.asciidoc   |   4 +
 .../tokenizers/classic-tokenizer.asciidoc     |  21 ++
 .../tokenizers/thai-tokenizer.asciidoc        |   9 +
 .../index/analysis/Analysis.java              |   6 +
 .../index/analysis/AnalysisModule.java        |  13 +
 .../analysis/ApostropheFilterFactory.java     |  44 ++++
 .../index/analysis/ClassicFilterFactory.java  |  44 ++++
 .../analysis/ClassicTokenizerFactory.java     |  52 ++++
 .../GermanNormalizationFilterFactory.java     |  44 ++++
 .../HindiNormalizationFilterFactory.java      |  44 ++++
 .../IndicNormalizationFilterFactory.java      |  44 ++++
 .../index/analysis/IrishAnalyzerProvider.java |  50 ++++
 .../analysis/LowerCaseTokenFilterFactory.java |  11 +-
 .../ScandinavianFoldingFilterFactory.java     |  44 ++++
 ...candinavianNormalizationFilterFactory.java |  44 ++++
 .../analysis/SoraniAnalyzerProvider.java      |  50 ++++
 .../SoraniNormalizationFilterFactory.java     |  44 ++++
 .../analysis/StemmerTokenFilterFactory.java   |  29 ++-
 .../index/analysis/ThaiTokenizerFactory.java  |  46 ++++
 .../indices/analysis/PreBuiltAnalyzers.java   |   8 +
 .../index/analysis/AnalysisFactoryTests.java  |  50 ++--
 28 files changed, 964 insertions(+), 68 deletions(-)
 create mode 100644 docs/reference/analysis/tokenfilters/apostrophe-tokenfilter.asciidoc
 create mode 100644 docs/reference/analysis/tokenfilters/classic-tokenfilter.asciidoc
 create mode 100644 docs/reference/analysis/tokenizers/classic-tokenizer.asciidoc
 create mode 100644 docs/reference/analysis/tokenizers/thai-tokenizer.asciidoc
 create mode 100644 src/main/java/org/elasticsearch/index/analysis/ApostropheFilterFactory.java
 create mode 100644 src/main/java/org/elasticsearch/index/analysis/ClassicFilterFactory.java
 create mode 100644 src/main/java/org/elasticsearch/index/analysis/ClassicTokenizerFactory.java
 create mode 100644
src/main/java/org/elasticsearch/index/analysis/GermanNormalizationFilterFactory.java create mode 100644 src/main/java/org/elasticsearch/index/analysis/HindiNormalizationFilterFactory.java create mode 100644 src/main/java/org/elasticsearch/index/analysis/IndicNormalizationFilterFactory.java create mode 100644 src/main/java/org/elasticsearch/index/analysis/IrishAnalyzerProvider.java create mode 100644 src/main/java/org/elasticsearch/index/analysis/ScandinavianFoldingFilterFactory.java create mode 100644 src/main/java/org/elasticsearch/index/analysis/ScandinavianNormalizationFilterFactory.java create mode 100644 src/main/java/org/elasticsearch/index/analysis/SoraniAnalyzerProvider.java create mode 100644 src/main/java/org/elasticsearch/index/analysis/SoraniNormalizationFilterFactory.java create mode 100644 src/main/java/org/elasticsearch/index/analysis/ThaiTokenizerFactory.java diff --git a/docs/reference/analysis/analyzers/lang-analyzer.asciidoc b/docs/reference/analysis/analyzers/lang-analyzer.asciidoc index 997ab75d995..6b6e6e63466 100644 --- a/docs/reference/analysis/analyzers/lang-analyzer.asciidoc +++ b/docs/reference/analysis/analyzers/lang-analyzer.asciidoc @@ -23,12 +23,14 @@ following types are supported: <>, <>, <>, +<>, <>, <>, <>, <>, <>, <>, +<>, <>, <>, <>, @@ -42,8 +44,8 @@ more details. The following analyzers support setting custom `stem_exclusion` list: `arabic`, `armenian`, `basque`, `catalan`, `bulgarian`, `catalan`, `czech`, `finnish`, `dutch`, `english`, `finnish`, `french`, `galician`, -`german`, `hindi`, `hungarian`, `indonesian`, `italian`, `norwegian`, -`portuguese`, `romanian`, `russian`, `spanish`, `swedish`, `turkish`. +`german`, `irish`, `hindi`, `hungarian`, `indonesian`, `italian`, `norwegian`, +`portuguese`, `romanian`, `russian`, `sorani`, `spanish`, `swedish`, `turkish`. [[arabic-analyzer]] ==== `arabic` analyzer @@ -720,7 +722,7 @@ The `german` analyzer could be reimplemented as a `custom` analyzer as follows: "lowercase", "german_stop", "german_keywords", - "ascii_folding", <3> + "german_normalization", "german_stemmer" ] } @@ -733,9 +735,6 @@ The `german` analyzer could be reimplemented as a `custom` analyzer as follows: or `stopwords_path` parameters. <2> Words can be excluded from stemming with the `stem_exclusion` parameter. -<3> The `german` analyzer actually uses the GermanNormalizationFilter, - which isn't exposed in Elasticsearch. The `ascii_folding` filter - does a similar job but is more extensive. [[greek-analyzer]] ==== `greek` analyzer @@ -752,6 +751,10 @@ The `greek` analyzer could be reimplemented as a `custom` analyzer as follows: "type": "stop", "stopwords": "_greek_" <1> }, + "greek_lowercase": { + "type": "lowercase", + "language": "greek" + }, "greek_keywords": { "type": "keyword_marker", "keywords": [] <2> @@ -765,7 +768,7 @@ The `greek` analyzer could be reimplemented as a `custom` analyzer as follows: "greek": { "tokenizer": "standard", "filter": [ - "lowercase", + "greek_lowercase", "greek_stop", "greek_keywords", "greek_stemmer" @@ -784,9 +787,48 @@ The `greek` analyzer could be reimplemented as a `custom` analyzer as follows: [[hindi-analyzer]] ==== `hindi` analyzer -The `hindi` analyzer cannot currently be implemented as a `custom` analyzer -as it depends on the IndicNormalizationFilter and HindiNormalizationFilter -which are not yet exposed by Elasticsearch. Instead, see the <>. 
+The `hindi` analyzer could be reimplemented as a `custom` analyzer as follows: + +[source,js] +---------------------------------------------------- +{ + "settings": { + "analysis": { + "filter": { + "hindi_stop": { + "type": "stop", + "stopwords": "_hindi_" <1> + }, + "hindi_keywords": { + "type": "keyword_marker", + "keywords": [] <2> + }, + "hindi_stemmer": { + "type": "stemmer", + "language": "hindi" + } + }, + "analyzer": { + "hindi": { + "tokenizer": "standard", + "filter": [ + "lowercase", + "indic_normalization", + "hindi_normalization", + "hindi_stop", + "hindi_keywords", + "hindi_stemmer" + ] + } + } + } + } +} +---------------------------------------------------- +<1> The default stopwords can be overridden with the `stopwords` + or `stopwords_path` parameters. +<2> Words can be excluded from stemming with the `stem_exclusion` + parameter. [[hungarian-analyzer]] ==== `hungarian` analyzer @@ -877,6 +919,59 @@ The `indonesian` analyzer could be reimplemented as a `custom` analyzer as follo <2> Words can be excluded from stemming with the `stem_exclusion` parameter. +[[irish-analyzer]] +==== `irish` analyzer + +The `irish` analyzer could be reimplemented as a `custom` analyzer as follows: + +[source,js] +---------------------------------------------------- +{ + "settings": { + "analysis": { + "filter": { + "irish_elision": { + "type": "elision", + "articles": [ "h", "n", "t" ] + }, + "irish_stop": { + "type": "stop", + "stopwords": "_irish_" <1> + }, + "irish_lowercase": { + "type": "lowercase", + "language": "irish" + }, + "irish_keywords": { + "type": "keyword_marker", + "keywords": [] <2> + }, + "irish_stemmer": { + "type": "stemmer", + "language": "irish" + } + }, + "analyzer": { + "irish": { + "tokenizer": "standard", + "filter": [ + "irish_stop", + "irish_elision", + "irish_lowercase", + "irish_keywords", + "irish_stemmer" + ] + } + } + } + } +} +---------------------------------------------------- +<1> The default stopwords can be overridden with the `stopwords` + or `stopwords_path` parameters. +<2> Words can be excluded from stemming with the `stem_exclusion` + parameter. + [[italian-analyzer]] ==== `italian` analyzer @@ -1150,6 +1245,51 @@ The `russian` analyzer could be reimplemented as a `custom` analyzer as follows: <2> Words can be excluded from stemming with the `stem_exclusion` parameter. +[[sorani-analyzer]] +==== `sorani` analyzer + +The `sorani` analyzer could be reimplemented as a `custom` analyzer as follows: + +[source,js] +---------------------------------------------------- +{ + "settings": { + "analysis": { + "filter": { + "sorani_stop": { + "type": "stop", + "stopwords": "_sorani_" <1> + }, + "sorani_keywords": { + "type": "keyword_marker", + "keywords": [] <2> + }, + "sorani_stemmer": { + "type": "stemmer", + "language": "sorani" + } + }, + "analyzer": { + "sorani": { + "tokenizer": "standard", + "filter": [ + "sorani_normalization", + "lowercase", + "sorani_stop", + "sorani_keywords", + "sorani_stemmer" + ] + } + } + } + } +} +---------------------------------------------------- +<1> The default stopwords can be overridden with the `stopwords` + or `stopwords_path` parameters. +<2> Words can be excluded from stemming with the `stem_exclusion` + parameter. 
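+
+Alternatively, the built-in `sorani` analyzer accepts the same parameters
+directly. The following is a minimal sketch: the analyzer name is
+illustrative, and the `stem_exclusion` list would normally contain actual
+Sorani terms rather than the placeholder shown here:
+
+[source,js]
+----------------------------------------------------
+{
+    "settings": {
+        "analysis": {
+            "analyzer": {
+                "my_sorani": {
+                    "type": "sorani",
+                    "stopwords": "_sorani_",
+                    "stem_exclusion": [ "placeholder" ]
+                }
+            }
+        }
+    }
+}
+----------------------------------------------------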
+ [[spanish-analyzer]] ==== `spanish` analyzer @@ -1241,14 +1381,80 @@ The `swedish` analyzer could be reimplemented as a `custom` analyzer as follows: [[turkish-analyzer]] ==== `turkish` analyzer -The `turkish` analyzer cannot currently be implemented as a `custom` analyzer -because it depends on the TurkishLowerCaseFilter and the ApostropheFilter -which are not exposed in Elasticsearch. Instead, see the <>. +The `turkish` analyzer could be reimplemented as a `custom` analyzer as follows: + +[source,js] +---------------------------------------------------- +{ + "settings": { + "analysis": { + "filter": { + "turkish_stop": { + "type": "stop", + "stopwords": "_turkish_" <1> + }, + "turkish_lowercase": { + "type": "lowercase", + "language": "turkish" + }, + "turkish_keywords": { + "type": "keyword_marker", + "keywords": [] <2> + }, + "turkish_stemmer": { + "type": "stemmer", + "language": "turkish" + } + }, + "analyzer": { + "turkish": { + "tokenizer": "standard", + "filter": [ + "apostrophe", + "turkish_lowercase", + "turkish_stop", + "turkish_keywords", + "turkish_stemmer" + ] + } + } + } + } +} +---------------------------------------------------- +<1> The default stopwords can be overridden with the `stopwords` + or `stopwords_path` parameters. +<2> Words can be excluded from stemming with the `stem_exclusion` + parameter. [[thai-analyzer]] ==== `thai` analyzer -The `thai` analyzer cannot currently be implemented as a `custom` analyzer -because it depends on the ThaiTokenizer which is not exposed in Elasticsearch. -Instead, see the <>. +The `thai` analyzer could be reimplemented as a `custom` analyzer as follows: +[source,js] +---------------------------------------------------- +{ + "settings": { + "analysis": { + "filter": { + "thai_stop": { + "type": "stop", + "stopwords": "_thai_" <1> + } + }, + "analyzer": { + "thai": { + "tokenizer": "thai", + "filter": [ + "lowercase", + "thai_stop" + ] + } + } + } + } +} +---------------------------------------------------- +<1> The default stopwords can be overridden with the `stopwords` + or `stopwords_path` parameters. diff --git a/docs/reference/analysis/tokenfilters.asciidoc b/docs/reference/analysis/tokenfilters.asciidoc index b33356d3c0b..ec46c26de8e 100644 --- a/docs/reference/analysis/tokenfilters.asciidoc +++ b/docs/reference/analysis/tokenfilters.asciidoc @@ -78,3 +78,7 @@ include::tokenfilters/cjk-bigram-tokenfilter.asciidoc[] include::tokenfilters/delimited-payload-tokenfilter.asciidoc[] include::tokenfilters/keep-words-tokenfilter.asciidoc[] + +include::tokenfilters/classic-tokenfilter.asciidoc[] + +include::tokenfilters/apostrophe-tokenfilter.asciidoc[] diff --git a/docs/reference/analysis/tokenfilters/apostrophe-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/apostrophe-tokenfilter.asciidoc new file mode 100644 index 00000000000..ffef89bdb54 --- /dev/null +++ b/docs/reference/analysis/tokenfilters/apostrophe-tokenfilter.asciidoc @@ -0,0 +1,7 @@ +[[analysis-apostrophe-tokenfilter]] +=== Apostrophe Token Filter + +coming[1.3.0] + +The `apostrophe` token filter strips all characters after an apostrophe, +including the apostrophe itself. 
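+
+For example, the following custom analyzer applies the filter before
+lowercasing (a minimal sketch; the analyzer name is illustrative):
+
+[source,js]
+--------------------------------------------------
+{
+    "settings": {
+        "analysis": {
+            "analyzer": {
+                "my_analyzer": {
+                    "tokenizer": "standard",
+                    "filter": [ "apostrophe", "lowercase" ]
+                }
+            }
+        }
+    }
+}
+--------------------------------------------------
+
+With this chain, a token such as `Türkiye'nin` is first reduced to
+`Türkiye` and then lowercased to `türkiye`.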
diff --git a/docs/reference/analysis/tokenfilters/classic-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/classic-tokenfilter.asciidoc
new file mode 100644
index 00000000000..e3810be2f12
--- /dev/null
+++ b/docs/reference/analysis/tokenfilters/classic-tokenfilter.asciidoc
@@ -0,0 +1,11 @@
+[[analysis-classic-tokenfilter]]
+=== Classic Token Filter
+
+coming[1.3.0]
+
+The `classic` token filter does optional post-processing of
+terms that are generated by the <<analysis-classic-tokenizer,classic tokenizer>>.
+
+This filter removes the English possessive from the end of words, and
+it removes dots from acronyms.
+
diff --git a/docs/reference/analysis/tokenfilters/lowercase-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/lowercase-tokenfilter.asciidoc
index 857c0d7916a..0a2ea95984b 100644
--- a/docs/reference/analysis/tokenfilters/lowercase-tokenfilter.asciidoc
+++ b/docs/reference/analysis/tokenfilters/lowercase-tokenfilter.asciidoc
@@ -4,7 +4,7 @@
 A token filter of type `lowercase` that normalizes token text to lower case.
 
-Lowercase token filter supports Greek and Turkish lowercase token
+Lowercase token filter supports Greek, Irish coming[1.3.0], and Turkish lowercase token
 filters through the `language` parameter.
 
 Below is a usage example in a custom analyzer
diff --git a/docs/reference/analysis/tokenfilters/normalization-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/normalization-tokenfilter.asciidoc
index 875187242ec..fcba073e5cf 100644
--- a/docs/reference/analysis/tokenfilters/normalization-tokenfilter.asciidoc
+++ b/docs/reference/analysis/tokenfilters/normalization-tokenfilter.asciidoc
@@ -4,12 +4,33 @@
 There are several token filters available which try to normalize special
 characters of a certain language.
 
-You can currently choose between `arabic_normalization` and
-`persian_normalization` normalization in your token filter
-configuration. For more information check the
-http://lucene.apache.org/core/4_3_1/analyzers-common/org/apache/lucene/analysis/ar/ArabicNormalizer.html[ArabicNormalizer]
-or the
-http://lucene.apache.org/core/4_3_1/analyzers-common/org/apache/lucene/analysis/fa/PersianNormalizer.html[PersianNormalizer]
-documentation.
+[horizontal] +Arabic:: + +http://lucene.apache.org/core/4_9_0/analyzers-common/org/apache/lucene/analysis/ar/ArabicNormalizer.html[`arabic_normalization`] + +German:: + +http://lucene.apache.org/core/4_9_0/analyzers-common/org/apache/lucene/analysis/de/GermanNormalizationFilter.html[`german_normalization`] coming[1.3.0] + +Hindi:: + +http://lucene.apache.org/core/4_9_0/analyzers-common/org/apache/lucene/analysis/hi/HindiNormalizer.html[`hindi_normalization`] coming[1.3.0] + +Indic:: + +http://lucene.apache.org/core/4_9_0/analyzers-common/org/apache/lucene/analysis/in/IndicNormalizer.html[`indic_normalization`] coming[1.3.0] + +Kurdish (Sorani):: + +http://lucene.apache.org/core/4_9_0/analyzers-common/org/apache/lucene/analysis/ckb/SoraniNormalizer.html[`sorani_normalization`] coming[1.3.0] + +Persian:: + +http://lucene.apache.org/core/4_9_0/analyzers-common/org/apache/lucene/analysis/fa/PersianNormalizer.html[`persian_normalization`] + +Scandinavian:: + +http://lucene.apache.org/core/4_9_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/ScandinavianNormalizationFilter.html[`scandinavian_normalization`] coming[1.3.0] +http://lucene.apache.org/core/4_9_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/ScandinavianFoldingFilter.html[`scandinavian_folding`] coming[1.3.0] -*Note:* These filters are available since `0.90.2` diff --git a/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc index 5c5a19813dd..001926bfb28 100644 --- a/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc @@ -32,7 +32,7 @@ available values (the preferred filters are marked in *bold*): [horizontal] Arabic:: -http://lucene.apache.org/core/4_3_0/analyzers-common/index.html?org%2Fapache%2Flucene%2Fanalysis%2Far%2FArabicStemmer.html[*`arabic`*] +http://lucene.apache.org/core/4_9_0/analyzers-common/org/apache/lucene/analysis/ar/ArabicStemmer.html[*`arabic`*] Armenian:: @@ -44,7 +44,7 @@ http://snowball.tartarus.org/algorithms/basque/stemmer.html[*`basque`*] Brazilian Portuguese:: -http://lucene.apache.org/core/4_3_0/analyzers-common/index.html?org%2Fapache%2Flucene%2Fanalysis%2Fbr%2FBrazilianStemmer.html[*`brazilian`*] +http://lucene.apache.org/core/4_9_0/analyzers-common/org/apache/lucene/analysis/br/BrazilianStemmer.html[*`brazilian`*] Bulgarian:: @@ -72,7 +72,7 @@ English:: http://snowball.tartarus.org/algorithms/porter/stemmer.html[*`english`*] coming[1.3.0,Returns the <> instead of the <>], http://ciir.cs.umass.edu/pubfiles/ir-35.pdf[`light_english`] coming[1.3.0,Returns the <>], http://www.medialab.tfe.umu.se/courses/mdm0506a/material/fulltext_ID%3D10049387%26PLACEBO%3DIE.pdf[`minimal_english`], -http://lucene.apache.org/core/4_3_0/analyzers-common/index.html?org%2Fapache%2Flucene%2Fanalysis%2Fen%2FEnglishPossessiveFilter.html[`possessive_english`], +http://lucene.apache.org/core/4_9_0/analyzers-common/org/apache/lucene/analysis/en/EnglishPossessiveFilter.html[`possessive_english`], http://snowball.tartarus.org/algorithms/english/stemmer.html[`porter2`] coming[1.3.0,Returns the <> instead of the <>], http://snowball.tartarus.org/algorithms/lovins/stemmer.html[`lovins`] @@ -87,6 +87,11 @@ http://snowball.tartarus.org/algorithms/french/stemmer.html[`french`], http://dl.acm.org/citation.cfm?id=1141523[*`light_french`*], http://dl.acm.org/citation.cfm?id=318984[`minimal_french`] +Galician:: + 
+http://bvg.udc.es/recursos_lingua/stemming.jsp[*`galician`*] coming[1.3.0],
+http://bvg.udc.es/recursos_lingua/stemming.jsp[`minimal_galician`] (Plural step only) coming[1.3.0]
+
 German::
 
 http://snowball.tartarus.org/algorithms/german/stemmer.html[`german`],
@@ -111,19 +116,33 @@ Indonesian::
 
 http://www.illc.uva.nl/Publications/ResearchReports/MoL-2003-02.text.pdf[*`indonesian`*]
 
+Irish::
+
+http://snowball.tartarus.org/otherapps/oregan/intro.html[*`irish`*]
+
 Italian::
 
 http://snowball.tartarus.org/algorithms/italian/stemmer.html[`italian`],
 http://www.ercim.eu/publication/ws-proceedings/CLEF2/savoy.pdf[*`light_italian`*]
 
+Kurdish (Sorani)::
+
+http://lucene.apache.org/core/4_9_0/analyzers-common/org/apache/lucene/analysis/ckb/SoraniStemmer.html[*`sorani`*] coming[1.3.0]
+
 Latvian::
 
-http://lucene.apache.org/core/4_3_0/analyzers-common/index.html?org%2Fapache%2Flucene%2Fanalysis%2Flv%2FLatvianStemmer.html[*`latvian`*]
+http://lucene.apache.org/core/4_9_0/analyzers-common/org/apache/lucene/analysis/lv/LatvianStemmer.html[*`latvian`*]
 
-Norwegian::
+Norwegian (Bokmål)::
 
 http://snowball.tartarus.org/algorithms/norwegian/stemmer.html[*`norwegian`*],
-http://lucene.apache.org/core/4_3_0/analyzers-common/index.html?org%2Fapache%2Flucene%2Fanalysis%2Fno%2FNorwegianMinimalStemFilter.html[`minimal_norwegian`]
+http://lucene.apache.org/core/4_9_0/analyzers-common/org/apache/lucene/analysis/no/NorwegianLightStemmer.html[*`light_norwegian`*] coming[1.3.0],
+http://lucene.apache.org/core/4_9_0/analyzers-common/org/apache/lucene/analysis/no/NorwegianMinimalStemmer.html[`minimal_norwegian`]
+
+Norwegian (Nynorsk)::
+
+http://lucene.apache.org/core/4_9_0/analyzers-common/org/apache/lucene/analysis/no/NorwegianLightStemmer.html[*`light_nynorsk`*] coming[1.3.0],
+http://lucene.apache.org/core/4_9_0/analyzers-common/org/apache/lucene/analysis/no/NorwegianMinimalStemmer.html[`minimal_nynorsk`] coming[1.3.0]
 
 Portuguese::
 
@@ -132,7 +151,6 @@
 http://dl.acm.org/citation.cfm?id=1141523&dl=ACM&coll=DL&CFID=179095584&CFTOKEN=
 http://www.inf.ufrgs.br/\~buriol/papers/Orengo_CLEF07.pdf[`minimal_portuguese`],
 http://www.inf.ufrgs.br/\~viviane/rslp/index.htm[`portuguese_rslp`] coming[1.3.0]
-
 Romanian::
 
 http://snowball.tartarus.org/algorithms/romanian/stemmer.html[*`romanian`*]
diff --git a/docs/reference/analysis/tokenizers.asciidoc b/docs/reference/analysis/tokenizers.asciidoc
index 3118b0dfb2b..46c02f9a4fc 100644
--- a/docs/reference/analysis/tokenizers.asciidoc
+++ b/docs/reference/analysis/tokenizers.asciidoc
@@ -28,3 +28,7 @@
 include::tokenizers/uaxurlemail-tokenizer.asciidoc[]
 
 include::tokenizers/pathhierarchy-tokenizer.asciidoc[]
+
+include::tokenizers/classic-tokenizer.asciidoc[]
+
+include::tokenizers/thai-tokenizer.asciidoc[]
diff --git a/docs/reference/analysis/tokenizers/classic-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/classic-tokenizer.asciidoc
new file mode 100644
index 00000000000..301b005d609
--- /dev/null
+++ b/docs/reference/analysis/tokenizers/classic-tokenizer.asciidoc
@@ -0,0 +1,21 @@
+[[analysis-classic-tokenizer]]
+=== Classic Tokenizer
+
+coming[1.3.0]
+
+A tokenizer of type `classic` providing a grammar-based tokenizer that is
+a good choice for English language documents. This tokenizer has
+heuristics for special treatment of acronyms, company names, email addresses,
+and internet host names. However, these rules don't always work, and
+the tokenizer doesn't work well for most languages other than English.
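+
+For example, the tokenizer can be combined with the
+<<analysis-classic-tokenfilter,`classic` token filter>> in a custom
+analyzer (a minimal sketch; the analyzer name is illustrative):
+
+[source,js]
+--------------------------------------------------
+{
+    "settings": {
+        "analysis": {
+            "analyzer": {
+                "my_classic": {
+                    "tokenizer": "classic",
+                    "filter": [ "classic", "lowercase" ]
+                }
+            }
+        }
+    }
+}
+--------------------------------------------------
+
+Here the tokenizer keeps forms such as `I.B.M.` and `O'Neil's` as single
+tokens; the `classic` token filter then strips the acronym dots and the
+trailing possessive, yielding `ibm` and `o'neil` after lowercasing.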
+ +The following are settings that can be set for a `classic` tokenizer +type: + +[cols="<,<",options="header",] +|======================================================================= +|Setting |Description +|`max_token_length` |The maximum token length. If a token is seen that +exceeds this length then it is discarded. Defaults to `255`. +|======================================================================= + diff --git a/docs/reference/analysis/tokenizers/thai-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/thai-tokenizer.asciidoc new file mode 100644 index 00000000000..e60ced021bc --- /dev/null +++ b/docs/reference/analysis/tokenizers/thai-tokenizer.asciidoc @@ -0,0 +1,9 @@ +[[analysis-thai-tokenizer]] +=== Thai Tokenizer + +coming[1.3.0] + +A tokenizer of type `thai` that segments Thai text into words. This tokenizer +uses the built-in Thai segmentation algorithm included with Java to divide +up Thai text. Text in other languages in general will be treated the same +as `standard`. diff --git a/src/main/java/org/elasticsearch/index/analysis/Analysis.java b/src/main/java/org/elasticsearch/index/analysis/Analysis.java index cc55c7a371e..4d10bef1660 100644 --- a/src/main/java/org/elasticsearch/index/analysis/Analysis.java +++ b/src/main/java/org/elasticsearch/index/analysis/Analysis.java @@ -28,6 +28,7 @@ import org.apache.lucene.analysis.ar.ArabicAnalyzer; import org.apache.lucene.analysis.bg.BulgarianAnalyzer; import org.apache.lucene.analysis.br.BrazilianAnalyzer; import org.apache.lucene.analysis.ca.CatalanAnalyzer; +import org.apache.lucene.analysis.ckb.SoraniAnalyzer; import org.apache.lucene.analysis.cz.CzechAnalyzer; import org.apache.lucene.analysis.da.DanishAnalyzer; import org.apache.lucene.analysis.de.GermanAnalyzer; @@ -38,6 +39,7 @@ import org.apache.lucene.analysis.eu.BasqueAnalyzer; import org.apache.lucene.analysis.fa.PersianAnalyzer; import org.apache.lucene.analysis.fi.FinnishAnalyzer; import org.apache.lucene.analysis.fr.FrenchAnalyzer; +import org.apache.lucene.analysis.ga.IrishAnalyzer; import org.apache.lucene.analysis.gl.GalicianAnalyzer; import org.apache.lucene.analysis.hi.HindiAnalyzer; import org.apache.lucene.analysis.hu.HungarianAnalyzer; @@ -50,6 +52,7 @@ import org.apache.lucene.analysis.pt.PortugueseAnalyzer; import org.apache.lucene.analysis.ro.RomanianAnalyzer; import org.apache.lucene.analysis.ru.RussianAnalyzer; import org.apache.lucene.analysis.sv.SwedishAnalyzer; +import org.apache.lucene.analysis.th.ThaiAnalyzer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tr.TurkishAnalyzer; import org.apache.lucene.analysis.util.CharArraySet; @@ -134,14 +137,17 @@ public class Analysis { .put("_hindi_", HindiAnalyzer.getDefaultStopSet()) .put("_hungarian_", HungarianAnalyzer.getDefaultStopSet()) .put("_indonesian_", IndonesianAnalyzer.getDefaultStopSet()) + .put("_irish_", IrishAnalyzer.getDefaultStopSet()) .put("_italian_", ItalianAnalyzer.getDefaultStopSet()) .put("_norwegian_", NorwegianAnalyzer.getDefaultStopSet()) .put("_persian_", PersianAnalyzer.getDefaultStopSet()) .put("_portuguese_", PortugueseAnalyzer.getDefaultStopSet()) .put("_romanian_", RomanianAnalyzer.getDefaultStopSet()) .put("_russian_", RussianAnalyzer.getDefaultStopSet()) + .put("_sorani_", SoraniAnalyzer.getDefaultStopSet()) .put("_spanish_", SpanishAnalyzer.getDefaultStopSet()) .put("_swedish_", SwedishAnalyzer.getDefaultStopSet()) + .put("_thai_", ThaiAnalyzer.getDefaultStopSet()) .put("_turkish_", 
TurkishAnalyzer.getDefaultStopSet()) .immutableMap(); diff --git a/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java b/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java index 45b8e8f3d7a..2b65c17ac7d 100644 --- a/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java +++ b/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java @@ -503,11 +503,20 @@ public class AnalysisModule extends AbstractModule { tokenFiltersBindings.processTokenFilter("stemmer_override", StemmerOverrideTokenFilterFactory.class); tokenFiltersBindings.processTokenFilter("arabic_normalization", ArabicNormalizationFilterFactory.class); + tokenFiltersBindings.processTokenFilter("german_normalization", GermanNormalizationFilterFactory.class); + tokenFiltersBindings.processTokenFilter("hindi_normalization", HindiNormalizationFilterFactory.class); + tokenFiltersBindings.processTokenFilter("indic_normalization", IndicNormalizationFilterFactory.class); + tokenFiltersBindings.processTokenFilter("sorani_normalization", SoraniNormalizationFilterFactory.class); tokenFiltersBindings.processTokenFilter("persian_normalization", PersianNormalizationFilterFactory.class); + tokenFiltersBindings.processTokenFilter("scandinavian_normalization", ScandinavianNormalizationFilterFactory.class); + tokenFiltersBindings.processTokenFilter("scandinavian_folding", ScandinavianFoldingFilterFactory.class); tokenFiltersBindings.processTokenFilter("hunspell", HunspellTokenFilterFactory.class); tokenFiltersBindings.processTokenFilter("cjk_bigram", CJKBigramFilterFactory.class); tokenFiltersBindings.processTokenFilter("cjk_width", CJKWidthFilterFactory.class); + + tokenFiltersBindings.processTokenFilter("apostrophe", ApostropheFilterFactory.class); + tokenFiltersBindings.processTokenFilter("classic", ClassicFilterFactory.class); } @@ -515,6 +524,8 @@ public class AnalysisModule extends AbstractModule { @Override public void processTokenizers(TokenizersBindings tokenizersBindings) { tokenizersBindings.processTokenizer("pattern", PatternTokenizerFactory.class); + tokenizersBindings.processTokenizer("classic", ClassicTokenizerFactory.class); + tokenizersBindings.processTokenizer("thai", ThaiTokenizerFactory.class); } @Override @@ -542,6 +553,7 @@ public class AnalysisModule extends AbstractModule { analyzersBindings.processAnalyzer("hindi", HindiAnalyzerProvider.class); analyzersBindings.processAnalyzer("hungarian", HungarianAnalyzerProvider.class); analyzersBindings.processAnalyzer("indonesian", IndonesianAnalyzerProvider.class); + analyzersBindings.processAnalyzer("irish", IrishAnalyzerProvider.class); analyzersBindings.processAnalyzer("italian", ItalianAnalyzerProvider.class); analyzersBindings.processAnalyzer("latvian", LatvianAnalyzerProvider.class); analyzersBindings.processAnalyzer("norwegian", NorwegianAnalyzerProvider.class); @@ -549,6 +561,7 @@ public class AnalysisModule extends AbstractModule { analyzersBindings.processAnalyzer("portuguese", PortugueseAnalyzerProvider.class); analyzersBindings.processAnalyzer("romanian", RomanianAnalyzerProvider.class); analyzersBindings.processAnalyzer("russian", RussianAnalyzerProvider.class); + analyzersBindings.processAnalyzer("sorani", SoraniAnalyzerProvider.class); analyzersBindings.processAnalyzer("spanish", SpanishAnalyzerProvider.class); analyzersBindings.processAnalyzer("swedish", SwedishAnalyzerProvider.class); analyzersBindings.processAnalyzer("turkish", TurkishAnalyzerProvider.class); diff --git 
a/src/main/java/org/elasticsearch/index/analysis/ApostropheFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/ApostropheFilterFactory.java new file mode 100644 index 00000000000..614e860735a --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/ApostropheFilterFactory.java @@ -0,0 +1,44 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tr.ApostropheFilter; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.inject.assistedinject.Assisted; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.settings.IndexSettings; + +/** + * Factory for {@link ApostropheFilter} + */ +public class ApostropheFilterFactory extends AbstractTokenFilterFactory { + + @Inject + public ApostropheFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) { + super(index, indexSettings, name, settings); + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new ApostropheFilter(tokenStream); + } + +} diff --git a/src/main/java/org/elasticsearch/index/analysis/ClassicFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/ClassicFilterFactory.java new file mode 100644 index 00000000000..7f899c4b846 --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/ClassicFilterFactory.java @@ -0,0 +1,44 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.standard.ClassicFilter; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.inject.assistedinject.Assisted; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.settings.IndexSettings; + +/** + * Factory for {@link ClassicFilter} + */ +public class ClassicFilterFactory extends AbstractTokenFilterFactory { + + @Inject + public ClassicFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) { + super(index, indexSettings, name, settings); + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new ClassicFilter(tokenStream); + } + +} diff --git a/src/main/java/org/elasticsearch/index/analysis/ClassicTokenizerFactory.java b/src/main/java/org/elasticsearch/index/analysis/ClassicTokenizerFactory.java new file mode 100644 index 00000000000..88664ce4f45 --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/ClassicTokenizerFactory.java @@ -0,0 +1,52 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.standard.ClassicTokenizer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.inject.assistedinject.Assisted; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.settings.IndexSettings; + +import java.io.Reader; + +/** + * Factory for {@link ClassicTokenizer} + */ +public class ClassicTokenizerFactory extends AbstractTokenizerFactory { + + private final int maxTokenLength; + + @Inject + public ClassicTokenizerFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) { + super(index, indexSettings, name, settings); + maxTokenLength = settings.getAsInt("max_token_length", StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH); + } + + @Override + public Tokenizer create(Reader reader) { + ClassicTokenizer tokenizer = new ClassicTokenizer(version, reader); + tokenizer.setMaxTokenLength(maxTokenLength); + return tokenizer; + } +} diff --git a/src/main/java/org/elasticsearch/index/analysis/GermanNormalizationFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/GermanNormalizationFilterFactory.java new file mode 100644 index 00000000000..6fe4a6b648f --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/GermanNormalizationFilterFactory.java @@ -0,0 +1,44 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.de.GermanNormalizationFilter; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.inject.assistedinject.Assisted; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.settings.IndexSettings; + +/** + * Factory for {@link GermanNormalizationFilter} + */ +public class GermanNormalizationFilterFactory extends AbstractTokenFilterFactory { + + @Inject + public GermanNormalizationFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) { + super(index, indexSettings, name, settings); + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new GermanNormalizationFilter(tokenStream); + } + +} diff --git a/src/main/java/org/elasticsearch/index/analysis/HindiNormalizationFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/HindiNormalizationFilterFactory.java new file mode 100644 index 00000000000..563f141b8c7 --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/HindiNormalizationFilterFactory.java @@ -0,0 +1,44 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.hi.HindiNormalizationFilter; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.inject.assistedinject.Assisted; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.settings.IndexSettings; + +/** + * Factory for {@link HindiNormalizationFilter} + */ +public class HindiNormalizationFilterFactory extends AbstractTokenFilterFactory { + + @Inject + public HindiNormalizationFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) { + super(index, indexSettings, name, settings); + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new HindiNormalizationFilter(tokenStream); + } + +} diff --git a/src/main/java/org/elasticsearch/index/analysis/IndicNormalizationFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/IndicNormalizationFilterFactory.java new file mode 100644 index 00000000000..e834ac67aec --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/IndicNormalizationFilterFactory.java @@ -0,0 +1,44 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. 
Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.in.IndicNormalizationFilter; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.inject.assistedinject.Assisted; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.settings.IndexSettings; + +/** + * Factory for {@link IndicNormalizationFilter} + */ +public class IndicNormalizationFilterFactory extends AbstractTokenFilterFactory { + + @Inject + public IndicNormalizationFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) { + super(index, indexSettings, name, settings); + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new IndicNormalizationFilter(tokenStream); + } + +} diff --git a/src/main/java/org/elasticsearch/index/analysis/IrishAnalyzerProvider.java b/src/main/java/org/elasticsearch/index/analysis/IrishAnalyzerProvider.java new file mode 100644 index 00000000000..0b8bdb2b0c1 --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/IrishAnalyzerProvider.java @@ -0,0 +1,50 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.ga.IrishAnalyzer; +import org.apache.lucene.analysis.util.CharArraySet; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.inject.assistedinject.Assisted; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.settings.IndexSettings; + +/** + * Provider for {@link IrishAnalyzer} + */ +public class IrishAnalyzerProvider extends AbstractIndexAnalyzerProvider { + + private final IrishAnalyzer analyzer; + + @Inject + public IrishAnalyzerProvider(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) { + super(index, indexSettings, name, settings); + analyzer = new IrishAnalyzer(version, + Analysis.parseStopWords(env, settings, IrishAnalyzer.getDefaultStopSet(), version), + Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET, version)); + } + + @Override + public IrishAnalyzer get() { + return this.analyzer; + } +} \ No newline at end of file diff --git a/src/main/java/org/elasticsearch/index/analysis/LowerCaseTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/LowerCaseTokenFilterFactory.java index 9dc7f660bd1..0d62a288a3c 100644 --- a/src/main/java/org/elasticsearch/index/analysis/LowerCaseTokenFilterFactory.java +++ b/src/main/java/org/elasticsearch/index/analysis/LowerCaseTokenFilterFactory.java @@ -22,6 +22,7 @@ package org.elasticsearch.index.analysis; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.el.GreekLowerCaseFilter; +import org.apache.lucene.analysis.ga.IrishLowerCaseFilter; import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter; import org.elasticsearch.ElasticsearchIllegalArgumentException; import org.elasticsearch.common.inject.Inject; @@ -31,7 +32,13 @@ import org.elasticsearch.index.Index; import org.elasticsearch.index.settings.IndexSettings; /** - * + * Factory for {@link LowerCaseFilter} and some language-specific variants + * supported by the {@code language} parameter: + *
+ * <ul>
+ *   <li>greek: {@link GreekLowerCaseFilter}</li>
+ *   <li>irish: {@link IrishLowerCaseFilter}</li>
+ *   <li>turkish: {@link TurkishLowerCaseFilter}</li>
+ * </ul>
+
*/ public class LowerCaseTokenFilterFactory extends AbstractTokenFilterFactory { @@ -49,6 +56,8 @@ public class LowerCaseTokenFilterFactory extends AbstractTokenFilterFactory { return new LowerCaseFilter(version, tokenStream); } else if (lang.equalsIgnoreCase("greek")) { return new GreekLowerCaseFilter(version, tokenStream); + } else if (lang.equalsIgnoreCase("irish")) { + return new IrishLowerCaseFilter(tokenStream); } else if (lang.equalsIgnoreCase("turkish")) { return new TurkishLowerCaseFilter(tokenStream); } else { diff --git a/src/main/java/org/elasticsearch/index/analysis/ScandinavianFoldingFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/ScandinavianFoldingFilterFactory.java new file mode 100644 index 00000000000..b20e344cf22 --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/ScandinavianFoldingFilterFactory.java @@ -0,0 +1,44 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.ScandinavianFoldingFilter; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.inject.assistedinject.Assisted; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.settings.IndexSettings; + +/** + * Factory for {@link ScandinavianFoldingFilter} + */ +public class ScandinavianFoldingFilterFactory extends AbstractTokenFilterFactory { + + @Inject + public ScandinavianFoldingFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) { + super(index, indexSettings, name, settings); + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new ScandinavianFoldingFilter(tokenStream); + } + +} diff --git a/src/main/java/org/elasticsearch/index/analysis/ScandinavianNormalizationFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/ScandinavianNormalizationFilterFactory.java new file mode 100644 index 00000000000..da0eb6e432d --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/ScandinavianNormalizationFilterFactory.java @@ -0,0 +1,44 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilter; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.inject.assistedinject.Assisted; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.settings.IndexSettings; + +/** + * Factory for {@link ScandinavianNormalizationFilter} + */ +public class ScandinavianNormalizationFilterFactory extends AbstractTokenFilterFactory { + + @Inject + public ScandinavianNormalizationFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) { + super(index, indexSettings, name, settings); + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new ScandinavianNormalizationFilter(tokenStream); + } + +} diff --git a/src/main/java/org/elasticsearch/index/analysis/SoraniAnalyzerProvider.java b/src/main/java/org/elasticsearch/index/analysis/SoraniAnalyzerProvider.java new file mode 100644 index 00000000000..c1ba8a6cd3d --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/SoraniAnalyzerProvider.java @@ -0,0 +1,50 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.ckb.SoraniAnalyzer; +import org.apache.lucene.analysis.util.CharArraySet; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.inject.assistedinject.Assisted; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.settings.IndexSettings; + +/** + * Provider for {@link SoraniAnalyzer} + */ +public class SoraniAnalyzerProvider extends AbstractIndexAnalyzerProvider { + + private final SoraniAnalyzer analyzer; + + @Inject + public SoraniAnalyzerProvider(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) { + super(index, indexSettings, name, settings); + analyzer = new SoraniAnalyzer(version, + Analysis.parseStopWords(env, settings, SoraniAnalyzer.getDefaultStopSet(), version), + Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET, version)); + } + + @Override + public SoraniAnalyzer get() { + return this.analyzer; + } +} \ No newline at end of file diff --git a/src/main/java/org/elasticsearch/index/analysis/SoraniNormalizationFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/SoraniNormalizationFilterFactory.java new file mode 100644 index 00000000000..8656c0476af --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/SoraniNormalizationFilterFactory.java @@ -0,0 +1,44 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
diff --git a/src/main/java/org/elasticsearch/index/analysis/SoraniNormalizationFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/SoraniNormalizationFilterFactory.java
new file mode 100644
index 00000000000..8656c0476af
--- /dev/null
+++ b/src/main/java/org/elasticsearch/index/analysis/SoraniNormalizationFilterFactory.java
@@ -0,0 +1,44 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.ckb.SoraniNormalizationFilter;
+import org.elasticsearch.common.inject.Inject;
+import org.elasticsearch.common.inject.assistedinject.Assisted;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.index.Index;
+import org.elasticsearch.index.settings.IndexSettings;
+
+/**
+ * Factory for {@link SoraniNormalizationFilter}
+ */
+public class SoraniNormalizationFilterFactory extends AbstractTokenFilterFactory {
+
+    @Inject
+    public SoraniNormalizationFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
+        super(index, indexSettings, name, settings);
+    }
+
+    @Override
+    public TokenStream create(TokenStream tokenStream) {
+        return new SoraniNormalizationFilter(tokenStream);
+    }
+
+}
diff --git a/src/main/java/org/elasticsearch/index/analysis/StemmerTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/StemmerTokenFilterFactory.java
index a260e985204..f9861fd66fd 100644
--- a/src/main/java/org/elasticsearch/index/analysis/StemmerTokenFilterFactory.java
+++ b/src/main/java/org/elasticsearch/index/analysis/StemmerTokenFilterFactory.java
@@ -23,6 +23,7 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.ar.ArabicStemFilter;
 import org.apache.lucene.analysis.bg.BulgarianStemFilter;
 import org.apache.lucene.analysis.br.BrazilianStemFilter;
+import org.apache.lucene.analysis.ckb.SoraniStemFilter;
 import org.apache.lucene.analysis.cz.CzechStemFilter;
 import org.apache.lucene.analysis.de.GermanLightStemFilter;
 import org.apache.lucene.analysis.de.GermanMinimalStemFilter;
@@ -35,11 +36,15 @@ import org.apache.lucene.analysis.es.SpanishLightStemFilter;
 import org.apache.lucene.analysis.fi.FinnishLightStemFilter;
 import org.apache.lucene.analysis.fr.FrenchLightStemFilter;
 import org.apache.lucene.analysis.fr.FrenchMinimalStemFilter;
+import org.apache.lucene.analysis.gl.GalicianMinimalStemFilter;
+import org.apache.lucene.analysis.gl.GalicianStemFilter;
 import org.apache.lucene.analysis.hi.HindiStemFilter;
 import org.apache.lucene.analysis.hu.HungarianLightStemFilter;
 import org.apache.lucene.analysis.id.IndonesianStemFilter;
 import org.apache.lucene.analysis.it.ItalianLightStemFilter;
 import org.apache.lucene.analysis.lv.LatvianStemFilter;
+import org.apache.lucene.analysis.no.NorwegianLightStemFilter;
+import org.apache.lucene.analysis.no.NorwegianLightStemmer;
 import org.apache.lucene.analysis.no.NorwegianMinimalStemFilter;
 import org.apache.lucene.analysis.pt.PortugueseLightStemFilter;
 import org.apache.lucene.analysis.pt.PortugueseMinimalStemFilter;
@@ -137,6 +142,12 @@ public class StemmerTokenFilterFactory extends AbstractTokenFilterFactory {
             return new FrenchLightStemFilter(tokenStream);
         } else if ("minimal_french".equalsIgnoreCase(language) || "minimalFrench".equalsIgnoreCase(language)) {
             return new FrenchMinimalStemFilter(tokenStream);
+
+        // Galician stemmers
+        } else if ("galician".equalsIgnoreCase(language)) {
+            return new GalicianStemFilter(tokenStream);
+        } else if ("minimal_galician".equalsIgnoreCase(language)) {
+            return new GalicianMinimalStemFilter(tokenStream);
 
         // German stemmers
         } else if ("german".equalsIgnoreCase(language)) {
@@ -161,6 +172,10 @@ public class StemmerTokenFilterFactory extends AbstractTokenFilterFactory {
         } else if ("indonesian".equalsIgnoreCase(language)) {
             return new IndonesianStemFilter(tokenStream);
+
+        // Irish stemmer
+        } else if ("irish".equalsIgnoreCase(language)) {
+            return new SnowballFilter(tokenStream, new IrishStemmer());
 
         // Italian stemmers
         } else if ("italian".equalsIgnoreCase(language)) {
@@ -171,11 +186,19 @@ public class StemmerTokenFilterFactory extends AbstractTokenFilterFactory {
         } else if ("latvian".equalsIgnoreCase(language)) {
             return new LatvianStemFilter(tokenStream);
 
-        // Norwegian stemmers
+        // Norwegian (Bokmål) stemmers
         } else if ("norwegian".equalsIgnoreCase(language)) {
             return new SnowballFilter(tokenStream, new NorwegianStemmer());
+        } else if ("light_norwegian".equalsIgnoreCase(language) || "lightNorwegian".equalsIgnoreCase(language)) {
+            return new NorwegianLightStemFilter(tokenStream);
         } else if ("minimal_norwegian".equalsIgnoreCase(language) || "minimalNorwegian".equals(language)) {
             return new NorwegianMinimalStemFilter(tokenStream);
+
+        // Norwegian (Nynorsk) stemmers
+        } else if ("light_nynorsk".equalsIgnoreCase(language) || "lightNynorsk".equalsIgnoreCase(language)) {
+            return new NorwegianLightStemFilter(tokenStream, NorwegianLightStemmer.NYNORSK);
+        } else if ("minimal_nynorsk".equalsIgnoreCase(language) || "minimalNynorsk".equalsIgnoreCase(language)) {
+            return new NorwegianMinimalStemFilter(tokenStream, NorwegianLightStemmer.NYNORSK);
 
         // Portuguese stemmers
         } else if ("portuguese".equalsIgnoreCase(language)) {
@@ -201,6 +224,10 @@ public class StemmerTokenFilterFactory extends AbstractTokenFilterFactory {
             return new SnowballFilter(tokenStream, new SpanishStemmer());
         } else if ("light_spanish".equalsIgnoreCase(language) || "lightSpanish".equalsIgnoreCase(language)) {
             return new SpanishLightStemFilter(tokenStream);
+
+        // Sorani Kurdish stemmer
+        } else if ("sorani".equalsIgnoreCase(language)) {
+            return new SoraniStemFilter(tokenStream);
 
         // Swedish stemmers
         } else if ("swedish".equalsIgnoreCase(language)) {
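As a reference for reviewers, a hypothetical helper (not in the patch; the class and method names are invented) showing the concrete Lucene chains the new stemmer names added above resolve to, fed from a plain whitespace tokenizer:

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ckb.SoraniStemFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.no.NorwegianLightStemFilter;
import org.apache.lucene.analysis.no.NorwegianLightStemmer;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.IrishStemmer;

public class NewStemmerChains {

    // "stemmer" with language "irish" resolves to a Snowball stemmer.
    public static TokenStream irish(Version version, String text) {
        return new SnowballFilter(tokenize(version, text), new IrishStemmer());
    }

    // "stemmer" with language "sorani" resolves to SoraniStemFilter.
    public static TokenStream sorani(Version version, String text) {
        return new SoraniStemFilter(tokenize(version, text));
    }

    // "light_nynorsk" is the Norwegian light stemmer with the NYNORSK flag.
    public static TokenStream lightNynorsk(Version version, String text) {
        return new NorwegianLightStemFilter(tokenize(version, text), NorwegianLightStemmer.NYNORSK);
    }

    private static TokenStream tokenize(Version version, String text) {
        // The simplest possible token source, for demonstration purposes.
        return new WhitespaceTokenizer(version, new StringReader(text));
    }
}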
diff --git a/src/main/java/org/elasticsearch/index/analysis/ThaiTokenizerFactory.java b/src/main/java/org/elasticsearch/index/analysis/ThaiTokenizerFactory.java
new file mode 100644
index 00000000000..a23d3219652
--- /dev/null
+++ b/src/main/java/org/elasticsearch/index/analysis/ThaiTokenizerFactory.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.th.ThaiTokenizer;
+import org.elasticsearch.common.inject.Inject;
+import org.elasticsearch.common.inject.assistedinject.Assisted;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.index.Index;
+import org.elasticsearch.index.settings.IndexSettings;
+
+import java.io.Reader;
+
+/**
+ * Factory for {@link ThaiTokenizer}
+ */
+public class ThaiTokenizerFactory extends AbstractTokenizerFactory {
+
+    @Inject
+    public ThaiTokenizerFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
+        super(index, indexSettings, name, settings);
+    }
+
+    @Override
+    public Tokenizer create(Reader reader) {
+        return new ThaiTokenizer(reader);
+    }
+}
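Again purely illustrative (a hypothetical demo class, not in the patch): the tokenizer created by the factory above segments running Thai text, which has no spaces between words, into individual word tokens. The sample string is ordinary Thai for "hello"; the exact token boundaries depend on the JRE's Thai BreakIterator support.

import java.io.StringReader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.th.ThaiTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ThaiTokenizerDemo {
    public static void main(String[] args) throws Exception {
        // Same construction as ThaiTokenizerFactory.create(reader) above.
        Tokenizer tokenizer = new ThaiTokenizer(new StringReader("สวัสดีครับ"));
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            System.out.println(term.toString());
        }
        tokenizer.end();
        tokenizer.close();
    }
}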
put("wikipedia", Void.class); }}; @@ -80,6 +75,7 @@ public class AnalysisFactoryTests extends ElasticsearchTestCase { // exposed in ES + put("apostrophe", ApostropheFilterFactory.class); put("arabicnormalization", ArabicNormalizationFilterFactory.class); put("arabicstem", ArabicStemTokenFilterFactory.class); put("asciifolding", ASCIIFoldingTokenFilterFactory.class); @@ -87,6 +83,7 @@ public class AnalysisFactoryTests extends ElasticsearchTestCase { put("bulgarianstem", StemmerTokenFilterFactory.class); put("cjkbigram", CJKBigramFilterFactory.class); put("cjkwidth", CJKWidthFilterFactory.class); + put("classic", ClassicFilterFactory.class); put("commongrams", CommonGramsTokenFilterFactory.class); put("commongramsquery", CommonGramsTokenFilterFactory.class); put("czechstem", CzechStemTokenFilterFactory.class); @@ -99,16 +96,21 @@ public class AnalysisFactoryTests extends ElasticsearchTestCase { put("finnishlightstem", StemmerTokenFilterFactory.class); put("frenchlightstem", StemmerTokenFilterFactory.class); put("frenchminimalstem", StemmerTokenFilterFactory.class); + put("galicianminimalstem", StemmerTokenFilterFactory.class); + put("galicianstem", StemmerTokenFilterFactory.class); put("germanstem", GermanStemTokenFilterFactory.class); put("germanlightstem", StemmerTokenFilterFactory.class); put("germanminimalstem", StemmerTokenFilterFactory.class); + put("germannormalization", GermanNormalizationFilterFactory.class); put("greeklowercase", LowerCaseTokenFilterFactory.class); put("greekstem", StemmerTokenFilterFactory.class); - put("hindistem", StemmerTokenFilterFactory.class); + put("hindinormalization", HindiNormalizationFilterFactory.class); put("hindistem", StemmerTokenFilterFactory.class); put("hungarianlightstem", StemmerTokenFilterFactory.class); put("hunspellstem", HunspellTokenFilterFactory.class); put("hyphenationcompoundword", HyphenationCompoundWordTokenFilterFactory.class); + put("indicnormalization", IndicNormalizationFilterFactory.class); + put("irishlowercase", LowerCaseTokenFilterFactory.class); put("indonesianstem", StemmerTokenFilterFactory.class); put("italianlightstem", StemmerTokenFilterFactory.class); put("keepword", KeepWordFilterFactory.class); @@ -119,17 +121,23 @@ public class AnalysisFactoryTests extends ElasticsearchTestCase { put("limittokencount", LimitTokenCountFilterFactory.class); put("lowercase", LowerCaseTokenFilterFactory.class); put("ngram", NGramTokenFilterFactory.class); + put("norwegianlightstem", StemmerTokenFilterFactory.class); put("norwegianminimalstem", StemmerTokenFilterFactory.class); put("patterncapturegroup", PatternCaptureGroupTokenFilterFactory.class); put("patternreplace", PatternReplaceTokenFilterFactory.class); put("persiannormalization", PersianNormalizationFilterFactory.class); put("porterstem", PorterStemTokenFilterFactory.class); + put("portuguesestem", StemmerTokenFilterFactory.class); put("portugueselightstem", StemmerTokenFilterFactory.class); put("portugueseminimalstem", StemmerTokenFilterFactory.class); put("reversestring", ReverseTokenFilterFactory.class); put("russianlightstem", StemmerTokenFilterFactory.class); + put("scandinavianfolding", ScandinavianFoldingFilterFactory.class); + put("scandinaviannormalization", ScandinavianNormalizationFilterFactory.class); put("shingle", ShingleTokenFilterFactory.class); put("snowballporter", SnowballTokenFilterFactory.class); + put("soraninormalization", SoraniNormalizationFilterFactory.class); + put("soranistem", StemmerTokenFilterFactory.class); put("spanishlightstem", 
StemmerTokenFilterFactory.class); put("standard", StandardTokenFilterFactory.class); put("stemmeroverride", StemmerOverrideTokenFilterFactory.class); @@ -144,46 +152,20 @@ public class AnalysisFactoryTests extends ElasticsearchTestCase { // TODO: these tokenfilters are not yet exposed: useful? - // useful for turkish language - put("apostrophe", Void.class); // capitalizes tokens put("capitalization", Void.class); - // cleans up after classic tokenizer - put("classic", Void.class); // like length filter (but codepoints) put("codepointcount", Void.class); - // galician language stemmers - put("galicianminimalstem", Void.class); - put("galicianstem", Void.class); - // o+umlaut=oe type normalization for german - put("germannormalization", Void.class); - // hindi text normalization - put("hindinormalization", Void.class); // puts hyphenated words back together put("hyphenatedwords", Void.class); - // unicode normalization for indian languages - put("indicnormalization", Void.class); - // lowercasing for irish: add to LowerCase (has a stemmer, too) - put("irishlowercase", Void.class); // repeats anything marked as keyword put("keywordrepeat", Void.class); // like limittokencount, but by position put("limittokenposition", Void.class); // ??? put("numericpayload", Void.class); - // RSLP stemmer for portuguese - put("portuguesestem", Void.class); - // light stemming for norwegian (has nb/nn options too) - put("norwegianlightstem", Void.class); // removes duplicates at the same position (this should be used by the existing factory) put("removeduplicates", Void.class); - // accent handling for scandinavian languages - put("scandinavianfolding", Void.class); - // less aggressive accent handling for scandinavian languages - put("scandinaviannormalization", Void.class); - // kurdish language support - put("soraninormalization", Void.class); - put("soranistem", Void.class); // ??? put("tokenoffsetpayload", Void.class); // like a stop filter but by token-type
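To close, one more hypothetical sketch (not in the patch; the class name, input, and expected output are illustrative) chaining the classic tokenizer, the classic token filter, and the apostrophe filter at the Lucene level, the way a custom analyzer using the three new names would assemble them:

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.ClassicFilter;
import org.apache.lucene.analysis.standard.ClassicTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tr.ApostropheFilter;
import org.apache.lucene.util.Version;

public class ClassicChainDemo {
    public static void main(String[] args) throws Exception {
        // ClassicTokenizer keeps hostnames and acronyms together; ClassicFilter
        // then strips dots from acronyms and trailing possessives; ApostropheFilter
        // drops the apostrophe and everything after it (aimed at Turkish suffixes).
        ClassicTokenizer tokenizer = new ClassicTokenizer(Version.LUCENE_CURRENT,
                new StringReader("I.B.M. Türkiye'de example.com"));
        TokenStream ts = new ApostropheFilter(new ClassicFilter(tokenizer));
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(term.toString()); // expected (roughly): IBM, Türkiye, example.com
        }
        ts.end();
        ts.close();
    }
}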