From 7d20b50f456508bcac559090ef2688b49e250f3e Mon Sep 17 00:00:00 2001 From: gpaimla Date: Mon, 18 Nov 2019 18:19:54 +0200 Subject: [PATCH] Implement Lucene EstonianAnalyzer, Stemmer (#49149) This PR adds a new analyzer and stemmer for the Estonian language. Closes #48895 --- .../analysis/analyzers/lang-analyzer.asciidoc | 49 +++++++++++++++++++ .../tokenfilters/stop-tokenfilter.asciidoc | 2 +- .../analysis/common/CommonAnalysisPlugin.java | 3 ++ .../common/EstonianAnalyzerProvider.java | 45 +++++++++++++++++ .../common/StemmerTokenFilterFactory.java | 4 ++ .../test/analysis-common/20_analyzers.yml | 29 +++++++++++ .../index/analysis/Analysis.java | 2 + 7 files changed, 133 insertions(+), 1 deletion(-) create mode 100644 modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EstonianAnalyzerProvider.java diff --git a/docs/reference/analysis/analyzers/lang-analyzer.asciidoc b/docs/reference/analysis/analyzers/lang-analyzer.asciidoc index 3c1047d81ef..a57fef8d28e 100644 --- a/docs/reference/analysis/analyzers/lang-analyzer.asciidoc +++ b/docs/reference/analysis/analyzers/lang-analyzer.asciidoc @@ -15,6 +15,7 @@ following types are supported: <<danish-analyzer,`danish`>>, <<dutch-analyzer,`dutch`>>, <<english-analyzer,`english`>>, +<<estonian-analyzer,`estonian`>>, <<finnish-analyzer,`finnish`>>, <<french-analyzer,`french`>>, <<galician-analyzer,`galician`>>, @@ -669,6 +670,54 @@ PUT /english_example // TEST[s/"english_keywords",//] // TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: english_example, first: english, second: rebuilt_english}\nendyaml\n/] +<1> The default stopwords can be overridden with the `stopwords` + or `stopwords_path` parameters. +<2> This filter should be removed unless there are words which should + be excluded from stemming. 
+ +[[estonian-analyzer]] +===== `estonian` analyzer + +The `estonian` analyzer could be reimplemented as a `custom` analyzer as follows: + +[source,console] +---------------------------------------------------- +PUT /estonian_example +{ + "settings": { + "analysis": { + "filter": { + "estonian_stop": { + "type": "stop", + "stopwords": "_estonian_" <1> + }, + "estonian_keywords": { + "type": "keyword_marker", + "keywords": ["näide"] <2> + }, + "estonian_stemmer": { + "type": "stemmer", + "language": "estonian" + } + }, + "analyzer": { + "rebuilt_estonian": { + "tokenizer": "standard", + "filter": [ + "lowercase", + "estonian_stop", + "estonian_keywords", + "estonian_stemmer" + ] + } + } + } + } +} +---------------------------------------------------- +// TEST[s/"estonian_keywords",//] +// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: estonian_example, first: estonian, second: rebuilt_estonian}\nendyaml\n/] + <1> The default stopwords can be overridden with the `stopwords` or `stopwords_path` parameters. 
<2> This filter should be removed unless there are words which should diff --git a/docs/reference/analysis/tokenfilters/stop-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/stop-tokenfilter.asciidoc index d73d878696e..f4019fa1800 100644 --- a/docs/reference/analysis/tokenfilters/stop-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/stop-tokenfilter.asciidoc @@ -70,7 +70,7 @@ PUT /my_index Elasticsearch provides the following predefined list of languages: `_arabic_`, `_armenian_`, `_basque_`, `_bengali_`, `_brazilian_`, `_bulgarian_`, -`_catalan_`, `_czech_`, `_danish_`, `_dutch_`, `_english_`, `_finnish_`, +`_catalan_`, `_czech_`, `_danish_`, `_dutch_`, `_english_`, `_estonian_`, `_finnish_`, `_french_`, `_galician_`, `_german_`, `_greek_`, `_hindi_`, `_hungarian_`, `_indonesian_`, `_irish_`, `_italian_`, `_latvian_`, `_norwegian_`, `_persian_`, `_portuguese_`, `_romanian_`, `_russian_`, `_sorani_`, `_spanish_`, diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java index 8e29c8d0830..3163147640a 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java @@ -56,6 +56,7 @@ import org.apache.lucene.analysis.en.EnglishAnalyzer; import org.apache.lucene.analysis.en.KStemFilter; import org.apache.lucene.analysis.en.PorterStemFilter; import org.apache.lucene.analysis.es.SpanishAnalyzer; +import org.apache.lucene.analysis.et.EstonianAnalyzer; import org.apache.lucene.analysis.eu.BasqueAnalyzer; import org.apache.lucene.analysis.fa.PersianAnalyzer; import org.apache.lucene.analysis.fa.PersianNormalizationFilter; @@ -193,6 +194,7 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin, Scri analyzers.put("danish", 
DanishAnalyzerProvider::new); analyzers.put("dutch", DutchAnalyzerProvider::new); analyzers.put("english", EnglishAnalyzerProvider::new); + analyzers.put("estonian", EstonianAnalyzerProvider::new); analyzers.put("finnish", FinnishAnalyzerProvider::new); analyzers.put("french", FrenchAnalyzerProvider::new); analyzers.put("galician", GalicianAnalyzerProvider::new); @@ -349,6 +351,7 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin, Scri analyzers.add(new PreBuiltAnalyzerProviderFactory("danish", CachingStrategy.LUCENE, DanishAnalyzer::new)); analyzers.add(new PreBuiltAnalyzerProviderFactory("dutch", CachingStrategy.LUCENE, DutchAnalyzer::new)); analyzers.add(new PreBuiltAnalyzerProviderFactory("english", CachingStrategy.LUCENE, EnglishAnalyzer::new)); + analyzers.add(new PreBuiltAnalyzerProviderFactory("estonian", CachingStrategy.LUCENE, EstonianAnalyzer::new)); analyzers.add(new PreBuiltAnalyzerProviderFactory("finnish", CachingStrategy.LUCENE, FinnishAnalyzer::new)); analyzers.add(new PreBuiltAnalyzerProviderFactory("french", CachingStrategy.LUCENE, FrenchAnalyzer::new)); analyzers.add(new PreBuiltAnalyzerProviderFactory("galician", CachingStrategy.LUCENE, GalicianAnalyzer::new)); diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EstonianAnalyzerProvider.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EstonianAnalyzerProvider.java new file mode 100644 index 00000000000..f5f6585c35c --- /dev/null +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EstonianAnalyzerProvider.java @@ -0,0 +1,45 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. 
Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.analysis.common; + +import org.apache.lucene.analysis.CharArraySet; +import org.apache.lucene.analysis.et.EstonianAnalyzer; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider; +import org.elasticsearch.index.analysis.Analysis; + +public class EstonianAnalyzerProvider extends AbstractIndexAnalyzerProvider<EstonianAnalyzer> { + + private final EstonianAnalyzer analyzer; + + EstonianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) { + super(indexSettings, name, settings); + analyzer = new EstonianAnalyzer( + Analysis.parseStopWords(env, settings, EstonianAnalyzer.getDefaultStopSet()), + Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET) + ); + analyzer.setVersion(version); + } + + @Override + public EstonianAnalyzer get() { return this.analyzer; } +} diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactory.java index b94f7f6499a..da38b35ed15 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactory.java +++ 
b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactory.java @@ -65,6 +65,7 @@ import org.tartarus.snowball.ext.CatalanStemmer; import org.tartarus.snowball.ext.DanishStemmer; import org.tartarus.snowball.ext.DutchStemmer; import org.tartarus.snowball.ext.EnglishStemmer; +import org.tartarus.snowball.ext.EstonianStemmer; import org.tartarus.snowball.ext.FinnishStemmer; import org.tartarus.snowball.ext.FrenchStemmer; import org.tartarus.snowball.ext.German2Stemmer; @@ -142,6 +143,9 @@ public class StemmerTokenFilterFactory extends AbstractTokenFilterFactory { } else if ("possessive_english".equalsIgnoreCase(language) || "possessiveEnglish".equalsIgnoreCase(language)) { return new EnglishPossessiveFilter(tokenStream); + } else if ("estonian".equalsIgnoreCase(language)) { + return new SnowballFilter(tokenStream, new EstonianStemmer()); + // Finnish stemmers } else if ("finnish".equalsIgnoreCase(language)) { return new SnowballFilter(tokenStream, new FinnishStemmer()); diff --git a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/20_analyzers.yml b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/20_analyzers.yml index fe5b997974a..306dea5918b 100644 --- a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/20_analyzers.yml +++ b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/20_analyzers.yml @@ -478,6 +478,35 @@ - length: { tokens: 1 } - match: { tokens.0.token: book } +--- +"estonian": + - do: + indices.create: + index: test + body: + settings: + analysis: + analyzer: + my_analyzer: + type: estonian + + - do: + indices.analyze: + body: + text: teadaolevalt + analyzer: estonian + - length: { tokens: 1 } + - match: { tokens.0.token: teadaole } + + - do: + indices.analyze: + index: test + body: + text: teadaolevalt + analyzer: my_analyzer + - length: { tokens: 1 } + - match: { tokens.0.token: teadaole } + --- 
"finnish": - do: diff --git a/server/src/main/java/org/elasticsearch/index/analysis/Analysis.java b/server/src/main/java/org/elasticsearch/index/analysis/Analysis.java index 90d77981a76..d1a4f89fee9 100644 --- a/server/src/main/java/org/elasticsearch/index/analysis/Analysis.java +++ b/server/src/main/java/org/elasticsearch/index/analysis/Analysis.java @@ -33,6 +33,7 @@ import org.apache.lucene.analysis.de.GermanAnalyzer; import org.apache.lucene.analysis.el.GreekAnalyzer; import org.apache.lucene.analysis.en.EnglishAnalyzer; import org.apache.lucene.analysis.es.SpanishAnalyzer; +import org.apache.lucene.analysis.et.EstonianAnalyzer; import org.apache.lucene.analysis.eu.BasqueAnalyzer; import org.apache.lucene.analysis.fa.PersianAnalyzer; import org.apache.lucene.analysis.fi.FinnishAnalyzer; @@ -127,6 +128,7 @@ public class Analysis { namedStopWords.put("_danish_", DanishAnalyzer.getDefaultStopSet()); namedStopWords.put("_dutch_", DutchAnalyzer.getDefaultStopSet()); namedStopWords.put("_english_", EnglishAnalyzer.getDefaultStopSet()); + namedStopWords.put("_estonian_", EstonianAnalyzer.getDefaultStopSet()); namedStopWords.put("_finnish_", FinnishAnalyzer.getDefaultStopSet()); namedStopWords.put("_french_", FrenchAnalyzer.getDefaultStopSet()); namedStopWords.put("_galician_", GalicianAnalyzer.getDefaultStopSet());