Implement Lucene EstonianAnalyzer, Stemmer (#49149)

This PR adds a new analyzer and stemmer for the Estonian language.

Closes #48895
This commit is contained in:
gpaimla 2019-11-18 18:19:54 +02:00 committed by Christoph Büscher
parent 25cc8e3663
commit 7d20b50f45
7 changed files with 133 additions and 1 deletions

View File

@ -15,6 +15,7 @@ following types are supported:
<<danish-analyzer,`danish`>>,
<<dutch-analyzer,`dutch`>>,
<<english-analyzer,`english`>>,
<<estonian-analyzer,`estonian`>>,
<<finnish-analyzer,`finnish`>>,
<<french-analyzer,`french`>>,
<<galician-analyzer,`galician`>>,
@ -669,6 +670,54 @@ PUT /english_example
// TEST[s/"english_keywords",//]
// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: english_example, first: english, second: rebuilt_english}\nendyaml\n/]
<1> The default stopwords can be overridden with the `stopwords`
or `stopwords_path` parameters.
<2> This filter should be removed unless there are words which should
be excluded from stemming.
[[estonian-analyzer]]
===== `estonian` analyzer
The `estonian` analyzer could be reimplemented as a `custom` analyzer as follows:
[source,console]
----------------------------------------------------
PUT /estonian_example
{
"settings": {
"analysis": {
"filter": {
"estonian_stop": {
"type": "stop",
"stopwords": "_estonian_" <1>
},
"estonian_keywords": {
"type": "keyword_marker",
"keywords": ["näide"] <2>
},
"estonian_stemmer": {
"type": "stemmer",
"language": "estonian"
}
},
"analyzer": {
"rebuilt_estonian": {
"tokenizer": "standard",
"filter": [
"lowercase",
"estonian_stop",
"estonian_keywords",
"estonian_stemmer"
]
}
}
}
}
}
----------------------------------------------------
// TEST[s/"estonian_keywords",//]
// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: estonian_example, first: estonian, second: rebuilt_estonian}\nendyaml\n/]
<1> The default stopwords can be overridden with the `stopwords`
or `stopwords_path` parameters.
<2> This filter should be removed unless there are words which should
be excluded from stemming.

View File

@ -70,7 +70,7 @@ PUT /my_index
Elasticsearch provides the following predefined list of languages:
`_arabic_`, `_armenian_`, `_basque_`, `_bengali_`, `_brazilian_`, `_bulgarian_`,
`_catalan_`, `_czech_`, `_danish_`, `_dutch_`, `_english_`, `_finnish_`,
`_catalan_`, `_czech_`, `_danish_`, `_dutch_`, `_english_`, `_estonian_`, `_finnish_`,
`_french_`, `_galician_`, `_german_`, `_greek_`, `_hindi_`, `_hungarian_`,
`_indonesian_`, `_irish_`, `_italian_`, `_latvian_`, `_norwegian_`, `_persian_`,
`_portuguese_`, `_romanian_`, `_russian_`, `_sorani_`, `_spanish_`,

View File

@ -56,6 +56,7 @@ import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.en.KStemFilter;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.es.SpanishAnalyzer;
import org.apache.lucene.analysis.et.EstonianAnalyzer;
import org.apache.lucene.analysis.eu.BasqueAnalyzer;
import org.apache.lucene.analysis.fa.PersianAnalyzer;
import org.apache.lucene.analysis.fa.PersianNormalizationFilter;
@ -193,6 +194,7 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin, Scri
analyzers.put("danish", DanishAnalyzerProvider::new);
analyzers.put("dutch", DutchAnalyzerProvider::new);
analyzers.put("english", EnglishAnalyzerProvider::new);
analyzers.put("estonian", EstonianAnalyzerProvider::new);
analyzers.put("finnish", FinnishAnalyzerProvider::new);
analyzers.put("french", FrenchAnalyzerProvider::new);
analyzers.put("galician", GalicianAnalyzerProvider::new);
@ -349,6 +351,7 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin, Scri
analyzers.add(new PreBuiltAnalyzerProviderFactory("danish", CachingStrategy.LUCENE, DanishAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("dutch", CachingStrategy.LUCENE, DutchAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("english", CachingStrategy.LUCENE, EnglishAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("estonian", CachingStrategy.LUCENE, EstonianAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("finnish", CachingStrategy.LUCENE, FinnishAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("french", CachingStrategy.LUCENE, FrenchAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("galician", CachingStrategy.LUCENE, GalicianAnalyzer::new));

View File

@ -0,0 +1,45 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.et.EstonianAnalyzer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider;
import org.elasticsearch.index.analysis.Analysis;
/**
 * Analyzer provider that builds a single Lucene {@link EstonianAnalyzer} for an index
 * and hands out that same instance on every {@link #get()} call.
 *
 * <p>The analyzer is constructed from two word sets obtained through {@link Analysis}:
 * the stop words (falling back to {@link EstonianAnalyzer#getDefaultStopSet()}) and the
 * stem-exclusion set (falling back to {@link CharArraySet#EMPTY_SET}).
 */
public class EstonianAnalyzerProvider extends AbstractIndexAnalyzerProvider<EstonianAnalyzer> {

    /** The analyzer instance created once in the constructor. */
    private final EstonianAnalyzer analyzer;

    /**
     * Creates the provider and eagerly builds the underlying analyzer.
     *
     * @param indexSettings settings of the index this analyzer belongs to
     * @param env           environment passed on to the stop-word parsing
     * @param name          name under which the analyzer is registered
     * @param settings      analyzer-level settings the word sets are parsed from
     */
    EstonianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
        super(indexSettings, name, settings);
        // Stop words are resolved first, then stem exclusions, matching the argument order below.
        final CharArraySet stopWords = Analysis.parseStopWords(env, settings, EstonianAnalyzer.getDefaultStopSet());
        final CharArraySet stemExclusions = Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET);

        final EstonianAnalyzer estonian = new EstonianAnalyzer(stopWords, stemExclusions);
        // `version` is inherited from the superclass.
        estonian.setVersion(version);
        this.analyzer = estonian;
    }

    /**
     * @return the analyzer built in the constructor
     */
    @Override
    public EstonianAnalyzer get() {
        return analyzer;
    }
}

View File

@ -65,6 +65,7 @@ import org.tartarus.snowball.ext.CatalanStemmer;
import org.tartarus.snowball.ext.DanishStemmer;
import org.tartarus.snowball.ext.DutchStemmer;
import org.tartarus.snowball.ext.EnglishStemmer;
import org.tartarus.snowball.ext.EstonianStemmer;
import org.tartarus.snowball.ext.FinnishStemmer;
import org.tartarus.snowball.ext.FrenchStemmer;
import org.tartarus.snowball.ext.German2Stemmer;
@ -142,6 +143,9 @@ public class StemmerTokenFilterFactory extends AbstractTokenFilterFactory {
} else if ("possessive_english".equalsIgnoreCase(language) || "possessiveEnglish".equalsIgnoreCase(language)) {
return new EnglishPossessiveFilter(tokenStream);
} else if ("estonian".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new EstonianStemmer());
// Finnish stemmers
} else if ("finnish".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new FinnishStemmer());

View File

@ -478,6 +478,35 @@
- length: { tokens: 1 }
- match: { tokens.0.token: book }
---
# REST test for the `estonian` analyzer: the same input word is analyzed once with the
# analyzer referenced by name and once with an index-defined analyzer of type `estonian`;
# both runs must yield the same single stemmed token.
"estonian":
# Create index `test` with a custom analyzer `my_analyzer` whose type is `estonian`.
- do:
indices.create:
index: test
body:
settings:
analysis:
analyzer:
my_analyzer:
type: estonian
# Analyze without an index, referring to the analyzer by its name `estonian`.
- do:
indices.analyze:
body:
text: teadaolevalt
analyzer: estonian
- length: { tokens: 1 }
- match: { tokens.0.token: teadaole }
# Analyze against index `test` using the index-level `my_analyzer`.
- do:
indices.analyze:
index: test
body:
text: teadaolevalt
analyzer: my_analyzer
- length: { tokens: 1 }
- match: { tokens.0.token: teadaole }
---
"finnish":
- do:

View File

@ -33,6 +33,7 @@ import org.apache.lucene.analysis.de.GermanAnalyzer;
import org.apache.lucene.analysis.el.GreekAnalyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.es.SpanishAnalyzer;
import org.apache.lucene.analysis.et.EstonianAnalyzer;
import org.apache.lucene.analysis.eu.BasqueAnalyzer;
import org.apache.lucene.analysis.fa.PersianAnalyzer;
import org.apache.lucene.analysis.fi.FinnishAnalyzer;
@ -127,6 +128,7 @@ public class Analysis {
namedStopWords.put("_danish_", DanishAnalyzer.getDefaultStopSet());
namedStopWords.put("_dutch_", DutchAnalyzer.getDefaultStopSet());
namedStopWords.put("_english_", EnglishAnalyzer.getDefaultStopSet());
namedStopWords.put("_estonian_", EstonianAnalyzer.getDefaultStopSet());
namedStopWords.put("_finnish_", FinnishAnalyzer.getDefaultStopSet());
namedStopWords.put("_french_", FrenchAnalyzer.getDefaultStopSet());
namedStopWords.put("_galician_", GalicianAnalyzer.getDefaultStopSet());