Implement Lucene EstonianAnalyzer, Stemmer (#49149)
This PR adds a new analyzer and stemmer for the Estonian language. Closes #48895
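As a quick illustration of what this change enables (a sketch, not part of the diff; the sample text and expected token come from the PR's own REST test further down), the new analyzer can be exercised through the `_analyze` API once the change is applied:

    GET /_analyze
    {
      "analyzer": "estonian",
      "text": "teadaolevalt"
    }

Per the YAML test added in this PR, this should produce the single token `teadaole`.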
parent 25cc8e3663
commit 7d20b50f45

@@ -15,6 +15,7 @@ following types are supported:
 <<danish-analyzer,`danish`>>,
 <<dutch-analyzer,`dutch`>>,
 <<english-analyzer,`english`>>,
+<<estonian-analyzer,`estonian`>>,
 <<finnish-analyzer,`finnish`>>,
 <<french-analyzer,`french`>>,
 <<galician-analyzer,`galician`>>,

@@ -669,6 +670,54 @@ PUT /english_example
 // TEST[s/"english_keywords",//]
 // TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: english_example, first: english, second: rebuilt_english}\nendyaml\n/]
 
+<1> The default stopwords can be overridden with the `stopwords`
+or `stopwords_path` parameters.
+<2> This filter should be removed unless there are words which should
+be excluded from stemming.
+
+[[estonian-analyzer]]
+===== `estonian` analyzer
+
+The `estonian` analyzer could be reimplemented as a `custom` analyzer as follows:
+
+[source,console]
+----------------------------------------------------
+PUT /estonian_example
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "estonian_stop": {
+          "type": "stop",
+          "stopwords": "_estonian_" <1>
+        },
+        "estonian_keywords": {
+          "type": "keyword_marker",
+          "keywords": ["näide"] <2>
+        },
+        "estonian_stemmer": {
+          "type": "stemmer",
+          "language": "estonian"
+        }
+      },
+      "analyzer": {
+        "rebuilt_estonian": {
+          "tokenizer": "standard",
+          "filter": [
+            "lowercase",
+            "estonian_stop",
+            "estonian_keywords",
+            "estonian_stemmer"
+          ]
+        }
+      }
+    }
+  }
+}
+----------------------------------------------------
+// TEST[s/"estonian_keywords",//]
+// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: estonian_example, first: estonian, second: rebuilt_estonian}\nendyaml\n/]
+
 <1> The default stopwords can be overridden with the `stopwords`
 or `stopwords_path` parameters.
 <2> This filter should be removed unless there are words which should

@@ -70,7 +70,7 @@ PUT /my_index
 Elasticsearch provides the following predefined list of languages:
 
 `_arabic_`, `_armenian_`, `_basque_`, `_bengali_`, `_brazilian_`, `_bulgarian_`,
-`_catalan_`, `_czech_`, `_danish_`, `_dutch_`, `_english_`, `_finnish_`,
+`_catalan_`, `_czech_`, `_danish_`, `_dutch_`, `_english_`, `_estonian_`, `_finnish_`,
 `_french_`, `_galician_`, `_german_`, `_greek_`, `_hindi_`, `_hungarian_`,
 `_indonesian_`, `_irish_`, `_italian_`, `_latvian_`, `_norwegian_`, `_persian_`,
 `_portuguese_`, `_romanian_`, `_russian_`, `_sorani_`, `_spanish_`,
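The new `_estonian_` entry can then be used wherever a predefined stopword list is accepted, for example in a `stop` token filter. A minimal sketch (the index and filter names are illustrative, not part of this change):

    PUT /estonian_stop_example
    {
      "settings": {
        "analysis": {
          "filter": {
            "my_estonian_stop": {
              "type": "stop",
              "stopwords": "_estonian_"
            }
          }
        }
      }
    }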

@@ -56,6 +56,7 @@ import org.apache.lucene.analysis.en.EnglishAnalyzer;
 import org.apache.lucene.analysis.en.KStemFilter;
 import org.apache.lucene.analysis.en.PorterStemFilter;
 import org.apache.lucene.analysis.es.SpanishAnalyzer;
+import org.apache.lucene.analysis.et.EstonianAnalyzer;
 import org.apache.lucene.analysis.eu.BasqueAnalyzer;
 import org.apache.lucene.analysis.fa.PersianAnalyzer;
 import org.apache.lucene.analysis.fa.PersianNormalizationFilter;

@@ -193,6 +194,7 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin, Scri
         analyzers.put("danish", DanishAnalyzerProvider::new);
         analyzers.put("dutch", DutchAnalyzerProvider::new);
         analyzers.put("english", EnglishAnalyzerProvider::new);
+        analyzers.put("estonian", EstonianAnalyzerProvider::new);
         analyzers.put("finnish", FinnishAnalyzerProvider::new);
         analyzers.put("french", FrenchAnalyzerProvider::new);
         analyzers.put("galician", GalicianAnalyzerProvider::new);

@@ -349,6 +351,7 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin, Scri
         analyzers.add(new PreBuiltAnalyzerProviderFactory("danish", CachingStrategy.LUCENE, DanishAnalyzer::new));
         analyzers.add(new PreBuiltAnalyzerProviderFactory("dutch", CachingStrategy.LUCENE, DutchAnalyzer::new));
         analyzers.add(new PreBuiltAnalyzerProviderFactory("english", CachingStrategy.LUCENE, EnglishAnalyzer::new));
+        analyzers.add(new PreBuiltAnalyzerProviderFactory("estonian", CachingStrategy.LUCENE, EstonianAnalyzer::new));
         analyzers.add(new PreBuiltAnalyzerProviderFactory("finnish", CachingStrategy.LUCENE, FinnishAnalyzer::new));
         analyzers.add(new PreBuiltAnalyzerProviderFactory("french", CachingStrategy.LUCENE, FrenchAnalyzer::new));
         analyzers.add(new PreBuiltAnalyzerProviderFactory("galician", CachingStrategy.LUCENE, GalicianAnalyzer::new));

@@ -0,0 +1,45 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.analysis.common;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.et.EstonianAnalyzer;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider;
+import org.elasticsearch.index.analysis.Analysis;
+
+public class EstonianAnalyzerProvider extends AbstractIndexAnalyzerProvider<EstonianAnalyzer> {
+
+    private final EstonianAnalyzer analyzer;
+
+    EstonianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+        super(indexSettings, name, settings);
+        analyzer = new EstonianAnalyzer(
+            Analysis.parseStopWords(env, settings, EstonianAnalyzer.getDefaultStopSet()),
+            Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
+        );
+        analyzer.setVersion(version);
+    }
+
+    @Override
+    public EstonianAnalyzer get() { return this.analyzer; }
+}
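Like the existing language analyzer providers, this one reads the optional `stopwords` and `stem_exclusion` index settings (via `Analysis.parseStopWords` and `Analysis.parseStemExclusion`). A minimal configuration sketch, assuming the usual language-analyzer settings; the index name and word values are illustrative:

    PUT /estonian_custom_example
    {
      "settings": {
        "analysis": {
          "analyzer": {
            "my_estonian": {
              "type": "estonian",
              "stopwords": "_estonian_",
              "stem_exclusion": ["näide"]
            }
          }
        }
      }
    }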

@@ -65,6 +65,7 @@ import org.tartarus.snowball.ext.CatalanStemmer;
 import org.tartarus.snowball.ext.DanishStemmer;
 import org.tartarus.snowball.ext.DutchStemmer;
 import org.tartarus.snowball.ext.EnglishStemmer;
+import org.tartarus.snowball.ext.EstonianStemmer;
 import org.tartarus.snowball.ext.FinnishStemmer;
 import org.tartarus.snowball.ext.FrenchStemmer;
 import org.tartarus.snowball.ext.German2Stemmer;

@@ -142,6 +143,9 @@ public class StemmerTokenFilterFactory extends AbstractTokenFilterFactory {
         } else if ("possessive_english".equalsIgnoreCase(language) || "possessiveEnglish".equalsIgnoreCase(language)) {
             return new EnglishPossessiveFilter(tokenStream);
 
+        } else if ("estonian".equalsIgnoreCase(language)) {
+            return new SnowballFilter(tokenStream, new EstonianStemmer());
+
         // Finnish stemmers
         } else if ("finnish".equalsIgnoreCase(language)) {
             return new SnowballFilter(tokenStream, new FinnishStemmer());
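With this branch in place, the Estonian Snowball stemmer can also be requested through the generic `stemmer` token filter, as the documentation change above reflects. A minimal sketch (index and filter names are illustrative, not part of this change):

    PUT /estonian_stemmer_example
    {
      "settings": {
        "analysis": {
          "filter": {
            "my_estonian_stemmer": {
              "type": "stemmer",
              "language": "estonian"
            }
          }
        }
      }
    }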

@@ -478,6 +478,35 @@
     - length: { tokens: 1 }
     - match: { tokens.0.token: book }
 
+---
+"estonian":
+    - do:
+        indices.create:
+            index: test
+            body:
+                settings:
+                    analysis:
+                        analyzer:
+                            my_analyzer:
+                                type: estonian
+
+    - do:
+        indices.analyze:
+            body:
+                text: teadaolevalt
+                analyzer: estonian
+    - length: { tokens: 1 }
+    - match: { tokens.0.token: teadaole }
+
+    - do:
+        indices.analyze:
+            index: test
+            body:
+                text: teadaolevalt
+                analyzer: my_analyzer
+    - length: { tokens: 1 }
+    - match: { tokens.0.token: teadaole }
+
 ---
 "finnish":
     - do:

@@ -33,6 +33,7 @@ import org.apache.lucene.analysis.de.GermanAnalyzer;
 import org.apache.lucene.analysis.el.GreekAnalyzer;
 import org.apache.lucene.analysis.en.EnglishAnalyzer;
 import org.apache.lucene.analysis.es.SpanishAnalyzer;
+import org.apache.lucene.analysis.et.EstonianAnalyzer;
 import org.apache.lucene.analysis.eu.BasqueAnalyzer;
 import org.apache.lucene.analysis.fa.PersianAnalyzer;
 import org.apache.lucene.analysis.fi.FinnishAnalyzer;

@@ -127,6 +128,7 @@ public class Analysis {
         namedStopWords.put("_danish_", DanishAnalyzer.getDefaultStopSet());
         namedStopWords.put("_dutch_", DutchAnalyzer.getDefaultStopSet());
         namedStopWords.put("_english_", EnglishAnalyzer.getDefaultStopSet());
+        namedStopWords.put("_estonian_", EstonianAnalyzer.getDefaultStopSet());
         namedStopWords.put("_finnish_", FinnishAnalyzer.getDefaultStopSet());
         namedStopWords.put("_french_", FrenchAnalyzer.getDefaultStopSet());
         namedStopWords.put("_galician_", GalicianAnalyzer.getDefaultStopSet());