Added Bengali Analyzer to Elasticsearch with respect to the Lucene update (PR#238)
This commit is contained in:
parent
a978ddf37b
commit
a40c474e10
|
@ -54,13 +54,14 @@ The KStem stemmer in
|
||||||
was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
|
was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
|
||||||
under the BSD-license.
|
under the BSD-license.
|
||||||
|
|
||||||
The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default
|
The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default
|
||||||
stopword list that is BSD-licensed created by Jacques Savoy. These files reside in:
|
stopword list that is BSD-licensed created by Jacques Savoy. These files reside in:
|
||||||
analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
|
analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
|
||||||
analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
|
analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
|
||||||
analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
|
analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
|
||||||
analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
|
analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
|
||||||
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt
|
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt,
|
||||||
|
analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
|
||||||
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||||
|
|
||||||
The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
|
The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
|
||||||
|
|
|
@ -23,6 +23,7 @@ import org.apache.logging.log4j.Logger;
|
||||||
import org.apache.lucene.analysis.CharArraySet;
|
import org.apache.lucene.analysis.CharArraySet;
|
||||||
import org.apache.lucene.analysis.ar.ArabicAnalyzer;
|
import org.apache.lucene.analysis.ar.ArabicAnalyzer;
|
||||||
import org.apache.lucene.analysis.bg.BulgarianAnalyzer;
|
import org.apache.lucene.analysis.bg.BulgarianAnalyzer;
|
||||||
|
import org.apache.lucene.analysis.bn.BengaliAnalyzer;
|
||||||
import org.apache.lucene.analysis.br.BrazilianAnalyzer;
|
import org.apache.lucene.analysis.br.BrazilianAnalyzer;
|
||||||
import org.apache.lucene.analysis.ca.CatalanAnalyzer;
|
import org.apache.lucene.analysis.ca.CatalanAnalyzer;
|
||||||
import org.apache.lucene.analysis.ckb.SoraniAnalyzer;
|
import org.apache.lucene.analysis.ckb.SoraniAnalyzer;
|
||||||
|
@ -119,6 +120,7 @@ public class Analysis {
|
||||||
namedStopWords.put("_arabic_", ArabicAnalyzer.getDefaultStopSet());
|
namedStopWords.put("_arabic_", ArabicAnalyzer.getDefaultStopSet());
|
||||||
namedStopWords.put("_armenian_", ArmenianAnalyzer.getDefaultStopSet());
|
namedStopWords.put("_armenian_", ArmenianAnalyzer.getDefaultStopSet());
|
||||||
namedStopWords.put("_basque_", BasqueAnalyzer.getDefaultStopSet());
|
namedStopWords.put("_basque_", BasqueAnalyzer.getDefaultStopSet());
|
||||||
|
namedStopWords.put("_bengali_", BengaliAnalyzer.getDefaultStopSet());
|
||||||
namedStopWords.put("_brazilian_", BrazilianAnalyzer.getDefaultStopSet());
|
namedStopWords.put("_brazilian_", BrazilianAnalyzer.getDefaultStopSet());
|
||||||
namedStopWords.put("_bulgarian_", BulgarianAnalyzer.getDefaultStopSet());
|
namedStopWords.put("_bulgarian_", BulgarianAnalyzer.getDefaultStopSet());
|
||||||
namedStopWords.put("_catalan_", CatalanAnalyzer.getDefaultStopSet());
|
namedStopWords.put("_catalan_", CatalanAnalyzer.getDefaultStopSet());
|
||||||
|
|
|
@ -0,0 +1,45 @@
|
||||||
|
/*
|
||||||
|
* Licensed to Elasticsearch under one or more contributor
|
||||||
|
* license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright
|
||||||
|
* ownership. Elasticsearch licenses this file to you under
|
||||||
|
* the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
* not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing,
|
||||||
|
* software distributed under the License is distributed on an
|
||||||
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
* KIND, either express or implied. See the License for the
|
||||||
|
* specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.elasticsearch.index.analysis;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.CharArraySet;
|
||||||
|
import org.apache.lucene.analysis.bn.BengaliAnalyzer;
|
||||||
|
import org.elasticsearch.common.settings.Settings;
|
||||||
|
import org.elasticsearch.env.Environment;
|
||||||
|
import org.elasticsearch.index.IndexSettings;
|
||||||
|
|
||||||
|
public class BengaliAnalyzerProvider extends AbstractIndexAnalyzerProvider<BengaliAnalyzer> {
|
||||||
|
|
||||||
|
private final BengaliAnalyzer analyzer;
|
||||||
|
|
||||||
|
public BengaliAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
|
||||||
|
super(indexSettings, name, settings);
|
||||||
|
analyzer = new BengaliAnalyzer(
|
||||||
|
Analysis.parseStopWords(env, settings, BengaliAnalyzer.getDefaultStopSet()),
|
||||||
|
Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
|
||||||
|
);
|
||||||
|
analyzer.setVersion(version);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public BengaliAnalyzer get() {
|
||||||
|
return this.analyzer;
|
||||||
|
}
|
||||||
|
}
|
|
@ -32,6 +32,7 @@ import org.elasticsearch.index.analysis.AnalyzerProvider;
|
||||||
import org.elasticsearch.index.analysis.ArabicAnalyzerProvider;
|
import org.elasticsearch.index.analysis.ArabicAnalyzerProvider;
|
||||||
import org.elasticsearch.index.analysis.ArmenianAnalyzerProvider;
|
import org.elasticsearch.index.analysis.ArmenianAnalyzerProvider;
|
||||||
import org.elasticsearch.index.analysis.BasqueAnalyzerProvider;
|
import org.elasticsearch.index.analysis.BasqueAnalyzerProvider;
|
||||||
|
import org.elasticsearch.index.analysis.BengaliAnalyzerProvider;
|
||||||
import org.elasticsearch.index.analysis.BrazilianAnalyzerProvider;
|
import org.elasticsearch.index.analysis.BrazilianAnalyzerProvider;
|
||||||
import org.elasticsearch.index.analysis.BulgarianAnalyzerProvider;
|
import org.elasticsearch.index.analysis.BulgarianAnalyzerProvider;
|
||||||
import org.elasticsearch.index.analysis.CatalanAnalyzerProvider;
|
import org.elasticsearch.index.analysis.CatalanAnalyzerProvider;
|
||||||
|
@ -270,6 +271,7 @@ public final class AnalysisModule {
|
||||||
analyzers.register("arabic", ArabicAnalyzerProvider::new);
|
analyzers.register("arabic", ArabicAnalyzerProvider::new);
|
||||||
analyzers.register("armenian", ArmenianAnalyzerProvider::new);
|
analyzers.register("armenian", ArmenianAnalyzerProvider::new);
|
||||||
analyzers.register("basque", BasqueAnalyzerProvider::new);
|
analyzers.register("basque", BasqueAnalyzerProvider::new);
|
||||||
|
analyzers.register("bengali", BengaliAnalyzerProvider::new);
|
||||||
analyzers.register("brazilian", BrazilianAnalyzerProvider::new);
|
analyzers.register("brazilian", BrazilianAnalyzerProvider::new);
|
||||||
analyzers.register("bulgarian", BulgarianAnalyzerProvider::new);
|
analyzers.register("bulgarian", BulgarianAnalyzerProvider::new);
|
||||||
analyzers.register("catalan", CatalanAnalyzerProvider::new);
|
analyzers.register("catalan", CatalanAnalyzerProvider::new);
|
||||||
|
|
|
@ -22,6 +22,7 @@ import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.CharArraySet;
|
import org.apache.lucene.analysis.CharArraySet;
|
||||||
import org.apache.lucene.analysis.ar.ArabicAnalyzer;
|
import org.apache.lucene.analysis.ar.ArabicAnalyzer;
|
||||||
import org.apache.lucene.analysis.bg.BulgarianAnalyzer;
|
import org.apache.lucene.analysis.bg.BulgarianAnalyzer;
|
||||||
|
import org.apache.lucene.analysis.bn.BengaliAnalyzer;
|
||||||
import org.apache.lucene.analysis.br.BrazilianAnalyzer;
|
import org.apache.lucene.analysis.br.BrazilianAnalyzer;
|
||||||
import org.apache.lucene.analysis.ca.CatalanAnalyzer;
|
import org.apache.lucene.analysis.ca.CatalanAnalyzer;
|
||||||
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
|
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
|
||||||
|
@ -183,6 +184,15 @@ public enum PreBuiltAnalyzers {
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|
||||||
|
BENGALI {
|
||||||
|
@Override
|
||||||
|
protected Analyzer create(Version version) {
|
||||||
|
Analyzer a = new BengaliAnalyzer();
|
||||||
|
a.setVersion(version.luceneVersion);
|
||||||
|
return a;
|
||||||
|
}
|
||||||
|
},
|
||||||
|
|
||||||
BRAZILIAN {
|
BRAZILIAN {
|
||||||
@Override
|
@Override
|
||||||
protected Analyzer create(Version version) {
|
protected Analyzer create(Version version) {
|
||||||
|
|
|
@ -6,6 +6,7 @@ following types are supported:
|
||||||
<<arabic-analyzer,`arabic`>>,
|
<<arabic-analyzer,`arabic`>>,
|
||||||
<<armenian-analyzer,`armenian`>>,
|
<<armenian-analyzer,`armenian`>>,
|
||||||
<<basque-analyzer,`basque`>>,
|
<<basque-analyzer,`basque`>>,
|
||||||
|
<<bengali-analyzer,`bengali`>>,
|
||||||
<<brazilian-analyzer,`brazilian`>>,
|
<<brazilian-analyzer,`brazilian`>>,
|
||||||
<<bulgarian-analyzer,`bulgarian`>>,
|
<<bulgarian-analyzer,`bulgarian`>>,
|
||||||
<<catalan-analyzer,`catalan`>>,
|
<<catalan-analyzer,`catalan`>>,
|
||||||
|
@ -55,7 +56,7 @@ functionality is implemented by adding the
|
||||||
with the `keywords` set to the value of the `stem_exclusion` parameter.
|
with the `keywords` set to the value of the `stem_exclusion` parameter.
|
||||||
|
|
||||||
The following analyzers support setting custom `stem_exclusion` list:
|
The following analyzers support setting custom `stem_exclusion` list:
|
||||||
`arabic`, `armenian`, `basque`, `bulgarian`, `catalan`, `czech`,
|
`arabic`, `armenian`, `basque`, `bengali`, `bulgarian`, `catalan`, `czech`,
|
||||||
`dutch`, `english`, `finnish`, `french`, `galician`,
|
`dutch`, `english`, `finnish`, `french`, `galician`,
|
||||||
`german`, `hindi`, `hungarian`, `indonesian`, `irish`, `italian`, `latvian`,
|
`german`, `hindi`, `hungarian`, `indonesian`, `irish`, `italian`, `latvian`,
|
||||||
`lithuanian`, `norwegian`, `portuguese`, `romanian`, `russian`, `sorani`,
|
`lithuanian`, `norwegian`, `portuguese`, `romanian`, `russian`, `sorani`,
|
||||||
|
@ -209,6 +210,54 @@ PUT /armenian_example
|
||||||
<2> This filter should be removed unless there are words which should
|
<2> This filter should be removed unless there are words which should
|
||||||
be excluded from stemming.
|
be excluded from stemming.
|
||||||
|
|
||||||
|
[[bengali-analyzer]]
|
||||||
|
===== `bengali` analyzer
|
||||||
|
|
||||||
|
The `bengali` analyzer could be reimplemented as a `custom` analyzer as follows:
|
||||||
|
|
||||||
|
[source,js]
|
||||||
|
----------------------------------------------------
|
||||||
|
PUT /bengali_example
|
||||||
|
{
|
||||||
|
"settings": {
|
||||||
|
"analysis": {
|
||||||
|
"filter": {
|
||||||
|
"bengali_stop": {
|
||||||
|
"type": "stop",
|
||||||
|
"stopwords": "_bengali_" <1>
|
||||||
|
},
|
||||||
|
"bengali_keywords": {
|
||||||
|
"type": "keyword_marker",
|
||||||
|
"keywords": ["উদাহরণ"] <2>
|
||||||
|
},
|
||||||
|
"bengali_stemmer": {
|
||||||
|
"type": "stemmer",
|
||||||
|
"language": "bengali"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"analyzer": {
|
||||||
|
"bengali": {
|
||||||
|
"tokenizer": "standard",
|
||||||
|
"filter": [
|
||||||
|
"lowercase",
|
||||||
|
"indic_normalization",
|
||||||
|
"bengali_normalization",
|
||||||
|
"bengali_stop",
|
||||||
|
"bengali_keywords",
|
||||||
|
"bengali_stemmer"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
----------------------------------------------------
|
||||||
|
// CONSOLE
|
||||||
|
<1> The default stopwords can be overridden with the `stopwords`
|
||||||
|
or `stopwords_path` parameters.
|
||||||
|
<2> This filter should be removed unless there are words which should
|
||||||
|
be excluded from stemming.
|
||||||
|
|
||||||
[[brazilian-analyzer]]
|
[[brazilian-analyzer]]
|
||||||
===== `brazilian` analyzer
|
===== `brazilian` analyzer
|
||||||
|
|
||||||
|
|
|
@ -44,6 +44,10 @@ Basque::
|
||||||
|
|
||||||
http://snowball.tartarus.org/algorithms/basque/stemmer.html[*`basque`*]
|
http://snowball.tartarus.org/algorithms/basque/stemmer.html[*`basque`*]
|
||||||
|
|
||||||
|
Bengali::
|
||||||
|
http://www.tandfonline.com/doi/abs/10.1080/02564602.1993.11437284[*`bengali`*]
|
||||||
|
http://members.unine.ch/jacques.savoy/clef/BengaliStemmerLight.java.txt[*`light_bengali`*]
|
||||||
|
|
||||||
Brazilian Portuguese::
|
Brazilian Portuguese::
|
||||||
|
|
||||||
http://lucene.apache.org/core/4_9_0/analyzers-common/org/apache/lucene/analysis/br/BrazilianStemmer.html[*`brazilian`*]
|
http://lucene.apache.org/core/4_9_0/analyzers-common/org/apache/lucene/analysis/br/BrazilianStemmer.html[*`brazilian`*]
|
||||||
|
|
|
@ -71,7 +71,7 @@ PUT /my_index
|
||||||
|
|
||||||
Elasticsearch provides the following predefined list of languages:
|
Elasticsearch provides the following predefined list of languages:
|
||||||
|
|
||||||
`_arabic_`, `_armenian_`, `_basque_`, `_brazilian_`, `_bulgarian_`,
|
`_arabic_`, `_armenian_`, `_basque_`, `_bengali_`, `_brazilian_`, `_bulgarian_`,
|
||||||
`_catalan_`, `_czech_`, `_danish_`, `_dutch_`, `_english_`, `_finnish_`,
|
`_catalan_`, `_czech_`, `_danish_`, `_dutch_`, `_english_`, `_finnish_`,
|
||||||
`_french_`, `_galician_`, `_german_`, `_greek_`, `_hindi_`, `_hungarian_`,
|
`_french_`, `_galician_`, `_german_`, `_greek_`, `_hindi_`, `_hungarian_`,
|
||||||
`_indonesian_`, `_irish_`, `_italian_`, `_latvian_`, `_norwegian_`, `_persian_`,
|
`_indonesian_`, `_irish_`, `_italian_`, `_latvian_`, `_norwegian_`, `_persian_`,
|
||||||
|
|
|
@ -0,0 +1,47 @@
|
||||||
|
/*
|
||||||
|
* Licensed to Elasticsearch under one or more contributor
|
||||||
|
* license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright
|
||||||
|
* ownership. Elasticsearch licenses this file to you under
|
||||||
|
* the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
* not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing,
|
||||||
|
* software distributed under the License is distributed on an
|
||||||
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
* KIND, either express or implied. See the License for the
|
||||||
|
* specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*/
|
||||||
|
package org.elasticsearch.analysis.common;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.bn.BengaliNormalizationFilter;
|
||||||
|
import org.elasticsearch.common.settings.Settings;
|
||||||
|
import org.elasticsearch.env.Environment;
|
||||||
|
import org.elasticsearch.index.IndexSettings;
|
||||||
|
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
|
||||||
|
import org.elasticsearch.index.analysis.MultiTermAwareComponent;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Factory for {@link BengaliNormalizationFilter}
|
||||||
|
*/
|
||||||
|
public class BengaliNormalizationFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {
|
||||||
|
|
||||||
|
BengaliNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
||||||
|
super(indexSettings, name, settings);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TokenStream create(TokenStream tokenStream) {
|
||||||
|
return new BengaliNormalizationFilter(tokenStream);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Object getMultiTermComponent() {
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
}
|
|
@ -25,6 +25,7 @@ import org.apache.lucene.analysis.StopFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
|
import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
|
||||||
import org.apache.lucene.analysis.ar.ArabicStemFilter;
|
import org.apache.lucene.analysis.ar.ArabicStemFilter;
|
||||||
|
import org.apache.lucene.analysis.bn.BengaliNormalizationFilter;
|
||||||
import org.apache.lucene.analysis.br.BrazilianStemFilter;
|
import org.apache.lucene.analysis.br.BrazilianStemFilter;
|
||||||
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
|
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
|
||||||
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
|
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
|
||||||
|
@ -94,6 +95,7 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
|
||||||
filters.put("arabic_normalization", ArabicNormalizationFilterFactory::new);
|
filters.put("arabic_normalization", ArabicNormalizationFilterFactory::new);
|
||||||
filters.put("arabic_stem", ArabicStemTokenFilterFactory::new);
|
filters.put("arabic_stem", ArabicStemTokenFilterFactory::new);
|
||||||
filters.put("asciifolding", ASCIIFoldingTokenFilterFactory::new);
|
filters.put("asciifolding", ASCIIFoldingTokenFilterFactory::new);
|
||||||
|
filters.put("bengali_normalization", BengaliNormalizationFilterFactory::new);
|
||||||
filters.put("brazilian_stem", BrazilianStemTokenFilterFactory::new);
|
filters.put("brazilian_stem", BrazilianStemTokenFilterFactory::new);
|
||||||
filters.put("cjk_bigram", CJKBigramFilterFactory::new);
|
filters.put("cjk_bigram", CJKBigramFilterFactory::new);
|
||||||
filters.put("cjk_width", CJKWidthFilterFactory::new);
|
filters.put("cjk_width", CJKWidthFilterFactory::new);
|
||||||
|
@ -180,6 +182,7 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
|
||||||
filters.add(PreConfiguredTokenFilter.singleton("arabic_normalization", true, ArabicNormalizationFilter::new));
|
filters.add(PreConfiguredTokenFilter.singleton("arabic_normalization", true, ArabicNormalizationFilter::new));
|
||||||
filters.add(PreConfiguredTokenFilter.singleton("arabic_stem", false, ArabicStemFilter::new));
|
filters.add(PreConfiguredTokenFilter.singleton("arabic_stem", false, ArabicStemFilter::new));
|
||||||
filters.add(PreConfiguredTokenFilter.singleton("asciifolding", true, ASCIIFoldingFilter::new));
|
filters.add(PreConfiguredTokenFilter.singleton("asciifolding", true, ASCIIFoldingFilter::new));
|
||||||
|
filters.add(PreConfiguredTokenFilter.singleton("bengali_normalization", true, BengaliNormalizationFilter::new));
|
||||||
filters.add(PreConfiguredTokenFilter.singleton("brazilian_stem", false, BrazilianStemFilter::new));
|
filters.add(PreConfiguredTokenFilter.singleton("brazilian_stem", false, BrazilianStemFilter::new));
|
||||||
filters.add(PreConfiguredTokenFilter.singleton("cjk_bigram", false, CJKBigramFilter::new));
|
filters.add(PreConfiguredTokenFilter.singleton("cjk_bigram", false, CJKBigramFilter::new));
|
||||||
filters.add(PreConfiguredTokenFilter.singleton("cjk_width", true, CJKWidthFilter::new));
|
filters.add(PreConfiguredTokenFilter.singleton("cjk_width", true, CJKWidthFilter::new));
|
||||||
|
|
|
@ -22,6 +22,7 @@ package org.elasticsearch.analysis.common;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.ar.ArabicStemFilter;
|
import org.apache.lucene.analysis.ar.ArabicStemFilter;
|
||||||
import org.apache.lucene.analysis.bg.BulgarianStemFilter;
|
import org.apache.lucene.analysis.bg.BulgarianStemFilter;
|
||||||
|
import org.apache.lucene.analysis.bn.BengaliStemFilter;
|
||||||
import org.apache.lucene.analysis.br.BrazilianStemFilter;
|
import org.apache.lucene.analysis.br.BrazilianStemFilter;
|
||||||
import org.apache.lucene.analysis.ckb.SoraniStemFilter;
|
import org.apache.lucene.analysis.ckb.SoraniStemFilter;
|
||||||
import org.apache.lucene.analysis.cz.CzechStemFilter;
|
import org.apache.lucene.analysis.cz.CzechStemFilter;
|
||||||
|
@ -102,6 +103,8 @@ public class StemmerTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||||
return new SnowballFilter(tokenStream, new ArmenianStemmer());
|
return new SnowballFilter(tokenStream, new ArmenianStemmer());
|
||||||
} else if ("basque".equalsIgnoreCase(language)) {
|
} else if ("basque".equalsIgnoreCase(language)) {
|
||||||
return new SnowballFilter(tokenStream, new BasqueStemmer());
|
return new SnowballFilter(tokenStream, new BasqueStemmer());
|
||||||
|
} else if ("bengali".equalsIgnoreCase(language)) {
|
||||||
|
return new BengaliStemFilter(tokenStream);
|
||||||
} else if ("brazilian".equalsIgnoreCase(language)) {
|
} else if ("brazilian".equalsIgnoreCase(language)) {
|
||||||
return new BrazilianStemFilter(tokenStream);
|
return new BrazilianStemFilter(tokenStream);
|
||||||
} else if ("bulgarian".equalsIgnoreCase(language)) {
|
} else if ("bulgarian".equalsIgnoreCase(language)) {
|
||||||
|
|
|
@ -67,6 +67,7 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
|
||||||
filters.put("uppercase", UpperCaseTokenFilterFactory.class);
|
filters.put("uppercase", UpperCaseTokenFilterFactory.class);
|
||||||
filters.put("ngram", NGramTokenFilterFactory.class);
|
filters.put("ngram", NGramTokenFilterFactory.class);
|
||||||
filters.put("edgengram", EdgeNGramTokenFilterFactory.class);
|
filters.put("edgengram", EdgeNGramTokenFilterFactory.class);
|
||||||
|
filters.put("bengalistem", StemmerTokenFilterFactory.class);
|
||||||
filters.put("bulgarianstem", StemmerTokenFilterFactory.class);
|
filters.put("bulgarianstem", StemmerTokenFilterFactory.class);
|
||||||
filters.put("englishminimalstem", StemmerTokenFilterFactory.class);
|
filters.put("englishminimalstem", StemmerTokenFilterFactory.class);
|
||||||
filters.put("englishpossessive", StemmerTokenFilterFactory.class);
|
filters.put("englishpossessive", StemmerTokenFilterFactory.class);
|
||||||
|
@ -106,6 +107,7 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
|
||||||
filters.put("patternreplace", PatternReplaceTokenFilterFactory.class);
|
filters.put("patternreplace", PatternReplaceTokenFilterFactory.class);
|
||||||
filters.put("patterncapturegroup", PatternCaptureGroupTokenFilterFactory.class);
|
filters.put("patterncapturegroup", PatternCaptureGroupTokenFilterFactory.class);
|
||||||
filters.put("arabicnormalization", ArabicNormalizationFilterFactory.class);
|
filters.put("arabicnormalization", ArabicNormalizationFilterFactory.class);
|
||||||
|
filters.put("bengalinormalization", BengaliNormalizationFilterFactory.class);
|
||||||
filters.put("germannormalization", GermanNormalizationFilterFactory.class);
|
filters.put("germannormalization", GermanNormalizationFilterFactory.class);
|
||||||
filters.put("hindinormalization", HindiNormalizationFilterFactory.class);
|
filters.put("hindinormalization", HindiNormalizationFilterFactory.class);
|
||||||
filters.put("indicnormalization", IndicNormalizationFilterFactory.class);
|
filters.put("indicnormalization", IndicNormalizationFilterFactory.class);
|
||||||
|
@ -159,6 +161,7 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
|
||||||
filters.put("arabic_normalization", null);
|
filters.put("arabic_normalization", null);
|
||||||
filters.put("arabic_stem", null);
|
filters.put("arabic_stem", null);
|
||||||
filters.put("asciifolding", null);
|
filters.put("asciifolding", null);
|
||||||
|
filters.put("bengali_normalization", null);
|
||||||
filters.put("brazilian_stem", null);
|
filters.put("brazilian_stem", null);
|
||||||
filters.put("cjk_bigram", null);
|
filters.put("cjk_bigram", null);
|
||||||
filters.put("cjk_width", null);
|
filters.put("cjk_width", null);
|
||||||
|
|
|
@ -695,6 +695,37 @@
|
||||||
- length: { tokens: 1 }
|
- length: { tokens: 1 }
|
||||||
- match: { tokens.0.token: اجن }
|
- match: { tokens.0.token: اجن }
|
||||||
|
|
||||||
|
---
|
||||||
|
"bengali_normalization":
|
||||||
|
- do:
|
||||||
|
indices.create:
|
||||||
|
index: test
|
||||||
|
body:
|
||||||
|
settings:
|
||||||
|
analysis:
|
||||||
|
filter:
|
||||||
|
my_bengali_normalization:
|
||||||
|
type: bengali_normalization
|
||||||
|
- do:
|
||||||
|
indices.analyze:
|
||||||
|
index: test
|
||||||
|
body:
|
||||||
|
text: চাঁদ
|
||||||
|
tokenizer: keyword
|
||||||
|
filter: [my_bengali_normalization]
|
||||||
|
- length: { tokens: 1 }
|
||||||
|
- match: { tokens.0.token: চাদ }
|
||||||
|
|
||||||
|
# Test pre-configured token filter too:
|
||||||
|
- do:
|
||||||
|
indices.analyze:
|
||||||
|
body:
|
||||||
|
text: চাঁদ
|
||||||
|
tokenizer: keyword
|
||||||
|
filter: [bengali_normalization]
|
||||||
|
- length: { tokens: 1 }
|
||||||
|
- match: { tokens.0.token: চাদ }
|
||||||
|
|
||||||
---
|
---
|
||||||
"german_normalization":
|
"german_normalization":
|
||||||
- do:
|
- do:
|
||||||
|
|
|
@ -54,13 +54,14 @@ The KStem stemmer in
|
||||||
was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
|
was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
|
||||||
under the BSD-license.
|
under the BSD-license.
|
||||||
|
|
||||||
The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default
|
The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default
|
||||||
stopword list that is BSD-licensed created by Jacques Savoy. These files reside in:
|
stopword list that is BSD-licensed created by Jacques Savoy. These files reside in:
|
||||||
analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
|
analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
|
||||||
analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
|
analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
|
||||||
analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
|
analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
|
||||||
analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
|
analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
|
||||||
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt
|
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt,
|
||||||
|
analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
|
||||||
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||||
|
|
||||||
The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
|
The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
|
||||||
|
|
|
@ -54,13 +54,14 @@ The KStem stemmer in
|
||||||
was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
|
was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
|
||||||
under the BSD-license.
|
under the BSD-license.
|
||||||
|
|
||||||
The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default
|
The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default
|
||||||
stopword list that is BSD-licensed created by Jacques Savoy. These files reside in:
|
stopword list that is BSD-licensed created by Jacques Savoy. These files reside in:
|
||||||
analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
|
analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
|
||||||
analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
|
analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
|
||||||
analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
|
analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
|
||||||
analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
|
analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
|
||||||
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt
|
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt,
|
||||||
|
analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
|
||||||
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||||
|
|
||||||
The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
|
The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
|
||||||
|
|
|
@ -54,13 +54,14 @@ The KStem stemmer in
|
||||||
was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
|
was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
|
||||||
under the BSD-license.
|
under the BSD-license.
|
||||||
|
|
||||||
The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default
|
The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default
|
||||||
stopword list that is BSD-licensed created by Jacques Savoy. These files reside in:
|
stopword list that is BSD-licensed created by Jacques Savoy. These files reside in:
|
||||||
analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
|
analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
|
||||||
analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
|
analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
|
||||||
analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
|
analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
|
||||||
analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
|
analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
|
||||||
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt
|
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt,
|
||||||
|
analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
|
||||||
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||||
|
|
||||||
The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
|
The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
|
||||||
|
|
|
@ -54,13 +54,14 @@ The KStem stemmer in
|
||||||
was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
|
was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
|
||||||
under the BSD-license.
|
under the BSD-license.
|
||||||
|
|
||||||
The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default
|
The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default
|
||||||
stopword list that is BSD-licensed created by Jacques Savoy. These files reside in:
|
stopword list that is BSD-licensed created by Jacques Savoy. These files reside in:
|
||||||
analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
|
analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
|
||||||
analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
|
analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
|
||||||
analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
|
analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
|
||||||
analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
|
analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
|
||||||
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt
|
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt,
|
||||||
|
analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
|
||||||
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||||
|
|
||||||
The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
|
The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
|
||||||
|
|
|
@ -54,13 +54,14 @@ The KStem stemmer in
|
||||||
was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
|
was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
|
||||||
under the BSD-license.
|
under the BSD-license.
|
||||||
|
|
||||||
The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default
|
The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default
|
||||||
stopword list that is BSD-licensed created by Jacques Savoy. These files reside in:
|
stopword list that is BSD-licensed created by Jacques Savoy. These files reside in:
|
||||||
analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
|
analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
|
||||||
analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
|
analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
|
||||||
analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
|
analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
|
||||||
analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
|
analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
|
||||||
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt
|
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt,
|
||||||
|
analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
|
||||||
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||||
|
|
||||||
The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
|
The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
|
||||||
|
|
|
@ -54,13 +54,14 @@ The KStem stemmer in
|
||||||
was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
|
was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
|
||||||
under the BSD-license.
|
under the BSD-license.
|
||||||
|
|
||||||
The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default
|
The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default
|
||||||
stopword list that is BSD-licensed created by Jacques Savoy. These files reside in:
|
stopword list that is BSD-licensed created by Jacques Savoy. These files reside in:
|
||||||
analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
|
analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
|
||||||
analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
|
analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
|
||||||
analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
|
analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
|
||||||
analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
|
analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
|
||||||
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt
|
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt,
|
||||||
|
analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
|
||||||
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||||
|
|
||||||
The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
|
The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
|
||||||
|
|
|
@ -54,13 +54,14 @@ The KStem stemmer in
|
||||||
was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
|
was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
|
||||||
under the BSD-license.
|
under the BSD-license.
|
||||||
|
|
||||||
The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default
|
The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default
|
||||||
stopword list that is BSD-licensed created by Jacques Savoy. These files reside in:
|
stopword list that is BSD-licensed created by Jacques Savoy. These files reside in:
|
||||||
analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
|
analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
|
||||||
analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
|
analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
|
||||||
analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
|
analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
|
||||||
analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
|
analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
|
||||||
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt
|
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt,
|
||||||
|
analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
|
||||||
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||||
|
|
||||||
The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
|
The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
|
||||||
|
|
|
@ -112,6 +112,8 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
|
||||||
.put("arabicnormalization", MovedToAnalysisCommon.class)
|
.put("arabicnormalization", MovedToAnalysisCommon.class)
|
||||||
.put("arabicstem", MovedToAnalysisCommon.class)
|
.put("arabicstem", MovedToAnalysisCommon.class)
|
||||||
.put("asciifolding", MovedToAnalysisCommon.class)
|
.put("asciifolding", MovedToAnalysisCommon.class)
|
||||||
|
.put("bengalinormalization", MovedToAnalysisCommon.class)
|
||||||
|
.put("bengalistem", MovedToAnalysisCommon.class)
|
||||||
.put("brazilianstem", MovedToAnalysisCommon.class)
|
.put("brazilianstem", MovedToAnalysisCommon.class)
|
||||||
.put("bulgarianstem", MovedToAnalysisCommon.class)
|
.put("bulgarianstem", MovedToAnalysisCommon.class)
|
||||||
.put("cjkbigram", MovedToAnalysisCommon.class)
|
.put("cjkbigram", MovedToAnalysisCommon.class)
|
||||||
|
|
Loading…
Reference in New Issue