Added Bengali Analyzer to Elasticsearch with respect to the Lucene update (PR#238)

This commit is contained in:
Md. Abdulla-Al-Sun 2017-09-07 04:48:58 +06:00 committed by Martijn van Groningen
parent a978ddf37b
commit a40c474e10
No known key found for this signature in database
GPG Key ID: AB236F4FCF2AF12A
21 changed files with 227 additions and 18 deletions

View File

@ -54,13 +54,14 @@ The KStem stemmer in
was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst) was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
under the BSD-license. under the BSD-license.
The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default
stopword list that is BSD-licensed created by Jacques Savoy. These files reside in: stopword list that is BSD-licensed created by Jacques Savoy. These files reside in:
analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt, analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt, analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt, analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt, analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
See http://members.unine.ch/jacques.savoy/clef/index.html. See http://members.unine.ch/jacques.savoy/clef/index.html.
The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers

View File

@ -23,6 +23,7 @@ import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.ar.ArabicAnalyzer; import org.apache.lucene.analysis.ar.ArabicAnalyzer;
import org.apache.lucene.analysis.bg.BulgarianAnalyzer; import org.apache.lucene.analysis.bg.BulgarianAnalyzer;
import org.apache.lucene.analysis.bn.BengaliAnalyzer;
import org.apache.lucene.analysis.br.BrazilianAnalyzer; import org.apache.lucene.analysis.br.BrazilianAnalyzer;
import org.apache.lucene.analysis.ca.CatalanAnalyzer; import org.apache.lucene.analysis.ca.CatalanAnalyzer;
import org.apache.lucene.analysis.ckb.SoraniAnalyzer; import org.apache.lucene.analysis.ckb.SoraniAnalyzer;
@ -119,6 +120,7 @@ public class Analysis {
namedStopWords.put("_arabic_", ArabicAnalyzer.getDefaultStopSet()); namedStopWords.put("_arabic_", ArabicAnalyzer.getDefaultStopSet());
namedStopWords.put("_armenian_", ArmenianAnalyzer.getDefaultStopSet()); namedStopWords.put("_armenian_", ArmenianAnalyzer.getDefaultStopSet());
namedStopWords.put("_basque_", BasqueAnalyzer.getDefaultStopSet()); namedStopWords.put("_basque_", BasqueAnalyzer.getDefaultStopSet());
namedStopWords.put("_bengali_", BengaliAnalyzer.getDefaultStopSet());
namedStopWords.put("_brazilian_", BrazilianAnalyzer.getDefaultStopSet()); namedStopWords.put("_brazilian_", BrazilianAnalyzer.getDefaultStopSet());
namedStopWords.put("_bulgarian_", BulgarianAnalyzer.getDefaultStopSet()); namedStopWords.put("_bulgarian_", BulgarianAnalyzer.getDefaultStopSet());
namedStopWords.put("_catalan_", CatalanAnalyzer.getDefaultStopSet()); namedStopWords.put("_catalan_", CatalanAnalyzer.getDefaultStopSet());

View File

@ -0,0 +1,45 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.bn.BengaliAnalyzer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
/**
 * Analyzer provider that builds one {@link BengaliAnalyzer} at construction time
 * and hands out that same instance from {@link #get()}.
 */
public class BengaliAnalyzerProvider extends AbstractIndexAnalyzerProvider<BengaliAnalyzer> {

    private final BengaliAnalyzer analyzer;

    /**
     * Creates the analyzer from the given settings.
     *
     * @param indexSettings passed through to the superclass constructor
     * @param env           environment handed to {@code Analysis.parseStopWords}
     * @param name          passed through to the superclass constructor
     * @param settings      analyzer settings from which stop words and the stem exclusion set are parsed
     */
    public BengaliAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
        super(indexSettings, name, settings);
        // Stop words fall back to BengaliAnalyzer's default set; stem exclusions fall back to the empty set.
        final CharArraySet stopWords = Analysis.parseStopWords(env, settings, BengaliAnalyzer.getDefaultStopSet());
        final CharArraySet stemExclusions = Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET);
        this.analyzer = new BengaliAnalyzer(stopWords, stemExclusions);
        // `version` is not declared in this class; it is inherited from the superclass hierarchy.
        this.analyzer.setVersion(version);
    }

    /**
     * @return the analyzer instance created in the constructor
     */
    @Override
    public BengaliAnalyzer get() {
        return analyzer;
    }
}

View File

@ -32,6 +32,7 @@ import org.elasticsearch.index.analysis.AnalyzerProvider;
import org.elasticsearch.index.analysis.ArabicAnalyzerProvider; import org.elasticsearch.index.analysis.ArabicAnalyzerProvider;
import org.elasticsearch.index.analysis.ArmenianAnalyzerProvider; import org.elasticsearch.index.analysis.ArmenianAnalyzerProvider;
import org.elasticsearch.index.analysis.BasqueAnalyzerProvider; import org.elasticsearch.index.analysis.BasqueAnalyzerProvider;
import org.elasticsearch.index.analysis.BengaliAnalyzerProvider;
import org.elasticsearch.index.analysis.BrazilianAnalyzerProvider; import org.elasticsearch.index.analysis.BrazilianAnalyzerProvider;
import org.elasticsearch.index.analysis.BulgarianAnalyzerProvider; import org.elasticsearch.index.analysis.BulgarianAnalyzerProvider;
import org.elasticsearch.index.analysis.CatalanAnalyzerProvider; import org.elasticsearch.index.analysis.CatalanAnalyzerProvider;
@ -270,6 +271,7 @@ public final class AnalysisModule {
analyzers.register("arabic", ArabicAnalyzerProvider::new); analyzers.register("arabic", ArabicAnalyzerProvider::new);
analyzers.register("armenian", ArmenianAnalyzerProvider::new); analyzers.register("armenian", ArmenianAnalyzerProvider::new);
analyzers.register("basque", BasqueAnalyzerProvider::new); analyzers.register("basque", BasqueAnalyzerProvider::new);
analyzers.register("bengali", BengaliAnalyzerProvider::new);
analyzers.register("brazilian", BrazilianAnalyzerProvider::new); analyzers.register("brazilian", BrazilianAnalyzerProvider::new);
analyzers.register("bulgarian", BulgarianAnalyzerProvider::new); analyzers.register("bulgarian", BulgarianAnalyzerProvider::new);
analyzers.register("catalan", CatalanAnalyzerProvider::new); analyzers.register("catalan", CatalanAnalyzerProvider::new);

View File

@ -22,6 +22,7 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.ar.ArabicAnalyzer; import org.apache.lucene.analysis.ar.ArabicAnalyzer;
import org.apache.lucene.analysis.bg.BulgarianAnalyzer; import org.apache.lucene.analysis.bg.BulgarianAnalyzer;
import org.apache.lucene.analysis.bn.BengaliAnalyzer;
import org.apache.lucene.analysis.br.BrazilianAnalyzer; import org.apache.lucene.analysis.br.BrazilianAnalyzer;
import org.apache.lucene.analysis.ca.CatalanAnalyzer; import org.apache.lucene.analysis.ca.CatalanAnalyzer;
import org.apache.lucene.analysis.cjk.CJKAnalyzer; import org.apache.lucene.analysis.cjk.CJKAnalyzer;
@ -183,6 +184,15 @@ public enum PreBuiltAnalyzers {
} }
}, },
BENGALI {
    /**
     * Builds a {@link BengaliAnalyzer} with its no-argument constructor and
     * sets its version to {@code version.luceneVersion}.
     */
    @Override
    protected Analyzer create(Version version) {
        final Analyzer bengali = new BengaliAnalyzer();
        bengali.setVersion(version.luceneVersion);
        return bengali;
    }
},
BRAZILIAN { BRAZILIAN {
@Override @Override
protected Analyzer create(Version version) { protected Analyzer create(Version version) {

View File

@ -6,6 +6,7 @@ following types are supported:
<<arabic-analyzer,`arabic`>>, <<arabic-analyzer,`arabic`>>,
<<armenian-analyzer,`armenian`>>, <<armenian-analyzer,`armenian`>>,
<<basque-analyzer,`basque`>>, <<basque-analyzer,`basque`>>,
<<bengali-analyzer,`bengali`>>,
<<brazilian-analyzer,`brazilian`>>, <<brazilian-analyzer,`brazilian`>>,
<<bulgarian-analyzer,`bulgarian`>>, <<bulgarian-analyzer,`bulgarian`>>,
<<catalan-analyzer,`catalan`>>, <<catalan-analyzer,`catalan`>>,
@ -55,7 +56,7 @@ functionality is implemented by adding the
with the `keywords` set to the value of the `stem_exclusion` parameter. with the `keywords` set to the value of the `stem_exclusion` parameter.
The following analyzers support setting custom `stem_exclusion` list: The following analyzers support setting custom `stem_exclusion` list:
`arabic`, `armenian`, `basque`, `bulgarian`, `catalan`, `czech`, `arabic`, `armenian`, `basque`, `bengali`, `bulgarian`, `catalan`, `czech`,
`dutch`, `english`, `finnish`, `french`, `galician`, `dutch`, `english`, `finnish`, `french`, `galician`,
`german`, `hindi`, `hungarian`, `indonesian`, `irish`, `italian`, `latvian`, `german`, `hindi`, `hungarian`, `indonesian`, `irish`, `italian`, `latvian`,
`lithuanian`, `norwegian`, `portuguese`, `romanian`, `russian`, `sorani`, `lithuanian`, `norwegian`, `portuguese`, `romanian`, `russian`, `sorani`,
@ -209,6 +210,54 @@ PUT /armenian_example
<2> This filter should be removed unless there are words which should <2> This filter should be removed unless there are words which should
be excluded from stemming. be excluded from stemming.
[[bengali-analyzer]]
===== `bengali` analyzer
The `bengali` analyzer could be reimplemented as a `custom` analyzer as follows:
[source,js]
----------------------------------------------------
PUT /bengali_example
{
"settings": {
"analysis": {
"filter": {
"bengali_stop": {
"type": "stop",
"stopwords": "_bengali_" <1>
},
"bengali_keywords": {
"type": "keyword_marker",
"keywords": ["উদাহরণ"] <2>
},
"bengali_stemmer": {
"type": "stemmer",
"language": "bengali"
}
},
"analyzer": {
"bengali": {
"tokenizer": "standard",
"filter": [
"lowercase",
"indic_normalization",
"bengali_normalization",
"bengali_stop",
"bengali_keywords",
"bengali_stemmer"
]
}
}
}
}
}
----------------------------------------------------
// CONSOLE
<1> The default stopwords can be overridden with the `stopwords`
or `stopwords_path` parameters.
<2> This filter should be removed unless there are words which should
be excluded from stemming.
[[brazilian-analyzer]] [[brazilian-analyzer]]
===== `brazilian` analyzer ===== `brazilian` analyzer

View File

@ -44,6 +44,10 @@ Basque::
http://snowball.tartarus.org/algorithms/basque/stemmer.html[*`basque`*] http://snowball.tartarus.org/algorithms/basque/stemmer.html[*`basque`*]
Bengali::
http://www.tandfonline.com/doi/abs/10.1080/02564602.1993.11437284[*`bengali`*],
http://members.unine.ch/jacques.savoy/clef/BengaliStemmerLight.java.txt[*`light_bengali`*]
Brazilian Portuguese:: Brazilian Portuguese::
http://lucene.apache.org/core/4_9_0/analyzers-common/org/apache/lucene/analysis/br/BrazilianStemmer.html[*`brazilian`*] http://lucene.apache.org/core/4_9_0/analyzers-common/org/apache/lucene/analysis/br/BrazilianStemmer.html[*`brazilian`*]

View File

@ -71,7 +71,7 @@ PUT /my_index
Elasticsearch provides the following predefined list of languages: Elasticsearch provides the following predefined list of languages:
`_arabic_`, `_armenian_`, `_basque_`, `_brazilian_`, `_bulgarian_`, `_arabic_`, `_armenian_`, `_basque_`, `_bengali_`, `_brazilian_`, `_bulgarian_`,
`_catalan_`, `_czech_`, `_danish_`, `_dutch_`, `_english_`, `_finnish_`, `_catalan_`, `_czech_`, `_danish_`, `_dutch_`, `_english_`, `_finnish_`,
`_french_`, `_galician_`, `_german_`, `_greek_`, `_hindi_`, `_hungarian_`, `_french_`, `_galician_`, `_german_`, `_greek_`, `_hindi_`, `_hungarian_`,
`_indonesian_`, `_irish_`, `_italian_`, `_latvian_`, `_norwegian_`, `_persian_`, `_indonesian_`, `_irish_`, `_italian_`, `_latvian_`, `_norwegian_`, `_persian_`,

View File

@ -0,0 +1,47 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.bn.BengaliNormalizationFilter;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.MultiTermAwareComponent;
/**
 * Token filter factory that wraps a stream in a {@link BengaliNormalizationFilter}.
 * It also implements {@link MultiTermAwareComponent} and reports itself as the
 * multi-term component.
 */
public class BengaliNormalizationFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {

    /**
     * Package-private constructor; forwards the index settings, name and settings to the superclass.
     * The {@code env} argument is accepted but not used here.
     */
    BengaliNormalizationFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
        super(indexSettings, name, settings);
    }

    /**
     * @param input the stream to wrap
     * @return a new {@link BengaliNormalizationFilter} reading from {@code input}
     */
    @Override
    public TokenStream create(TokenStream input) {
        final TokenStream normalized = new BengaliNormalizationFilter(input);
        return normalized;
    }

    /**
     * @return this factory itself
     */
    @Override
    public Object getMultiTermComponent() {
        return this;
    }
}

View File

@ -25,6 +25,7 @@ import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ar.ArabicNormalizationFilter; import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
import org.apache.lucene.analysis.ar.ArabicStemFilter; import org.apache.lucene.analysis.ar.ArabicStemFilter;
import org.apache.lucene.analysis.bn.BengaliNormalizationFilter;
import org.apache.lucene.analysis.br.BrazilianStemFilter; import org.apache.lucene.analysis.br.BrazilianStemFilter;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter; import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
import org.apache.lucene.analysis.cjk.CJKBigramFilter; import org.apache.lucene.analysis.cjk.CJKBigramFilter;
@ -94,6 +95,7 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
filters.put("arabic_normalization", ArabicNormalizationFilterFactory::new); filters.put("arabic_normalization", ArabicNormalizationFilterFactory::new);
filters.put("arabic_stem", ArabicStemTokenFilterFactory::new); filters.put("arabic_stem", ArabicStemTokenFilterFactory::new);
filters.put("asciifolding", ASCIIFoldingTokenFilterFactory::new); filters.put("asciifolding", ASCIIFoldingTokenFilterFactory::new);
filters.put("bengali_normalization", BengaliNormalizationFilterFactory::new);
filters.put("brazilian_stem", BrazilianStemTokenFilterFactory::new); filters.put("brazilian_stem", BrazilianStemTokenFilterFactory::new);
filters.put("cjk_bigram", CJKBigramFilterFactory::new); filters.put("cjk_bigram", CJKBigramFilterFactory::new);
filters.put("cjk_width", CJKWidthFilterFactory::new); filters.put("cjk_width", CJKWidthFilterFactory::new);
@ -180,6 +182,7 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
filters.add(PreConfiguredTokenFilter.singleton("arabic_normalization", true, ArabicNormalizationFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("arabic_normalization", true, ArabicNormalizationFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("arabic_stem", false, ArabicStemFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("arabic_stem", false, ArabicStemFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("asciifolding", true, ASCIIFoldingFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("asciifolding", true, ASCIIFoldingFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("bengali_normalization", true, BengaliNormalizationFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("brazilian_stem", false, BrazilianStemFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("brazilian_stem", false, BrazilianStemFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("cjk_bigram", false, CJKBigramFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("cjk_bigram", false, CJKBigramFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("cjk_width", true, CJKWidthFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("cjk_width", true, CJKWidthFilter::new));

View File

@ -22,6 +22,7 @@ package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ar.ArabicStemFilter; import org.apache.lucene.analysis.ar.ArabicStemFilter;
import org.apache.lucene.analysis.bg.BulgarianStemFilter; import org.apache.lucene.analysis.bg.BulgarianStemFilter;
import org.apache.lucene.analysis.bn.BengaliStemFilter;
import org.apache.lucene.analysis.br.BrazilianStemFilter; import org.apache.lucene.analysis.br.BrazilianStemFilter;
import org.apache.lucene.analysis.ckb.SoraniStemFilter; import org.apache.lucene.analysis.ckb.SoraniStemFilter;
import org.apache.lucene.analysis.cz.CzechStemFilter; import org.apache.lucene.analysis.cz.CzechStemFilter;
@ -102,6 +103,8 @@ public class StemmerTokenFilterFactory extends AbstractTokenFilterFactory {
return new SnowballFilter(tokenStream, new ArmenianStemmer()); return new SnowballFilter(tokenStream, new ArmenianStemmer());
} else if ("basque".equalsIgnoreCase(language)) { } else if ("basque".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new BasqueStemmer()); return new SnowballFilter(tokenStream, new BasqueStemmer());
} else if ("bengali".equalsIgnoreCase(language)) {
return new BengaliStemFilter(tokenStream);
} else if ("brazilian".equalsIgnoreCase(language)) { } else if ("brazilian".equalsIgnoreCase(language)) {
return new BrazilianStemFilter(tokenStream); return new BrazilianStemFilter(tokenStream);
} else if ("bulgarian".equalsIgnoreCase(language)) { } else if ("bulgarian".equalsIgnoreCase(language)) {

View File

@ -67,6 +67,7 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
filters.put("uppercase", UpperCaseTokenFilterFactory.class); filters.put("uppercase", UpperCaseTokenFilterFactory.class);
filters.put("ngram", NGramTokenFilterFactory.class); filters.put("ngram", NGramTokenFilterFactory.class);
filters.put("edgengram", EdgeNGramTokenFilterFactory.class); filters.put("edgengram", EdgeNGramTokenFilterFactory.class);
filters.put("bengalistem", StemmerTokenFilterFactory.class);
filters.put("bulgarianstem", StemmerTokenFilterFactory.class); filters.put("bulgarianstem", StemmerTokenFilterFactory.class);
filters.put("englishminimalstem", StemmerTokenFilterFactory.class); filters.put("englishminimalstem", StemmerTokenFilterFactory.class);
filters.put("englishpossessive", StemmerTokenFilterFactory.class); filters.put("englishpossessive", StemmerTokenFilterFactory.class);
@ -106,6 +107,7 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
filters.put("patternreplace", PatternReplaceTokenFilterFactory.class); filters.put("patternreplace", PatternReplaceTokenFilterFactory.class);
filters.put("patterncapturegroup", PatternCaptureGroupTokenFilterFactory.class); filters.put("patterncapturegroup", PatternCaptureGroupTokenFilterFactory.class);
filters.put("arabicnormalization", ArabicNormalizationFilterFactory.class); filters.put("arabicnormalization", ArabicNormalizationFilterFactory.class);
filters.put("bengalinormalization", BengaliNormalizationFilterFactory.class);
filters.put("germannormalization", GermanNormalizationFilterFactory.class); filters.put("germannormalization", GermanNormalizationFilterFactory.class);
filters.put("hindinormalization", HindiNormalizationFilterFactory.class); filters.put("hindinormalization", HindiNormalizationFilterFactory.class);
filters.put("indicnormalization", IndicNormalizationFilterFactory.class); filters.put("indicnormalization", IndicNormalizationFilterFactory.class);
@ -159,6 +161,7 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
filters.put("arabic_normalization", null); filters.put("arabic_normalization", null);
filters.put("arabic_stem", null); filters.put("arabic_stem", null);
filters.put("asciifolding", null); filters.put("asciifolding", null);
filters.put("bengali_normalization", null);
filters.put("brazilian_stem", null); filters.put("brazilian_stem", null);
filters.put("cjk_bigram", null); filters.put("cjk_bigram", null);
filters.put("cjk_width", null); filters.put("cjk_width", null);

View File

@ -695,6 +695,37 @@
- length: { tokens: 1 } - length: { tokens: 1 }
- match: { tokens.0.token: اجن } - match: { tokens.0.token: اجن }
---
"bengali_normalization":
# Step 1: create index "test" with a custom filter "my_bengali_normalization"
# whose type is bengali_normalization.
- do:
indices.create:
index: test
body:
settings:
analysis:
filter:
my_bengali_normalization:
type: bengali_normalization
# Step 2: analyze চাঁদ on that index with the keyword tokenizer and the custom
# filter; exactly one token, চাদ, is expected.
- do:
indices.analyze:
index: test
body:
text: চাঁদ
tokenizer: keyword
filter: [my_bengali_normalization]
- length: { tokens: 1 }
- match: { tokens.0.token: চাদ }
# Test pre-configured token filter too:
# (no index is given here and the filter is referenced by its built-in name;
# the same single token চাদ is expected)
- do:
indices.analyze:
body:
text: চাঁদ
tokenizer: keyword
filter: [bengali_normalization]
- length: { tokens: 1 }
- match: { tokens.0.token: চাদ }
--- ---
"german_normalization": "german_normalization":
- do: - do:

View File

@ -54,13 +54,14 @@ The KStem stemmer in
was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst) was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
under the BSD-license. under the BSD-license.
The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default
stopword list that is BSD-licensed created by Jacques Savoy. These files reside in: stopword list that is BSD-licensed created by Jacques Savoy. These files reside in:
analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt, analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt, analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt, analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt, analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
See http://members.unine.ch/jacques.savoy/clef/index.html. See http://members.unine.ch/jacques.savoy/clef/index.html.
The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers

View File

@ -54,13 +54,14 @@ The KStem stemmer in
was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst) was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
under the BSD-license. under the BSD-license.
The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default
stopword list that is BSD-licensed created by Jacques Savoy. These files reside in: stopword list that is BSD-licensed created by Jacques Savoy. These files reside in:
analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt, analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt, analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt, analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt, analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
See http://members.unine.ch/jacques.savoy/clef/index.html. See http://members.unine.ch/jacques.savoy/clef/index.html.
The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers

View File

@ -54,13 +54,14 @@ The KStem stemmer in
was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst) was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
under the BSD-license. under the BSD-license.
The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default
stopword list that is BSD-licensed created by Jacques Savoy. These files reside in: stopword list that is BSD-licensed created by Jacques Savoy. These files reside in:
analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt, analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt, analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt, analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt, analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
See http://members.unine.ch/jacques.savoy/clef/index.html. See http://members.unine.ch/jacques.savoy/clef/index.html.
The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers

View File

@ -54,13 +54,14 @@ The KStem stemmer in
was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst) was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
under the BSD-license. under the BSD-license.
The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default
stopword list that is BSD-licensed created by Jacques Savoy. These files reside in: stopword list that is BSD-licensed created by Jacques Savoy. These files reside in:
analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt, analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt, analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt, analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt, analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
See http://members.unine.ch/jacques.savoy/clef/index.html. See http://members.unine.ch/jacques.savoy/clef/index.html.
The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers

View File

@ -54,13 +54,14 @@ The KStem stemmer in
was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst) was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
under the BSD-license. under the BSD-license.
The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default
stopword list that is BSD-licensed created by Jacques Savoy. These files reside in: stopword list that is BSD-licensed created by Jacques Savoy. These files reside in:
analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt, analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt, analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt, analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt, analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
See http://members.unine.ch/jacques.savoy/clef/index.html. See http://members.unine.ch/jacques.savoy/clef/index.html.
The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers

View File

@ -54,13 +54,14 @@ The KStem stemmer in
was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst) was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
under the BSD-license. under the BSD-license.
The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default
stopword list that is BSD-licensed created by Jacques Savoy. These files reside in: stopword list that is BSD-licensed created by Jacques Savoy. These files reside in:
analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt, analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt, analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt, analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt, analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
See http://members.unine.ch/jacques.savoy/clef/index.html. See http://members.unine.ch/jacques.savoy/clef/index.html.
The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers

View File

@ -54,13 +54,14 @@ The KStem stemmer in
was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst) was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
under the BSD-license. under the BSD-license.
The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default
stopword list that is BSD-licensed created by Jacques Savoy. These files reside in: stopword list that is BSD-licensed created by Jacques Savoy. These files reside in:
analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt, analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt, analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt, analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt, analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
See http://members.unine.ch/jacques.savoy/clef/index.html. See http://members.unine.ch/jacques.savoy/clef/index.html.
The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers

View File

@ -112,6 +112,8 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
.put("arabicnormalization", MovedToAnalysisCommon.class) .put("arabicnormalization", MovedToAnalysisCommon.class)
.put("arabicstem", MovedToAnalysisCommon.class) .put("arabicstem", MovedToAnalysisCommon.class)
.put("asciifolding", MovedToAnalysisCommon.class) .put("asciifolding", MovedToAnalysisCommon.class)
.put("bengalinormalization", MovedToAnalysisCommon.class)
.put("bengalistem", MovedToAnalysisCommon.class)
.put("brazilianstem", MovedToAnalysisCommon.class) .put("brazilianstem", MovedToAnalysisCommon.class)
.put("bulgarianstem", MovedToAnalysisCommon.class) .put("bulgarianstem", MovedToAnalysisCommon.class)
.put("cjkbigram", MovedToAnalysisCommon.class) .put("cjkbigram", MovedToAnalysisCommon.class)