Added Bengali Analyzer to Elasticsearch with respect to the lucene update(PR#238)

2025-03-09 14:34:43 +00:00 · 2017-09-07 04:48:58 +06:00 · 2017-09-07 04:48:58 +06:00 · a40c474e10
commit a40c474e10
parent a978ddf37b
21 changed files with 227 additions and 18 deletions
--- a/core/licenses/lucene-NOTICE.txt
+++ b/core/licenses/lucene-NOTICE.txt
@ -54,13 +54,14 @@ The KStem stemmer in
 was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
 under the BSD-license.

-The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default
+The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default
 stopword list that is BSD-licensed created by Jacques Savoy.  These files reside in:
 analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
-analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt
+analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt,
+analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
 See http://members.unine.ch/jacques.savoy/clef/index.html.

 The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
--- a/core/src/main/java/org/elasticsearch/index/analysis/Analysis.java
+++ b/core/src/main/java/org/elasticsearch/index/analysis/Analysis.java
@ -23,6 +23,7 @@ import org.apache.logging.log4j.Logger;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.ar.ArabicAnalyzer;
 import org.apache.lucene.analysis.bg.BulgarianAnalyzer;
+import org.apache.lucene.analysis.bn.BengaliAnalyzer;
 import org.apache.lucene.analysis.br.BrazilianAnalyzer;
 import org.apache.lucene.analysis.ca.CatalanAnalyzer;
 import org.apache.lucene.analysis.ckb.SoraniAnalyzer;
@ -119,6 +120,7 @@ public class Analysis {
        namedStopWords.put("_arabic_", ArabicAnalyzer.getDefaultStopSet());
        namedStopWords.put("_armenian_", ArmenianAnalyzer.getDefaultStopSet());
        namedStopWords.put("_basque_", BasqueAnalyzer.getDefaultStopSet());
+        namedStopWords.put("_bengali_", BengaliAnalyzer.getDefaultStopSet());
        namedStopWords.put("_brazilian_", BrazilianAnalyzer.getDefaultStopSet());
        namedStopWords.put("_bulgarian_", BulgarianAnalyzer.getDefaultStopSet());
        namedStopWords.put("_catalan_", CatalanAnalyzer.getDefaultStopSet());
--- a/core/src/main/java/org/elasticsearch/index/analysis/BengaliAnalyzerProvider.java
+++ b/core/src/main/java/org/elasticsearch/index/analysis/BengaliAnalyzerProvider.java
@ -0,0 +1,45 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.bn.BengaliAnalyzer;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.IndexSettings;
+
+public class BengaliAnalyzerProvider extends AbstractIndexAnalyzerProvider<BengaliAnalyzer> {
+
+    private final BengaliAnalyzer analyzer;
+
+    public BengaliAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+        super(indexSettings, name, settings);
+        analyzer = new BengaliAnalyzer(
+            Analysis.parseStopWords(env, settings, BengaliAnalyzer.getDefaultStopSet()),
+            Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
+        );
+        analyzer.setVersion(version);
+    }
+
+    @Override
+    public BengaliAnalyzer get() {
+        return this.analyzer;
+    }
+}
--- a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java
+++ b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java
@ -32,6 +32,7 @@ import org.elasticsearch.index.analysis.AnalyzerProvider;
 import org.elasticsearch.index.analysis.ArabicAnalyzerProvider;
 import org.elasticsearch.index.analysis.ArmenianAnalyzerProvider;
 import org.elasticsearch.index.analysis.BasqueAnalyzerProvider;
+import org.elasticsearch.index.analysis.BengaliAnalyzerProvider;
 import org.elasticsearch.index.analysis.BrazilianAnalyzerProvider;
 import org.elasticsearch.index.analysis.BulgarianAnalyzerProvider;
 import org.elasticsearch.index.analysis.CatalanAnalyzerProvider;
@ -270,6 +271,7 @@ public final class AnalysisModule {
        analyzers.register("arabic", ArabicAnalyzerProvider::new);
        analyzers.register("armenian", ArmenianAnalyzerProvider::new);
        analyzers.register("basque", BasqueAnalyzerProvider::new);
+        analyzers.register("bengali", BengaliAnalyzerProvider::new);
        analyzers.register("brazilian", BrazilianAnalyzerProvider::new);
        analyzers.register("bulgarian", BulgarianAnalyzerProvider::new);
        analyzers.register("catalan", CatalanAnalyzerProvider::new);
--- a/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltAnalyzers.java
+++ b/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltAnalyzers.java
@ -22,6 +22,7 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.ar.ArabicAnalyzer;
 import org.apache.lucene.analysis.bg.BulgarianAnalyzer;
+import org.apache.lucene.analysis.bn.BengaliAnalyzer;
 import org.apache.lucene.analysis.br.BrazilianAnalyzer;
 import org.apache.lucene.analysis.ca.CatalanAnalyzer;
 import org.apache.lucene.analysis.cjk.CJKAnalyzer;
@ -183,6 +184,15 @@ public enum PreBuiltAnalyzers {
        }
    },

+    BENGALI {
+        @Override
+        protected Analyzer create(Version version) {
+            Analyzer a = new BengaliAnalyzer();
+            a.setVersion(version.luceneVersion);
+            return a;
+        }
+    },
+
    BRAZILIAN {
        @Override
        protected Analyzer create(Version version) {
--- a/docs/reference/analysis/analyzers/lang-analyzer.asciidoc
+++ b/docs/reference/analysis/analyzers/lang-analyzer.asciidoc
@ -6,6 +6,7 @@ following types are supported:
 <<arabic-analyzer,`arabic`>>,
 <<armenian-analyzer,`armenian`>>,
 <<basque-analyzer,`basque`>>,
+<<bengali-analyzer,`bengali`>>,
 <<brazilian-analyzer,`brazilian`>>,
 <<bulgarian-analyzer,`bulgarian`>>,
 <<catalan-analyzer,`catalan`>>,
@ -55,7 +56,7 @@ functionality is implemented by adding the
 with the `keywords` set to the value of the `stem_exclusion` parameter.

 The following analyzers support setting custom `stem_exclusion` list:
-`arabic`, `armenian`, `basque`, `bulgarian`, `catalan`, `czech`,
+`arabic`, `armenian`, `basque`, `bengali`, `bulgarian`, `catalan`, `czech`,
 `dutch`, `english`, `finnish`, `french`, `galician`,
 `german`, `hindi`, `hungarian`, `indonesian`, `irish`, `italian`, `latvian`,
 `lithuanian`, `norwegian`, `portuguese`, `romanian`, `russian`, `sorani`,
@ -209,6 +210,54 @@ PUT /armenian_example
 <2> This filter should be removed unless there are words which should
    be excluded from stemming.

+[[bengali-analyzer]]
+===== `bengali` analyzer
+
+The `bengali` analyzer could be reimplemented as a `custom` analyzer as follows:
+
+[source,js]
+----------------------------------------------------
+PUT /bengali_example
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "bengali_stop": {
+          "type":       "stop",
+          "stopwords":  "_bengali_" <1>
+        },
+        "bengali_keywords": {
+          "type":       "keyword_marker",
+          "keywords":   ["উদাহরণ"] <2>
+        },
+        "bengali_stemmer": {
+          "type":       "stemmer",
+          "language":   "bengali"
+        }
+      },
+      "analyzer": {
+        "bengali": {
+          "tokenizer":  "standard",
+          "filter": [
+            "lowercase",
+            "indic_normalization",
+            "bengali_normalization",
+            "bengali_stop",
+            "bengali_keywords",
+            "bengali_stemmer"
+          ]
+        }
+      }
+    }
+  }
+}
+----------------------------------------------------
+// CONSOLE
+<1> The default stopwords can be overridden with the `stopwords`
+    or `stopwords_path` parameters.
+<2> This filter should be removed unless there are words which should
+    be excluded from stemming.
+
 [[brazilian-analyzer]]
 ===== `brazilian` analyzer

--- a/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc
+++ b/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc
@ -44,6 +44,10 @@ Basque::

 http://snowball.tartarus.org/algorithms/basque/stemmer.html[*`basque`*]

+Bengali::
+http://www.tandfonline.com/doi/abs/10.1080/02564602.1993.11437284[*`bengali`*]
+http://members.unine.ch/jacques.savoy/clef/BengaliStemmerLight.java.txt[*`light_bengali`*]
+
 Brazilian Portuguese::

 http://lucene.apache.org/core/4_9_0/analyzers-common/org/apache/lucene/analysis/br/BrazilianStemmer.html[*`brazilian`*]
--- a/docs/reference/analysis/tokenfilters/stop-tokenfilter.asciidoc
+++ b/docs/reference/analysis/tokenfilters/stop-tokenfilter.asciidoc
@ -71,7 +71,7 @@ PUT /my_index

 Elasticsearch provides the following predefined list of languages:

-`_arabic_`, `_armenian_`, `_basque_`, `_brazilian_`, `_bulgarian_`,
+`_arabic_`, `_armenian_`, `_basque_`, `_bengali_`, `_brazilian_`, `_bulgarian_`,
 `_catalan_`, `_czech_`, `_danish_`, `_dutch_`, `_english_`, `_finnish_`,
 `_french_`, `_galician_`, `_german_`, `_greek_`, `_hindi_`, `_hungarian_`,
 `_indonesian_`, `_irish_`, `_italian_`, `_latvian_`, `_norwegian_`, `_persian_`,
--- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/BengaliNormalizationFilterFactory.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/BengaliNormalizationFilterFactory.java
@ -0,0 +1,47 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.elasticsearch.analysis.common;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.bn.BengaliNormalizationFilter;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
+import org.elasticsearch.index.analysis.MultiTermAwareComponent;
+
+/**
+ * Factory for {@link BengaliNormalizationFilter}
+ */
+public class BengaliNormalizationFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {
+
+    BengaliNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+        super(indexSettings, name, settings);
+    }
+
+    @Override
+    public TokenStream create(TokenStream tokenStream) {
+        return new BengaliNormalizationFilter(tokenStream);
+    }
+
+    @Override
+    public Object getMultiTermComponent() {
+        return this;
+    }
+}
--- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
@ -25,6 +25,7 @@ import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
 import org.apache.lucene.analysis.ar.ArabicStemFilter;
+import org.apache.lucene.analysis.bn.BengaliNormalizationFilter;
 import org.apache.lucene.analysis.br.BrazilianStemFilter;
 import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
 import org.apache.lucene.analysis.cjk.CJKBigramFilter;
@ -94,6 +95,7 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
        filters.put("arabic_normalization", ArabicNormalizationFilterFactory::new);
        filters.put("arabic_stem", ArabicStemTokenFilterFactory::new);
        filters.put("asciifolding", ASCIIFoldingTokenFilterFactory::new);
+        filters.put("bengali_normalization", BengaliNormalizationFilterFactory::new);
        filters.put("brazilian_stem", BrazilianStemTokenFilterFactory::new);
        filters.put("cjk_bigram", CJKBigramFilterFactory::new);
        filters.put("cjk_width", CJKWidthFilterFactory::new);
@ -180,6 +182,7 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
        filters.add(PreConfiguredTokenFilter.singleton("arabic_normalization", true, ArabicNormalizationFilter::new));
        filters.add(PreConfiguredTokenFilter.singleton("arabic_stem", false, ArabicStemFilter::new));
        filters.add(PreConfiguredTokenFilter.singleton("asciifolding", true, ASCIIFoldingFilter::new));
+        filters.add(PreConfiguredTokenFilter.singleton("bengali_normalization", true, BengaliNormalizationFilter::new));
        filters.add(PreConfiguredTokenFilter.singleton("brazilian_stem", false, BrazilianStemFilter::new));
        filters.add(PreConfiguredTokenFilter.singleton("cjk_bigram", false, CJKBigramFilter::new));
        filters.add(PreConfiguredTokenFilter.singleton("cjk_width", true, CJKWidthFilter::new));
--- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactory.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactory.java
@ -22,6 +22,7 @@ package org.elasticsearch.analysis.common;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.ar.ArabicStemFilter;
 import org.apache.lucene.analysis.bg.BulgarianStemFilter;
+import org.apache.lucene.analysis.bn.BengaliStemFilter;
 import org.apache.lucene.analysis.br.BrazilianStemFilter;
 import org.apache.lucene.analysis.ckb.SoraniStemFilter;
 import org.apache.lucene.analysis.cz.CzechStemFilter;
@ -102,6 +103,8 @@ public class StemmerTokenFilterFactory extends AbstractTokenFilterFactory {
            return new SnowballFilter(tokenStream, new ArmenianStemmer());
        } else if ("basque".equalsIgnoreCase(language)) {
            return new SnowballFilter(tokenStream, new BasqueStemmer());
+        } else if ("bengali".equalsIgnoreCase(language)) {
+            return new BengaliStemFilter(tokenStream);
        } else if ("brazilian".equalsIgnoreCase(language)) {
            return new BrazilianStemFilter(tokenStream);
        } else if ("bulgarian".equalsIgnoreCase(language)) {
--- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java
+++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java
@ -67,6 +67,7 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
        filters.put("uppercase", UpperCaseTokenFilterFactory.class);
        filters.put("ngram", NGramTokenFilterFactory.class);
        filters.put("edgengram", EdgeNGramTokenFilterFactory.class);
+        filters.put("bengalistem", StemmerTokenFilterFactory.class);
        filters.put("bulgarianstem", StemmerTokenFilterFactory.class);
        filters.put("englishminimalstem", StemmerTokenFilterFactory.class);
        filters.put("englishpossessive", StemmerTokenFilterFactory.class);
@ -106,6 +107,7 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
        filters.put("patternreplace", PatternReplaceTokenFilterFactory.class);
        filters.put("patterncapturegroup", PatternCaptureGroupTokenFilterFactory.class);
        filters.put("arabicnormalization", ArabicNormalizationFilterFactory.class);
+        filters.put("bengalinormalization", BengaliNormalizationFilterFactory.class);
        filters.put("germannormalization", GermanNormalizationFilterFactory.class);
        filters.put("hindinormalization", HindiNormalizationFilterFactory.class);
        filters.put("indicnormalization", IndicNormalizationFilterFactory.class);
@ -159,6 +161,7 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
        filters.put("arabic_normalization", null);
        filters.put("arabic_stem", null);
        filters.put("asciifolding", null);
+        filters.put("bengali_normalization", null);
        filters.put("brazilian_stem", null);
        filters.put("cjk_bigram", null);
        filters.put("cjk_width", null);
--- a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/40_token_filters.yml
+++ b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/40_token_filters.yml
@ -695,6 +695,37 @@
    - length: { tokens: 1 }
    - match:  { tokens.0.token: اجن }

+---
+"bengali_normalization":
+    - do:
+        indices.create:
+          index: test
+          body:
+            settings:
+              analysis:
+                filter:
+                  my_bengali_normalization:
+                    type: bengali_normalization
+    - do:
+        indices.analyze:
+          index: test
+          body:
+            text:      চাঁদ
+            tokenizer: keyword
+            filter:    [my_bengali_normalization]
+    - length: { tokens: 1 }
+    - match:  { tokens.0.token: চাদ }
+
+    # Test pre-configured token filter too:
+    - do:
+        indices.analyze:
+          body:
+            text:      চাঁদ
+            tokenizer: keyword
+            filter:    [bengali_normalization]
+    - length: { tokens: 1 }
+    - match:  { tokens.0.token: চাদ }
+
 ---
 "german_normalization":
    - do:
--- a/modules/lang-expression/licenses/lucene-NOTICE.txt
+++ b/modules/lang-expression/licenses/lucene-NOTICE.txt
@ -54,13 +54,14 @@ The KStem stemmer in
 was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
 under the BSD-license.

-The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default
+The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default
 stopword list that is BSD-licensed created by Jacques Savoy.  These files reside in:
 analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
-analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt
+analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt,
+analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
 See http://members.unine.ch/jacques.savoy/clef/index.html.

 The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
--- a/plugins/analysis-icu/licenses/lucene-NOTICE.txt
+++ b/plugins/analysis-icu/licenses/lucene-NOTICE.txt
@ -54,13 +54,14 @@ The KStem stemmer in
 was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
 under the BSD-license.

-The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default
+The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default
 stopword list that is BSD-licensed created by Jacques Savoy.  These files reside in:
 analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
-analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt
+analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt,
+analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
 See http://members.unine.ch/jacques.savoy/clef/index.html.

 The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
--- a/plugins/analysis-kuromoji/licenses/lucene-NOTICE.txt
+++ b/plugins/analysis-kuromoji/licenses/lucene-NOTICE.txt
@ -54,13 +54,14 @@ The KStem stemmer in
 was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
 under the BSD-license.

-The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default
+The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default
 stopword list that is BSD-licensed created by Jacques Savoy.  These files reside in:
 analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
-analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt
+analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt,
+analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
 See http://members.unine.ch/jacques.savoy/clef/index.html.

 The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
--- a/plugins/analysis-phonetic/licenses/lucene-NOTICE.txt
+++ b/plugins/analysis-phonetic/licenses/lucene-NOTICE.txt
@ -54,13 +54,14 @@ The KStem stemmer in
 was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
 under the BSD-license.

-The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default
+The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default
 stopword list that is BSD-licensed created by Jacques Savoy.  These files reside in:
 analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
-analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt
+analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt,
+analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
 See http://members.unine.ch/jacques.savoy/clef/index.html.

 The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
--- a/plugins/analysis-smartcn/licenses/lucene-NOTICE.txt
+++ b/plugins/analysis-smartcn/licenses/lucene-NOTICE.txt
@ -54,13 +54,14 @@ The KStem stemmer in
 was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
 under the BSD-license.

-The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default
+The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default
 stopword list that is BSD-licensed created by Jacques Savoy.  These files reside in:
 analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
-analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt
+analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt,
+analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
 See http://members.unine.ch/jacques.savoy/clef/index.html.

 The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
--- a/plugins/analysis-stempel/licenses/lucene-NOTICE.txt
+++ b/plugins/analysis-stempel/licenses/lucene-NOTICE.txt
@ -54,13 +54,14 @@ The KStem stemmer in
 was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
 under the BSD-license.

-The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default
+The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default
 stopword list that is BSD-licensed created by Jacques Savoy.  These files reside in:
 analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
-analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt
+analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt,
+analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
 See http://members.unine.ch/jacques.savoy/clef/index.html.

 The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
--- a/plugins/analysis-ukrainian/licenses/lucene-NOTICE.txt
+++ b/plugins/analysis-ukrainian/licenses/lucene-NOTICE.txt
@ -54,13 +54,14 @@ The KStem stemmer in
 was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
 under the BSD-license.

-The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default
+The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default
 stopword list that is BSD-licensed created by Jacques Savoy.  These files reside in:
 analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
-analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt
+analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt,
+analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
 See http://members.unine.ch/jacques.savoy/clef/index.html.

 The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
--- a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java
+++ b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java
@ -112,6 +112,8 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
        .put("arabicnormalization",       MovedToAnalysisCommon.class)
        .put("arabicstem",                MovedToAnalysisCommon.class)
        .put("asciifolding",              MovedToAnalysisCommon.class)
+        .put("bengalinormalization",      MovedToAnalysisCommon.class)
+        .put("bengalistem",               MovedToAnalysisCommon.class)
        .put("brazilianstem",             MovedToAnalysisCommon.class)
        .put("bulgarianstem",             MovedToAnalysisCommon.class)
        .put("cjkbigram",                 MovedToAnalysisCommon.class)