Added Bengali analyzer to Elasticsearch, following the Lucene update (PR#238)

2017-09-07 04:48:58 +06:00 · 2017-09-07 04:48:58 +06:00 · a40c474e10
parent a978ddf37b
commit a40c474e10
21 changed files with 227 additions and 18 deletions
--- a/core/licenses/lucene-NOTICE.txt
+++ b/core/licenses/lucene-NOTICE.txt
@ -54,13 +54,14 @@ The KStem stemmer in
 was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
 under the BSD-license.
-The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default
+The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default
 stopword list that is BSD-licensed created by Jacques Savoy.  These files reside in:
 analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
-analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt
+analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
 See http://members.unine.ch/jacques.savoy/clef/index.html.
 The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
--- a/core/src/main/java/org/elasticsearch/index/analysis/Analysis.java
+++ b/core/src/main/java/org/elasticsearch/index/analysis/Analysis.java
@ -23,6 +23,7 @@ import org.apache.logging.log4j.Logger;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.ar.ArabicAnalyzer;
 import org.apache.lucene.analysis.bg.BulgarianAnalyzer;
 import org.apache.lucene.analysis.bn.BengaliAnalyzer;
 import org.apache.lucene.analysis.br.BrazilianAnalyzer;
 import org.apache.lucene.analysis.ca.CatalanAnalyzer;
 import org.apache.lucene.analysis.ckb.SoraniAnalyzer;
@ -119,6 +120,7 @@ public class Analysis {
        namedStopWords.put("_arabic_", ArabicAnalyzer.getDefaultStopSet());
        namedStopWords.put("_armenian_", ArmenianAnalyzer.getDefaultStopSet());
        namedStopWords.put("_basque_", BasqueAnalyzer.getDefaultStopSet());
        namedStopWords.put("_bengali_", BengaliAnalyzer.getDefaultStopSet());
        namedStopWords.put("_brazilian_", BrazilianAnalyzer.getDefaultStopSet());
        namedStopWords.put("_bulgarian_", BulgarianAnalyzer.getDefaultStopSet());
        namedStopWords.put("_catalan_", CatalanAnalyzer.getDefaultStopSet());
--- a/core/src/main/java/org/elasticsearch/index/analysis/BengaliAnalyzerProvider.java
+++ b/core/src/main/java/org/elasticsearch/index/analysis/BengaliAnalyzerProvider.java
@ -0,0 +1,45 @@
 /*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
 package org.elasticsearch.index.analysis;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.bn.BengaliAnalyzer;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
 /**
  * Analyzer provider that builds one {@link BengaliAnalyzer} at construction time
  * and returns that same instance from {@link #get()}.
  */
 public class BengaliAnalyzerProvider extends AbstractIndexAnalyzerProvider<BengaliAnalyzer> {
    private final BengaliAnalyzer analyzer;
    /**
     * Creates the provider and its analyzer.
     *
     * @param indexSettings forwarded to the superclass constructor
     * @param env           passed to {@code Analysis.parseStopWords} together with {@code settings}
     * @param name          forwarded to the superclass constructor
     * @param settings      analyzer settings; handed to the stop word and stem exclusion parsers
     */
    public BengaliAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
        super(indexSettings, name, settings);
        // Stop words are parsed with the analyzer's bundled stop set supplied as the default argument.
        final CharArraySet stopWords = Analysis.parseStopWords(env, settings, BengaliAnalyzer.getDefaultStopSet());
        // Stem exclusions are parsed with an empty set supplied as the default argument.
        final CharArraySet stemExclusions = Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET);
        analyzer = new BengaliAnalyzer(stopWords, stemExclusions);
        // `version` is not declared in this class -- presumably inherited from the base provider.
        analyzer.setVersion(version);
    }
    /**
     * @return the analyzer instance created in the constructor
     */
    @Override
    public BengaliAnalyzer get() {
        return analyzer;
    }
 }
--- a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java
+++ b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java
@ -32,6 +32,7 @@ import org.elasticsearch.index.analysis.AnalyzerProvider;
 import org.elasticsearch.index.analysis.ArabicAnalyzerProvider;
 import org.elasticsearch.index.analysis.ArmenianAnalyzerProvider;
 import org.elasticsearch.index.analysis.BasqueAnalyzerProvider;
 import org.elasticsearch.index.analysis.BengaliAnalyzerProvider;
 import org.elasticsearch.index.analysis.BrazilianAnalyzerProvider;
 import org.elasticsearch.index.analysis.BulgarianAnalyzerProvider;
 import org.elasticsearch.index.analysis.CatalanAnalyzerProvider;
@ -270,6 +271,7 @@ public final class AnalysisModule {
        analyzers.register("arabic", ArabicAnalyzerProvider::new);
        analyzers.register("armenian", ArmenianAnalyzerProvider::new);
        analyzers.register("basque", BasqueAnalyzerProvider::new);
        analyzers.register("bengali", BengaliAnalyzerProvider::new);
        analyzers.register("brazilian", BrazilianAnalyzerProvider::new);
        analyzers.register("bulgarian", BulgarianAnalyzerProvider::new);
        analyzers.register("catalan", CatalanAnalyzerProvider::new);
--- a/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltAnalyzers.java
+++ b/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltAnalyzers.java
@ -22,6 +22,7 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.ar.ArabicAnalyzer;
 import org.apache.lucene.analysis.bg.BulgarianAnalyzer;
 import org.apache.lucene.analysis.bn.BengaliAnalyzer;
 import org.apache.lucene.analysis.br.BrazilianAnalyzer;
 import org.apache.lucene.analysis.ca.CatalanAnalyzer;
 import org.apache.lucene.analysis.cjk.CJKAnalyzer;
@ -183,6 +184,15 @@ public enum PreBuiltAnalyzers {
        }
    },
    BENGALI {
        /**
         * Builds a {@link BengaliAnalyzer} with its no-argument constructor and stamps it
         * with the Lucene version carried by the given {@code version}.
         */
        @Override
        protected Analyzer create(Version version) {
            final BengaliAnalyzer bengali = new BengaliAnalyzer();
            bengali.setVersion(version.luceneVersion);
            return bengali;
        }
    },
    BRAZILIAN {
        @Override
        protected Analyzer create(Version version) {
--- a/docs/reference/analysis/analyzers/lang-analyzer.asciidoc
+++ b/docs/reference/analysis/analyzers/lang-analyzer.asciidoc
@ -6,6 +6,7 @@ following types are supported:
 <<arabic-analyzer,`arabic`>>,
 <<armenian-analyzer,`armenian`>>,
 <<basque-analyzer,`basque`>>,
 <<bengali-analyzer,`bengali`>>,
 <<brazilian-analyzer,`brazilian`>>,
 <<bulgarian-analyzer,`bulgarian`>>,
 <<catalan-analyzer,`catalan`>>,
@ -55,7 +56,7 @@ functionality is implemented by adding the
 with the `keywords` set to the value of the `stem_exclusion` parameter.
 The following analyzers support setting custom `stem_exclusion` list:
-`arabic`, `armenian`, `basque`, `bulgarian`, `catalan`, `czech`,
+`arabic`, `armenian`, `basque`, `bengali`, `bulgarian`, `catalan`, `czech`,
 `dutch`, `english`, `finnish`, `french`, `galician`,
 `german`, `hindi`, `hungarian`, `indonesian`, `irish`, `italian`, `latvian`,
 `lithuanian`, `norwegian`, `portuguese`, `romanian`, `russian`, `sorani`,
@ -209,6 +210,54 @@ PUT /armenian_example
 <2> This filter should be removed unless there are words which should
    be excluded from stemming.
 [[bengali-analyzer]]
 ===== `bengali` analyzer
 The `bengali` analyzer could be reimplemented as a `custom` analyzer as follows:
 [source,js]
 ----------------------------------------------------
 PUT /bengali_example
 {
  "settings": {
    "analysis": {
      "filter": {
        "bengali_stop": {
          "type":       "stop",
          "stopwords":  "_bengali_" <1>
        },
        "bengali_keywords": {
          "type":       "keyword_marker",
          "keywords":   ["উদাহরণ"] <2>
        },
        "bengali_stemmer": {
          "type":       "stemmer",
          "language":   "bengali"
        }
      },
      "analyzer": {
        "bengali": {
          "tokenizer":  "standard",
          "filter": [
            "lowercase",
            "indic_normalization",
            "bengali_normalization",
            "bengali_stop",
            "bengali_keywords",
            "bengali_stemmer"
          ]
        }
      }
    }
  }
 }
 ----------------------------------------------------
 // CONSOLE
 <1> The default stopwords can be overridden with the `stopwords`
    or `stopwords_path` parameters.
 <2> This filter should be removed unless there are words which should
    be excluded from stemming.
 [[brazilian-analyzer]]
 ===== `brazilian` analyzer
--- a/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc
+++ b/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc
@ -44,6 +44,10 @@ Basque::
 http://snowball.tartarus.org/algorithms/basque/stemmer.html[*`basque`*]
 Bengali::
 http://www.tandfonline.com/doi/abs/10.1080/02564602.1993.11437284[*`bengali`*],
 http://members.unine.ch/jacques.savoy/clef/BengaliStemmerLight.java.txt[*`light_bengali`*]
 Brazilian Portuguese::
 http://lucene.apache.org/core/4_9_0/analyzers-common/org/apache/lucene/analysis/br/BrazilianStemmer.html[*`brazilian`*]
--- a/docs/reference/analysis/tokenfilters/stop-tokenfilter.asciidoc
+++ b/docs/reference/analysis/tokenfilters/stop-tokenfilter.asciidoc
@ -71,7 +71,7 @@ PUT /my_index
 Elasticsearch provides the following predefined list of languages:
-`_arabic_`, `_armenian_`, `_basque_`, `_brazilian_`, `_bulgarian_`,
+`_arabic_`, `_armenian_`, `_basque_`, `_bengali_`, `_brazilian_`, `_bulgarian_`,
 `_catalan_`, `_czech_`, `_danish_`, `_dutch_`, `_english_`, `_finnish_`,
 `_french_`, `_galician_`, `_german_`, `_greek_`, `_hindi_`, `_hungarian_`,
 `_indonesian_`, `_irish_`, `_italian_`, `_latvian_`, `_norwegian_`, `_persian_`,
--- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/BengaliNormalizationFilterFactory.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/BengaliNormalizationFilterFactory.java
@ -0,0 +1,47 @@
 /*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
 package org.elasticsearch.analysis.common;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.bn.BengaliNormalizationFilter;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
 import org.elasticsearch.index.analysis.MultiTermAwareComponent;
 /**
  * Token filter factory that wraps a {@link TokenStream} in a
  * {@link BengaliNormalizationFilter}. The factory also acts as its own
  * multi-term component.
  */
 public class BengaliNormalizationFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {
    BengaliNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
        // `environment` is accepted but not used by this factory.
        super(indexSettings, name, settings);
    }
    /**
     * @return this factory itself
     */
    @Override
    public Object getMultiTermComponent() {
        return this;
    }
    /**
     * @param tokenStream the stream to wrap
     * @return a new {@link BengaliNormalizationFilter} reading from {@code tokenStream}
     */
    @Override
    public TokenStream create(TokenStream tokenStream) {
        final TokenStream normalized = new BengaliNormalizationFilter(tokenStream);
        return normalized;
    }
 }
--- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
@ -25,6 +25,7 @@ import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
 import org.apache.lucene.analysis.ar.ArabicStemFilter;
 import org.apache.lucene.analysis.bn.BengaliNormalizationFilter;
 import org.apache.lucene.analysis.br.BrazilianStemFilter;
 import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
 import org.apache.lucene.analysis.cjk.CJKBigramFilter;
@ -94,6 +95,7 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
        filters.put("arabic_normalization", ArabicNormalizationFilterFactory::new);
        filters.put("arabic_stem", ArabicStemTokenFilterFactory::new);
        filters.put("asciifolding", ASCIIFoldingTokenFilterFactory::new);
        filters.put("bengali_normalization", BengaliNormalizationFilterFactory::new);
        filters.put("brazilian_stem", BrazilianStemTokenFilterFactory::new);
        filters.put("cjk_bigram", CJKBigramFilterFactory::new);
        filters.put("cjk_width", CJKWidthFilterFactory::new);
@ -180,6 +182,7 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
        filters.add(PreConfiguredTokenFilter.singleton("arabic_normalization", true, ArabicNormalizationFilter::new));
        filters.add(PreConfiguredTokenFilter.singleton("arabic_stem", false, ArabicStemFilter::new));
        filters.add(PreConfiguredTokenFilter.singleton("asciifolding", true, ASCIIFoldingFilter::new));
        filters.add(PreConfiguredTokenFilter.singleton("bengali_normalization", true, BengaliNormalizationFilter::new));
        filters.add(PreConfiguredTokenFilter.singleton("brazilian_stem", false, BrazilianStemFilter::new));
        filters.add(PreConfiguredTokenFilter.singleton("cjk_bigram", false, CJKBigramFilter::new));
        filters.add(PreConfiguredTokenFilter.singleton("cjk_width", true, CJKWidthFilter::new));
--- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactory.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactory.java
@ -22,6 +22,7 @@ package org.elasticsearch.analysis.common;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.ar.ArabicStemFilter;
 import org.apache.lucene.analysis.bg.BulgarianStemFilter;
 import org.apache.lucene.analysis.bn.BengaliStemFilter;
 import org.apache.lucene.analysis.br.BrazilianStemFilter;
 import org.apache.lucene.analysis.ckb.SoraniStemFilter;
 import org.apache.lucene.analysis.cz.CzechStemFilter;
@ -102,6 +103,8 @@ public class StemmerTokenFilterFactory extends AbstractTokenFilterFactory {
            return new SnowballFilter(tokenStream, new ArmenianStemmer());
        } else if ("basque".equalsIgnoreCase(language)) {
            return new SnowballFilter(tokenStream, new BasqueStemmer());
        } else if ("bengali".equalsIgnoreCase(language)) {
            return new BengaliStemFilter(tokenStream);
        } else if ("brazilian".equalsIgnoreCase(language)) {
            return new BrazilianStemFilter(tokenStream);
        } else if ("bulgarian".equalsIgnoreCase(language)) {
--- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java
+++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java
@ -67,6 +67,7 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
        filters.put("uppercase", UpperCaseTokenFilterFactory.class);
        filters.put("ngram", NGramTokenFilterFactory.class);
        filters.put("edgengram", EdgeNGramTokenFilterFactory.class);
        filters.put("bengalistem", StemmerTokenFilterFactory.class);
        filters.put("bulgarianstem", StemmerTokenFilterFactory.class);
        filters.put("englishminimalstem", StemmerTokenFilterFactory.class);
        filters.put("englishpossessive", StemmerTokenFilterFactory.class);
@ -106,6 +107,7 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
        filters.put("patternreplace", PatternReplaceTokenFilterFactory.class);
        filters.put("patterncapturegroup", PatternCaptureGroupTokenFilterFactory.class);
        filters.put("arabicnormalization", ArabicNormalizationFilterFactory.class);
        filters.put("bengalinormalization", BengaliNormalizationFilterFactory.class);
        filters.put("germannormalization", GermanNormalizationFilterFactory.class);
        filters.put("hindinormalization", HindiNormalizationFilterFactory.class);
        filters.put("indicnormalization", IndicNormalizationFilterFactory.class);
@ -159,6 +161,7 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
        filters.put("arabic_normalization", null);
        filters.put("arabic_stem", null);
        filters.put("asciifolding", null);
        filters.put("bengali_normalization", null);
        filters.put("brazilian_stem", null);
        filters.put("cjk_bigram", null);
        filters.put("cjk_width", null);
--- a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/40_token_filters.yml
+++ b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/40_token_filters.yml
@ -695,6 +695,37 @@
    - length: { tokens: 1 }
    - match:  { tokens.0.token: اجن }
 ---
 # Exercises the `bengali_normalization` token filter twice: once through a
 # filter defined in index settings, once through the filter name directly.
 "bengali_normalization":
    # Create index `test` with a custom filter of type `bengali_normalization`.
    - do:
        indices.create:
          index: test
          body:
            settings:
              analysis:
                filter:
                  my_bengali_normalization:
                    type: bengali_normalization
    # Analyze against the index using the custom filter; the keyword tokenizer
    # is used and exactly one token is expected.
    - do:
        indices.analyze:
          index: test
          body:
            text:      চাঁদ
            tokenizer: keyword
            filter:    [my_bengali_normalization]
    - length: { tokens: 1 }
    # The expected token is the input text without the candrabindu sign (ঁ).
    - match:  { tokens.0.token: চাদ }
    # Test pre-configured token filter too:
    # (no index is given here; the filter is referenced by its plain name)
    - do:
        indices.analyze:
          body:
            text:      চাঁদ
            tokenizer: keyword
            filter:    [bengali_normalization]
    - length: { tokens: 1 }
    - match:  { tokens.0.token: চাদ }
 ---
 "german_normalization":
    - do:
--- a/modules/lang-expression/licenses/lucene-NOTICE.txt
+++ b/modules/lang-expression/licenses/lucene-NOTICE.txt
@ -54,13 +54,14 @@ The KStem stemmer in
 was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
 under the BSD-license.
-The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default
+The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default
 stopword list that is BSD-licensed created by Jacques Savoy.  These files reside in:
 analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
-analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt
+analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
 See http://members.unine.ch/jacques.savoy/clef/index.html.
 The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
--- a/plugins/analysis-icu/licenses/lucene-NOTICE.txt
+++ b/plugins/analysis-icu/licenses/lucene-NOTICE.txt
@ -54,13 +54,14 @@ The KStem stemmer in
 was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
 under the BSD-license.
-The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default
+The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default
 stopword list that is BSD-licensed created by Jacques Savoy.  These files reside in:
 analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
-analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt
+analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
 See http://members.unine.ch/jacques.savoy/clef/index.html.
 The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
--- a/plugins/analysis-kuromoji/licenses/lucene-NOTICE.txt
+++ b/plugins/analysis-kuromoji/licenses/lucene-NOTICE.txt
@ -54,13 +54,14 @@ The KStem stemmer in
 was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
 under the BSD-license.
-The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default
+The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default
 stopword list that is BSD-licensed created by Jacques Savoy.  These files reside in:
 analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
-analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt
+analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
 See http://members.unine.ch/jacques.savoy/clef/index.html.
 The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
--- a/plugins/analysis-phonetic/licenses/lucene-NOTICE.txt
+++ b/plugins/analysis-phonetic/licenses/lucene-NOTICE.txt
@ -54,13 +54,14 @@ The KStem stemmer in
 was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
 under the BSD-license.
-The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default
+The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default
 stopword list that is BSD-licensed created by Jacques Savoy.  These files reside in:
 analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
-analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt
+analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
 See http://members.unine.ch/jacques.savoy/clef/index.html.
 The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
--- a/plugins/analysis-smartcn/licenses/lucene-NOTICE.txt
+++ b/plugins/analysis-smartcn/licenses/lucene-NOTICE.txt
@ -54,13 +54,14 @@ The KStem stemmer in
 was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
 under the BSD-license.
-The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default
+The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default
 stopword list that is BSD-licensed created by Jacques Savoy.  These files reside in:
 analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
-analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt
+analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
 See http://members.unine.ch/jacques.savoy/clef/index.html.
 The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
--- a/plugins/analysis-stempel/licenses/lucene-NOTICE.txt
+++ b/plugins/analysis-stempel/licenses/lucene-NOTICE.txt
@ -54,13 +54,14 @@ The KStem stemmer in
 was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
 under the BSD-license.
-The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default
+The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default
 stopword list that is BSD-licensed created by Jacques Savoy.  These files reside in:
 analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
-analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt
+analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
 See http://members.unine.ch/jacques.savoy/clef/index.html.
 The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
--- a/plugins/analysis-ukrainian/licenses/lucene-NOTICE.txt
+++ b/plugins/analysis-ukrainian/licenses/lucene-NOTICE.txt
@ -54,13 +54,14 @@ The KStem stemmer in
 was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
 under the BSD-license.
-The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default
+The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default
 stopword list that is BSD-licensed created by Jacques Savoy.  These files reside in:
 analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
-analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt
+analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
 See http://members.unine.ch/jacques.savoy/clef/index.html.
 The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
--- a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java
+++ b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java
@ -112,6 +112,8 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
        .put("arabicnormalization",       MovedToAnalysisCommon.class)
        .put("arabicstem",                MovedToAnalysisCommon.class)
        .put("asciifolding",              MovedToAnalysisCommon.class)
        .put("bengalinormalization",      MovedToAnalysisCommon.class)
        .put("bengalistem",               MovedToAnalysisCommon.class)
        .put("brazilianstem",             MovedToAnalysisCommon.class)
        .put("bulgarianstem",             MovedToAnalysisCommon.class)
        .put("cjkbigram",                 MovedToAnalysisCommon.class)