LUCENE-7490: Added bengali language analyzer

2017-08-24 18:05:22 +06:00 · 2017-08-24 18:05:22 +06:00 · 1bca06b8a9
parent 7760b35645
commit 1bca06b8a9
15 changed files with 1135 additions and 2 deletions
--- a/lucene/NOTICE.txt
+++ b/lucene/NOTICE.txt
@ -54,13 +54,14 @@ The KStem stemmer in
 was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
 under the BSD-license.

-The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default
+The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default
 stopword list that is BSD-licensed created by Jacques Savoy.  These files reside in:
 analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
-analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt
+analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt,
+analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
 See http://members.unine.ch/jacques.savoy/clef/index.html.

 The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliAnalyzer.java
@ -0,0 +1,132 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.bn;
+
+
+import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.core.DecimalDigitFilter;
+import org.apache.lucene.analysis.in.IndicNormalizationFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+
+import java.io.IOException;
+import java.io.Reader;
+
+/**
+ * Analyzer for Bengali.
+ * <p>
+ * Tokenizes with {@link StandardTokenizer} and then applies lower-casing, decimal digit
+ * folding, optional keyword marking (stem exclusions), Indic and Bengali normalization,
+ * stop-word removal and Bengali stemming. See {@link #createComponents(String)} for the
+ * exact order.
+ * </p>
+ */
+public final class BengaliAnalyzer extends StopwordAnalyzerBase {
+  /** Terms in this set are marked as keywords before normalization and stemming. */
+  private final CharArraySet stemExclusionSet;
+
+  /**
+   * File containing default Bengali stopwords.
+   *
+   * Default stopword list is from http://members.unine.ch/jacques.savoy/clef/bengaliST.txt
+   * The stopword list is BSD-Licensed.
+   */
+  public static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";
+  /** Prefix passed to the stopword loader as the comment marker for {@link #DEFAULT_STOPWORD_FILE}. */
+  private static final String STOPWORDS_COMMENT = "#";
+
+  /**
+   * Returns an unmodifiable instance of the default stop-words set.
+   * @return an unmodifiable instance of the default stop-words set.
+   */
+  public static CharArraySet getDefaultStopSet() {
+    return DefaultSetHolder.DEFAULT_STOP_SET;
+  }
+
+  /**
+   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
+   * accesses the static final set the first time.
+   */
+  private static class DefaultSetHolder {
+    static final CharArraySet DEFAULT_STOP_SET;
+
+    static {
+      try {
+        DEFAULT_STOP_SET = loadStopwordSet(false, BengaliAnalyzer.class, DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
+      } catch (IOException ex) {
+        // Keep the original IOException as the cause so the real reason for the
+        // failure (missing resource, decoding problem, ...) is not lost.
+        throw new RuntimeException("Unable to load default stopword set", ex);
+      }
+    }
+  }
+
+  /**
+   * Builds an analyzer with the given stop words and stem exclusion set.
+   * The exclusion set is copied, so later changes to the argument do not affect this analyzer.
+   *
+   * @param stopwords a stopword set
+   * @param stemExclusionSet a stemming exclusion set
+   */
+  public BengaliAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
+    super(stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
+  }
+
+  /**
+   * Builds an analyzer with the given stop words and no stem exclusions.
+   *
+   * @param stopwords a stopword set
+   */
+  public BengaliAnalyzer(CharArraySet stopwords) {
+    this(stopwords, CharArraySet.EMPTY_SET);
+  }
+
+  /**
+   * Builds an analyzer with the default stop words:
+   * {@link #DEFAULT_STOPWORD_FILE}.
+   */
+  public BengaliAnalyzer() {
+    this(DefaultSetHolder.DEFAULT_STOP_SET);
+  }
+
+  /**
+   * Creates
+   * {@link TokenStreamComponents}
+   * used to tokenize all the text in the provided {@link Reader}.
+   *
+   * @return {@link TokenStreamComponents}
+   *         built from a {@link StandardTokenizer} filtered, in this order, with
+   *         {@link LowerCaseFilter}, {@link DecimalDigitFilter},
+   *         {@link SetKeywordMarkerFilter} (only if the stem exclusion set is not empty),
+   *         {@link IndicNormalizationFilter}, {@link BengaliNormalizationFilter},
+   *         {@link StopFilter} with the configured stop words, and {@link BengaliStemFilter}
+   */
+  @Override
+  protected TokenStreamComponents createComponents(String fieldName) {
+    final Tokenizer source = new StandardTokenizer();
+    TokenStream result = new LowerCaseFilter(source);
+    result = new DecimalDigitFilter(result);
+    // Keywords must be marked before the Bengali normalizer and stemmer run,
+    // because both of those filters skip tokens flagged as keywords.
+    if (!stemExclusionSet.isEmpty()) {
+      result = new SetKeywordMarkerFilter(result, stemExclusionSet);
+    }
+    result = new IndicNormalizationFilter(result);
+    result = new BengaliNormalizationFilter(result);
+    // Stop words are removed after normalization and before stemming.
+    result = new StopFilter(result, stopwords);
+    result = new BengaliStemFilter(result);
+    return new TokenStreamComponents(source, result);
+  }
+
+  /**
+   * Builds the normalization chain: {@link StandardFilter}, {@link LowerCaseFilter},
+   * {@link DecimalDigitFilter}, {@link IndicNormalizationFilter} and
+   * {@link BengaliNormalizationFilter}. No keyword marking, stop-word removal or
+   * stemming is applied here.
+   *
+   * @param fieldName the field name (not used by this implementation)
+   * @param in the stream to wrap
+   * @return the wrapped stream
+   */
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    TokenStream result = new StandardFilter(in);
+    result = new LowerCaseFilter(result);
+    result = new DecimalDigitFilter(result);
+    result = new IndicNormalizationFilter(result);
+    result = new BengaliNormalizationFilter(result);
+    return result;
+  }
+}
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizationFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizationFilter.java
@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.bn;
+
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+import java.io.IOException;
+
+/**
+ * A {@link TokenFilter} that runs {@link BengaliNormalizer} over each token's term
+ * buffer to normalize the orthography.
+ * <p>
+ * Normalization can conflate unrelated terms. Tokens whose {@link KeywordAttribute}
+ * is set are passed through untouched, so place a {@link SetKeywordMarkerFilter}
+ * (or a custom {@link TokenFilter} that sets the attribute) before this
+ * {@link TokenStream} to protect specific terms.
+ * </p>
+ * @see BengaliNormalizer
+ */
+public final class BengaliNormalizationFilter extends TokenFilter {
+
+  private final BengaliNormalizer normalizer = new BengaliNormalizer();
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
+
+  /**
+   * Wraps the given stream.
+   *
+   * @param input the stream whose tokens are normalized
+   */
+  public BengaliNormalizationFilter(TokenStream input) {
+    super(input);
+  }
+
+  /**
+   * Advances the wrapped stream and normalizes the current term in place unless
+   * it is flagged as a keyword.
+   *
+   * @return {@code true} if a token is available, {@code false} once the input is exhausted
+   */
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) {
+      return false;
+    }
+    if (!keywordAtt.isKeyword()) {
+      // The normalizer edits the buffer in place and reports the new term length.
+      final int normalizedLength = normalizer.normalize(termAtt.buffer(), termAtt.length());
+      termAtt.setLength(normalizedLength);
+    }
+    return true;
+  }
+}
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizationFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizationFilterFactory.java
@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.bn;
+
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
+import org.apache.lucene.analysis.util.MultiTermAwareComponent;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+import java.util.Map;
+
+/**
+ * Factory for {@link BengaliNormalizationFilter}.
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_bnnormal" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.BengaliNormalizationFilterFactory"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ */
+public class BengaliNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
+
+  /**
+   * Creates a new BengaliNormalizationFilterFactory.
+   *
+   * @param args factory arguments; anything still present in the map after the
+   *             superclass constructor has run is treated as an unknown parameter
+   * @throws IllegalArgumentException if unknown parameters remain
+   */
+  public BengaliNormalizationFilterFactory(Map<String,String> args) {
+    super(args);
+    final boolean hasUnknownParameters = !args.isEmpty();
+    if (hasUnknownParameters) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+
+  /**
+   * Returns this factory itself as the multi-term component.
+   */
+  @Override
+  public AbstractAnalysisFactory getMultiTermComponent() {
+    return this;
+  }
+
+  /**
+   * Wraps {@code input} in a new {@link BengaliNormalizationFilter}.
+   */
+  @Override
+  public TokenStream create(TokenStream input) {
+    return new BengaliNormalizationFilter(input);
+  }
+}
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizer.java
@ -0,0 +1,155 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.bn;
+
+
+import static org.apache.lucene.analysis.util.StemmerUtil.delete;
+
+/**
+ * Normalizer for Bengali.
+ * <p>
+ * Implements the Bengali-language specific algorithm specified in:
+ * <i>A Double Metaphone encoding for Bangla and its application in spelling checker</i>
+ * Naushad UzZaman and Mumit Khan.
+ * http://www.panl10n.net/english/final%20reports/pdf%20files/Bangladesh/BAN16.pdf
+ * </p>
+ */
+public class BengaliNormalizer {
+  /**
+   * Normalize an input buffer of Bengali text.
+   * <p>
+   * The buffer is rewritten in place: characters are replaced or removed, and only the
+   * first {@code len} characters are ever read or written.
+   * </p>
+   *
+   * @param s   input buffer
+   * @param len length of input buffer
+   * @return length of input buffer after normalization
+   */
+  public int normalize(char s[], int len) {
+
+    for (int i = 0; i < len; i++) {
+      switch (s[i]) {
+        // delete Chandrabindu (U+0981)
+        case '\u0981':
+          len = delete(s, i, len);
+          i--; // re-examine the character that moved into position i
+          break;
+
+        // DirghoI kar -> RosshoI kar (U+09C0 -> U+09BF)
+        case '\u09C0':
+          s[i] = '\u09BF';
+          break;
+
+        // DirghoU kar -> RosshoU kar (U+09C2 -> U+09C1)
+        case '\u09C2':
+          s[i] = '\u09C1';
+          break;
+
+        // Khio (Ka + Hoshonto + Murdorno Sh)
+        // NOTE(review): the description above names Murdorno Sh (U+09B7), but the
+        // condition below tests U+09BF (I kar). Behavior is left unchanged here
+        // because altering it changes emitted terms; confirm which one is intended.
+        case '\u0995':
+          if (i + 2 < len && s[i + 1] == '\u09CD' && s[i + 2] == '\u09BF') {
+            if (i == 0) {
+              // word-initial: the three characters collapse to a single Kha (U+0996)
+              s[i] = '\u0996';
+              len = delete(s, i + 2, len);
+              len = delete(s, i + 1, len);
+            } else {
+              // elsewhere: keep Ka, turn the hasanta into Kha and drop the third character
+              s[i + 1] = '\u0996';
+              len = delete(s, i + 2, len);
+            }
+          }
+          break;
+
+        // Nga to Anusvara (U+0999 -> U+0982)
+        case '\u0999':
+          s[i] = '\u0982';
+          break;
+
+        // Ja Phala (Hoshonto + Ya)
+        case '\u09AF':
+          if (i - 2 == 0 && s[i - 1] == '\u09CD') {
+            // after the first consonant: the hasanta becomes E kar (U+09C7) and Ya is removed
+            s[i - 1] = '\u09C7';
+
+            // only look at the following character if it is still inside the term
+            if (i + 1 < len && s[i + 1] == '\u09BE') {
+              len = delete(s, i + 1, len);
+            }
+            len = delete(s, i, len);
+            i--;
+          } else if (i - 1 >= 0 && s[i - 1] == '\u09CD') {
+            // any other hasanta + Ya: drop both characters. The guard keeps a Ya that is
+            // not preceded by a hasanta (including one at index 0) untouched instead of
+            // deleting an unrelated character or indexing before the buffer.
+            len = delete(s, i, len);
+            len = delete(s, i - 1, len);
+            i -= 2;
+          }
+          break;
+
+        // Ba Phalaa (Hoshonto + Ba)
+        case '\u09AC':
+          // only a Ba directly preceded by a hasanta is handled
+          if ((i >= 1 && s[i - 1] != '\u09CD') || i == 0)
+            break;
+          if (i - 2 == 0) {
+            // after the first consonant: drop hasanta and Ba
+            len = delete(s, i, len);
+            len = delete(s, i - 1, len);
+            i -= 2;
+          } else if (i - 5 >= 0 && s[i - 3] == '\u09CD') {
+            // preceded by another conjunct: drop hasanta and Ba
+            len = delete(s, i, len);
+            len = delete(s, i - 1, len);
+            i -= 2;
+          } else if (i - 2 >= 0) {
+            // otherwise double the preceding character: copy it over the hasanta and
+            // drop Ba. The guard skips a term that starts with hasanta + Ba (i == 1),
+            // where there is no preceding character to copy.
+            s[i - 1] = s[i - 2];
+            len = delete(s, i, len);
+            i--;
+          }
+          break;
+
+        // Visarga (U+0983)
+        case '\u0983':
+          if (i == len - 1) {
+            // final visarga: becomes Ha (U+09B9) in short terms, is removed otherwise
+            if (len <= 3) {
+              s[i] = '\u09B9';
+            } else {
+              len = delete(s, i, len);
+            }
+          } else {
+            // medial visarga: replaced by a copy of the following character
+            s[i] = s[i + 1];
+          }
+          break;
+
+        // All sh: U+09B6 and U+09B7 -> U+09B8
+        case '\u09B6':
+        case '\u09B7':
+          s[i] = '\u09B8';
+          break;
+
+        // check na: U+09A3 -> U+09A8
+        case '\u09A3':
+          s[i] = '\u09A8';
+          break;
+
+        // check ra: U+09DC and U+09DD -> U+09B0
+        case '\u09DC':
+        case '\u09DD':
+          s[i] = '\u09B0';
+          break;
+
+        // Khanda Ta (U+09CE) -> Ta (U+09A4)
+        case '\u09CE':
+          s[i] = '\u09A4';
+          break;
+
+        default:
+          break;
+      }
+    }
+
+    return len;
+  }
+}
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliStemFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliStemFilter.java
@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.bn;
+
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+import java.io.IOException;
+
+/**
+ * A {@link TokenFilter} that applies {@link BengaliStemmer} to stem Bengali words.
+ * <p>
+ * Tokens whose {@link KeywordAttribute} is set are passed through unstemmed.
+ * </p>
+ */
+public final class BengaliStemFilter extends TokenFilter {
+  private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
+  private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class);
+  private final BengaliStemmer bengaliStemmer = new BengaliStemmer();
+
+  /**
+   * Wraps the given stream.
+   *
+   * @param input the stream whose tokens are stemmed
+   */
+  public BengaliStemFilter(TokenStream input) {
+    super(input);
+  }
+
+  /**
+   * Advances the wrapped stream and shortens the current term to its stem unless
+   * the token is flagged as a keyword.
+   *
+   * @return {@code true} if a token is available, {@code false} once the input is exhausted
+   */
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) {
+      return false;
+    }
+    if (!keywordAttribute.isKeyword()) {
+      // The stemmer only reports the stem length; truncating the term applies it.
+      final int stemLength = bengaliStemmer.stem(termAttribute.buffer(), termAttribute.length());
+      termAttribute.setLength(stemLength);
+    }
+    return true;
+  }
+}
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliStemFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliStemFilterFactory.java
@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.bn;
+
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+import java.util.Map;
+
+/**
+ * Factory for {@link BengaliStemFilter}.
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_bnstem" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.BengaliStemFilterFactory"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ */
+public class BengaliStemFilterFactory extends TokenFilterFactory {
+
+  /**
+   * Creates a new BengaliStemFilterFactory.
+   *
+   * @param args factory arguments; anything still present in the map after the
+   *             superclass constructor has run is treated as an unknown parameter
+   * @throws IllegalArgumentException if unknown parameters remain
+   */
+  public BengaliStemFilterFactory(Map<String,String> args) {
+    super(args);
+    final boolean hasUnknownParameters = !args.isEmpty();
+    if (hasUnknownParameters) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+
+  /**
+   * Wraps {@code input} in a new {@link BengaliStemFilter}.
+   */
+  @Override
+  public TokenStream create(TokenStream input) {
+    return new BengaliStemFilter(input);
+  }
+}
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliStemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliStemmer.java
@ -0,0 +1,183 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.bn;
+
+
+import static org.apache.lucene.analysis.util.StemmerUtil.endsWith;
+
+/**
+ * Stemmer for Bengali.
+ * <p>
+ * The algorithm is based on the report in:
+ * <i>Natural Language Processing in an Indian Language (Bengali)-I: Verb Phrase Analysis</i>
+ * P Sengupta and B B Chaudhuri
+ * </p>
+ *
+ * <p>
+ *   Few Stemmer criteria are taken from:
+ *   <i>http://members.unine.ch/jacques.savoy/clef/BengaliStemmerLight.java.txt</i>
+ * </p>
+ */
+public class BengaliStemmer {
+  public int stem(char buffer[], int len) {
+
+    // 8
+    if (len > 9 && (endsWith(buffer, len, "িয়াছিলাম")
+        || endsWith(buffer, len, "িতেছিলাম")
+        || endsWith(buffer, len, "িতেছিলেন")
+        || endsWith(buffer, len, "ইতেছিলেন")
+        || endsWith(buffer, len, "িয়াছিলেন")
+        || endsWith(buffer, len, "ইয়াছিলেন")
+    ))
+      return len - 8;
+
+    // 7
+    if ((len > 8) && (endsWith(buffer, len, "িতেছিলি")
+        || endsWith(buffer, len, "িতেছিলে")
+        || endsWith(buffer, len, "িয়াছিলা")
+        || endsWith(buffer, len, "িয়াছিলে")
+        || endsWith(buffer, len, "িতেছিলা")
+        || endsWith(buffer, len, "িয়াছিলি")
+
+        || endsWith(buffer, len, "য়েদেরকে")
+    ))
+      return len - 7;
+
+    // 6
+    if ((len > 7) && (endsWith(buffer, len, "িতেছিস")
+        || endsWith(buffer, len, "িতেছেন")
+        || endsWith(buffer, len, "িয়াছিস")
+        || endsWith(buffer, len, "িয়াছেন")
+        || endsWith(buffer, len, "েছিলাম")
+        || endsWith(buffer, len, "েছিলেন")
+
+        || endsWith(buffer, len, "েদেরকে")
+    ))
+      return len - 6;
+
+    // 5
+    if ((len > 6) && (endsWith(buffer, len, "িতেছি")
+        || endsWith(buffer, len, "িতেছা")
+        || endsWith(buffer, len, "িতেছে")
+        || endsWith(buffer, len, "ছিলাম")
+        || endsWith(buffer, len, "ছিলেন")
+        || endsWith(buffer, len, "িয়াছি")
+        || endsWith(buffer, len, "িয়াছা")
+        || endsWith(buffer, len, "িয়াছে")
+        || endsWith(buffer, len, "েছিলে")
+        || endsWith(buffer, len, "েছিলা")
+
+        || endsWith(buffer, len, "য়েদের")
+        || endsWith(buffer, len, "দেরকে")
+    ))
+      return len - 5;
+
+    // 4
+    if ((len > 5) && (endsWith(buffer, len, "িলাম")
+        || endsWith(buffer, len, "িলেন")
+        || endsWith(buffer, len, "িতাম")
+        || endsWith(buffer, len, "িতেন")
+        || endsWith(buffer, len, "িবেন")
+        || endsWith(buffer, len, "ছিলি")
+        || endsWith(buffer, len, "ছিলে")
+        || endsWith(buffer, len, "ছিলা")
+        || endsWith(buffer, len, "তেছে")
+        || endsWith(buffer, len, "িতেছ")
+
+        || endsWith(buffer, len, "খানা")
+        || endsWith(buffer, len, "খানি")
+        || endsWith(buffer, len, "গুলো")
+        || endsWith(buffer, len, "গুলি")
+        || endsWith(buffer, len, "য়েরা")
+        || endsWith(buffer, len, "েদের")
+    ))
+      return len - 4;
+
+    // 3
+    if ((len > 4) && (endsWith(buffer, len, "লাম")
+        || endsWith(buffer, len, "িলি")
+        || endsWith(buffer, len, "ইলি")
+        || endsWith(buffer, len, "িলে")
+        || endsWith(buffer, len, "ইলে")
+        || endsWith(buffer, len, "লেন")
+        || endsWith(buffer, len, "িলা")
+        || endsWith(buffer, len, "ইলা")
+        || endsWith(buffer, len, "তাম")
+        || endsWith(buffer, len, "িতি")
+        || endsWith(buffer, len, "ইতি")
+        || endsWith(buffer, len, "িতে")
+        || endsWith(buffer, len, "ইতে")
+        || endsWith(buffer, len, "তেন")
+        || endsWith(buffer, len, "িতা")
+        || endsWith(buffer, len, "িবা")
+        || endsWith(buffer, len, "ইবা")
+        || endsWith(buffer, len, "িবি")
+        || endsWith(buffer, len, "ইবি")
+        || endsWith(buffer, len, "বেন")
+        || endsWith(buffer, len, "িবে")
+        || endsWith(buffer, len, "ইবে")
+        || endsWith(buffer, len, "ছেন")
+
+        || endsWith(buffer, len, "য়োন")
+        || endsWith(buffer, len, "য়ের")
+        || endsWith(buffer, len, "েরা")
+        || endsWith(buffer, len, "দের")
+    ))
+      return len - 3;
+
+    // 2
+    if ((len > 3) && (endsWith(buffer, len, "িস")
+        || endsWith(buffer, len, "েন")
+        || endsWith(buffer, len, "লি")
+        || endsWith(buffer, len, "লে")
+        || endsWith(buffer, len, "লা")
+        || endsWith(buffer, len, "তি")
+        || endsWith(buffer, len, "তে")
+        || endsWith(buffer, len, "তা")
+        || endsWith(buffer, len, "বি")
+        || endsWith(buffer, len, "বে")
+        || endsWith(buffer, len, "বা")
+        || endsWith(buffer, len, "ছি")
+        || endsWith(buffer, len, "ছা")
+        || endsWith(buffer, len, "ছে")
+        || endsWith(buffer, len, "ুন")
+        || endsWith(buffer, len, "ুক")
+
+        || endsWith(buffer, len, "টা")
+        || endsWith(buffer, len, "টি")
+        || endsWith(buffer, len, "নি")
+        || endsWith(buffer, len, "ের")
+        || endsWith(buffer, len, "তে")
+        || endsWith(buffer, len, "রা")
+        || endsWith(buffer, len, "কে")
+    ))
+      return len - 2;
+
+    // 1
+    if ((len > 2) && (endsWith(buffer, len, "ি")
+        || endsWith(buffer, len, "ী")
+        || endsWith(buffer, len, "া")
+        || endsWith(buffer, len, "ো")
+        || endsWith(buffer, len, "ে")
+        || endsWith(buffer, len, "ব")
+        || endsWith(buffer, len, "ত")
+    ))
+      return len - 1;
+
+    return len;
+  }
+}
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/package-info.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/package-info.java
@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Analyzer for Bengali Language.
+ */
+package org.apache.lucene.analysis.bn;
--- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
+++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
@ -17,6 +17,8 @@ org.apache.lucene.analysis.tr.ApostropheFilterFactory
 org.apache.lucene.analysis.ar.ArabicNormalizationFilterFactory
 org.apache.lucene.analysis.ar.ArabicStemFilterFactory
 org.apache.lucene.analysis.bg.BulgarianStemFilterFactory
+org.apache.lucene.analysis.bn.BengaliNormalizationFilterFactory
+org.apache.lucene.analysis.bn.BengaliStemFilterFactory
 org.apache.lucene.analysis.br.BrazilianStemFilterFactory
 org.apache.lucene.analysis.cjk.CJKBigramFilterFactory
 org.apache.lucene.analysis.cjk.CJKWidthFilterFactory
--- a/lucene/analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
+++ b/lucene/analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
@ -0,0 +1,121 @@
+# See http://members.unine.ch/jacques.savoy/clef/index.html.
+# This file was created by Jacques Savoy and is distributed under the BSD license
+এই
+ও
+থেকে
+করে
+এ
+না
+ওই
+এক্
+নিয়ে
+করা
+বলেন
+সঙ্গে
+যে
+এব
+তা
+আর
+কোনো
+বলে
+সেই
+দিন
+হয়
+কি
+দু
+পরে
+সব
+দেওয়া
+মধ্যে
+এর
+সি
+শুরু
+কাজ
+কিছু
+কাছে
+সে
+তবে
+বা
+বন
+আগে
+জ্নজন
+পি
+পর
+তো
+ছিল
+এখন
+আমরা
+প্রায়
+দুই
+আমাদের
+তাই
+অন্য
+গিয়ে
+প্রযন্ত
+মনে
+নতুন
+মতো
+কেখা
+প্রথম
+আজ
+টি
+ধামার
+অনেক
+বিভিন্ন
+র
+হাজার
+জানা
+নয়
+অবশ্য
+বেশি
+এস
+করে
+কে
+হতে
+বি
+কয়েক
+সহ
+বেশ
+এমন
+এমনি
+কেন
+কেউ
+নেওয়া
+চেষ্টা
+লক্ষ
+বলা
+কারণ
+আছে
+শুধু
+তখন
+যা
+এসে
+চার
+ছিল
+যদি
+আবার
+কোটি
+উত্তর
+সামনে
+উপর
+বক্তব্য
+এত
+প্রাথমিক
+উপরে
+আছে
+প্রতি
+কাজে
+যখন
+খুব
+বহু
+গেল
+পেয়্র্
+চালু
+ই
+নাগাদ
+থাকা
+পাচ
+যাওয়া
+রকম
+সাধারণ
+কমনে
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliAnalyzer.java
@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.bn;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
+
+
+/**
+ * Tests the BengaliAnalyzer
+ */
+public class TestBengaliAnalyzer extends BaseTokenStreamTestCase {
+
+  /** Constructing the analyzer with its defaults (and closing it) must not fail. */
+  public void testResourcesAvailable() {
+    new BengaliAnalyzer().close();
+  }
+
+  /** Two spellings of the same word must be reduced to the same term. */
+  public void testBasics() throws Exception {
+    // try-with-resources closes the analyzer even when an assertion fails
+    try (Analyzer a = new BengaliAnalyzer()) {
+      checkOneTerm(a, "বাড়ী", "বার");
+      checkOneTerm(a, "বারী", "বার");
+    }
+  }
+
+  /**
+   * test Digits
+   */
+  public void testDigits() throws Exception {
+    try (BengaliAnalyzer a = new BengaliAnalyzer()) {
+      checkOneTerm(a, "১২৩৪৫৬৭৮৯০", "1234567890");
+    }
+  }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    try (Analyzer analyzer = new BengaliAnalyzer()) {
+      checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
+    }
+  }
+}
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliFilters.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliFilters.java
@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.bn;
+
+
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
+
+/**
+ * Test Bengali Filter Factory
+ */
+public class TestBengaliFilters extends BaseTokenStreamFactoryTestCase {
+  /**
+   * Test IndicNormalizationFilterFactory
+   */
+  public void testIndicNormalizer() throws Exception {
+    Reader reader = new StringReader("ত্‍ আমি");
+    TokenStream stream = whitespaceMockTokenizer(reader);
+    stream = tokenFilterFactory("IndicNormalization").create(stream);
+    assertTokenStreamContents(stream, new String[] { "ৎ", "আমি" });
+  }
+  
+  /**
+   * Test BengaliNormalizationFilterFactory
+   */
+  public void testBengaliNormalizer() throws Exception {
+    Reader reader = new StringReader("বাড়ী");
+    TokenStream stream = whitespaceMockTokenizer(reader);
+    stream = tokenFilterFactory("IndicNormalization").create(stream);
+    stream = tokenFilterFactory("BengaliNormalization").create(stream);
+    assertTokenStreamContents(stream, new String[] {"বারি"});
+  }
+  
+  /**
+   * Test BengaliStemFilterFactory
+   */
+  public void testStemmer() throws Exception {
+    Reader reader = new StringReader("বাড়ী");
+    TokenStream stream = whitespaceMockTokenizer(reader);
+    stream = tokenFilterFactory("IndicNormalization").create(stream);
+    stream = tokenFilterFactory("BengaliNormalization").create(stream);
+    stream = tokenFilterFactory("BengaliStem").create(stream);
+    assertTokenStreamContents(stream, new String[] {"বার"});
+  }
+  
+  /** Test that bogus arguments result in exception */
+  public void testBogusArguments() throws Exception {
+    IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
+      tokenFilterFactory("IndicNormalization", "bogusArg", "bogusValue");
+    });
+    assertTrue(expected.getMessage().contains("Unknown parameters"));
+    
+    expected = expectThrows(IllegalArgumentException.class, () -> {
+      tokenFilterFactory("BengaliNormalization", "bogusArg", "bogusValue");
+    });
+    assertTrue(expected.getMessage().contains("Unknown parameters"));
+    
+    expected = expectThrows(IllegalArgumentException.class, () -> {
+      tokenFilterFactory("BengaliStem", "bogusArg", "bogusValue");
+    });
+    assertTrue(expected.getMessage().contains("Unknown parameters"));
+  }
+}
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliNormalizer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliNormalizer.java
@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.bn;
+
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
+
+import java.io.IOException;
+
+/**
+ * Test BengaliNormalizer: each case feeds one word through
+ * {@link BengaliNormalizationFilter} and compares the single resulting term.
+ */
+public class TestBengaliNormalizer extends BaseTokenStreamTestCase {
+  /**
+   * Test some basic normalization, with an example from the paper:
+   * the chandrabindu sign is expected to be dropped.
+   */
+  public void testChndrobindu() throws IOException {
+    check("চাঁদ", "চাদ");
+  }
+
+  /** The long i vowel sign is expected to become the short one. */
+  public void testRosshoIKar() throws IOException {
+    check("বাড়ী", "বারি");
+    check("তীর", "তির");
+  }
+
+  /** The long u vowel sign is expected to become the short one. */
+  public void testRosshoUKar() throws IOException {
+    check("ভূল", "ভুল");
+    check("অনূপ", "অনুপ");
+  }
+
+  /** The letter nga is expected to be rewritten as anusvara. */
+  public void testNga() throws IOException {
+    check("বাঙলা", "বাংলা");
+  }
+
+  /** Words containing a ja-phala conjunct. */
+  public void testJaPhaala() throws IOException {
+    check("ব্যাক্তি", "বেক্তি");
+    check("সন্ধ্যা", "সন্ধা");
+  }
+
+  /** Words containing a ba-phala conjunct. */
+  public void testBaPhalaa() throws IOException {
+    check("স্বদেশ", "সদেস");
+    check("তত্ত্ব", "তত্ত");
+    check("বিশ্ব", "বিসস");
+  }
+
+  /** Visarga in the middle and at the end of a word. */
+  public void testVisarga() throws IOException {
+    check("দুঃখ", "দুখখ");
+    check("উঃ", "উহ");
+    check("পুনঃ", "পুন");
+  }
+
+  /** A few miscellaneous single-letter substitutions. */
+  public void testBasics() throws IOException {
+    check("কণা", "কনা");
+    check("শরীর", "সরির");
+    check("বাড়ি", "বারি");
+  }
+
+  /**
+   * Runs {@code input} through a whitespace tokenizer followed by the
+   * normalization filter and asserts that exactly {@code output} is produced.
+   */
+  private void check(String input, String output) throws IOException {
+    Tokenizer tokenizer = whitespaceMockTokenizer(input);
+    TokenFilter tf = new BengaliNormalizationFilter(tokenizer);
+    assertTokenStreamContents(tf, new String[] { output });
+  }
+
+  /** The filter must pass an empty term through unchanged. */
+  public void testEmptyTerm() throws IOException {
+    // try-with-resources so the analyzer is closed even when the assertion fails
+    try (Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new KeywordTokenizer();
+        return new TokenStreamComponents(tokenizer, new BengaliNormalizationFilter(tokenizer));
+      }
+    }) {
+      checkOneTerm(a, "", "");
+    }
+  }
+}
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliStemmer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliStemmer.java
@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.bn;
+
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
+
+import java.io.IOException;
+
+/**
+ * Tests for BengaliStemmer: each case feeds one word through
+ * {@link BengaliStemFilter} and compares the single resulting term.
+ */
+public class TestBengaliStemmer extends BaseTokenStreamTestCase {
+
+  /**
+   * Tests a few verbs in Shadhu form.
+   */
+  public void testVerbsInShadhuForm() throws IOException {
+    check("করেছিলাম", "কর");
+    check("করিতেছিলে", "কর");
+    check("খাইতাম", "খাই");
+    check("যাইবে", "যা");
+  }
+
+  /**
+   * Tests a few verbs in Cholito form.
+   */
+  public void testVerbsInCholitoForm() throws IOException {
+    check("করছিলাম", "কর");
+    check("করছিলে", "কর");
+    check("করতাম", "কর");
+    check("যাব", "যা");
+    check("যাবে", "যা");
+    check("করি", "কর");
+    check("করো", "কর");
+  }
+
+  /**
+   * Tests a few nouns carrying different suffixes.
+   */
+  public void testNouns() throws IOException {
+    check("মেয়েরা", "মে");
+    check("মেয়েদেরকে", "মে");
+    check("মেয়েদের", "মে");
+
+    check("একটি", "এক");
+    check("মানুষগুলি", "মানুষ");
+  }
+
+  /**
+   * Runs {@code input} through a whitespace tokenizer followed by the stem
+   * filter and asserts that exactly {@code output} is produced.
+   */
+  private void check(String input, String output) throws IOException {
+    Tokenizer tokenizer = whitespaceMockTokenizer(input);
+    TokenFilter tf = new BengaliStemFilter(tokenizer);
+    assertTokenStreamContents(tf, new String[] { output });
+  }
+
+  /** The filter must pass an empty term through unchanged. */
+  public void testEmptyTerm() throws IOException {
+    // try-with-resources so the analyzer is closed even when the assertion fails
+    try (Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new KeywordTokenizer();
+        return new TokenStreamComponents(tokenizer, new BengaliStemFilter(tokenizer));
+      }
+    }) {
+      checkOneTerm(a, "", "");
+    }
+  }
+}