LUCENE-7940: Bengali Analyzer for Lucene

Closes #238
2025-02-13 13:35:37 +00:00 · 2017-09-05 18:45:22 -04:00 · 2017-09-05 18:45:22 -04:00 · 08128f712f
commit 08128f712f
parent cc344dc6bd 40dddf9324
15 changed files with 1150 additions and 2 deletions
--- a/lucene/NOTICE.txt
+++ b/lucene/NOTICE.txt
@ -54,13 +54,14 @@ The KStem stemmer in
 was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
 under the BSD-license.
-The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default
+The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default
 stopword list that is BSD-licensed created by Jacques Savoy.  These files reside in:
 analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
-analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt
+analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt,
 analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
 See http://members.unine.ch/jacques.savoy/clef/index.html.
 The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliAnalyzer.java
@ -0,0 +1,132 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.analysis.bn;
 import org.apache.lucene.analysis.*;
 import org.apache.lucene.analysis.core.DecimalDigitFilter;
 import org.apache.lucene.analysis.in.IndicNormalizationFilter;
 import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import java.io.IOException;
 import java.io.Reader;
 /**
 * Analyzer for Bengali.
 * <p>
 * Text is tokenized with {@link StandardTokenizer}; see {@link #createComponents(String)}
 * for the exact order of the filters applied to the tokens.
 * </p>
 */
 public final class BengaliAnalyzer extends StopwordAnalyzerBase {
  /** Unmodifiable copy of the terms that get marked as keywords in the analysis chain. */
  private final CharArraySet stemExclusionSet;
  /**
   * File containing default Bengali stopwords.
   * 
   * Default stopword list is from http://members.unine.ch/jacques.savoy/clef/bengaliST.txt
   * The stopword list is BSD-Licensed.
   */
  public static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";
  /** Comment marker handed to {@code loadStopwordSet} when reading the stopword file. */
  private static final String STOPWORDS_COMMENT = "#";
  /**
   * Returns an unmodifiable instance of the default stop-words set.
   * @return an unmodifiable instance of the default stop-words set.
   */
  public static CharArraySet getDefaultStopSet(){
    return DefaultSetHolder.DEFAULT_STOP_SET;
  }
  /**
   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class 
   * accesses the static final set the first time.
   */
  private static class DefaultSetHolder {
    static final CharArraySet DEFAULT_STOP_SET;
    static {
      try {
        DEFAULT_STOP_SET = loadStopwordSet(false, BengaliAnalyzer.class, DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
      } catch (IOException ex) {
        // Keep the IOException as the cause so the underlying failure is not lost.
        throw new RuntimeException("Unable to load default stopword set", ex);
      }
    }
  }
  /**
   * Builds an analyzer with the given stop words and stem exclusion set.
   * 
   * @param stopwords a stopword set
   * @param stemExclusionSet a stemming exclusion set; it is copied, so later changes
   *        to the argument do not affect this analyzer
   */
  public BengaliAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
    super(stopwords);
    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
  }
  /**
   * Builds an analyzer with the given stop words and an empty stem exclusion set.
   * 
   * @param stopwords a stopword set
   */
  public BengaliAnalyzer(CharArraySet stopwords) {
    this(stopwords, CharArraySet.EMPTY_SET);
  }
  /**
   * Builds an analyzer with the default stop words:
   * {@link #DEFAULT_STOPWORD_FILE}.
   */
  public BengaliAnalyzer() {
    this(DefaultSetHolder.DEFAULT_STOP_SET);
  }
  /**
   * Creates
   * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   * used to tokenize all the text in the provided {@link Reader}.
   * 
   * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   *         built from a {@link StandardTokenizer} filtered, in this order, with
   *         {@link LowerCaseFilter}, {@link DecimalDigitFilter},
   *         {@link SetKeywordMarkerFilter} if a stem exclusion set is provided,
   *         {@link IndicNormalizationFilter}, {@link BengaliNormalizationFilter},
   *         {@link StopFilter} using the Bengali stop words, and {@link BengaliStemFilter}
   */
  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new LowerCaseFilter(source);
    result = new DecimalDigitFilter(result);
    // The keyword marker is added before the normalization and stem filters,
    // both of which leave keyword tokens untouched.
    if (!stemExclusionSet.isEmpty()) {
      result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    }
    result = new IndicNormalizationFilter(result);
    result = new BengaliNormalizationFilter(result);
    result = new StopFilter(result, stopwords);
    result = new BengaliStemFilter(result);
    return new TokenStreamComponents(source, result);
  }
  /**
   * Wraps {@code in} with the filters of the chain that neither remove nor stem tokens:
   * {@link StandardFilter}, {@link LowerCaseFilter}, {@link DecimalDigitFilter},
   * {@link IndicNormalizationFilter} and {@link BengaliNormalizationFilter}.
   */
  @Override
  protected TokenStream normalize(String fieldName, TokenStream in) {
    TokenStream result = new StandardFilter(in);
    result = new LowerCaseFilter(result);
    result = new DecimalDigitFilter(result);
    result = new IndicNormalizationFilter(result);
    result = new BengaliNormalizationFilter(result);
    return result;
  }
 }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizationFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizationFilter.java
@ -0,0 +1,59 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.analysis.bn;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 import java.io.IOException;
 /**
 * A {@link TokenFilter} that rewrites each term's orthography with
 * {@link BengaliNormalizer}.
 * <p>
 * Normalization can conflate unrelated terms. Tokens whose
 * {@link KeywordAttribute} is set are passed through unchanged, so place a
 * {@link SetKeywordMarkerFilter} (or any custom {@link TokenFilter} that sets
 * the attribute) before this {@link TokenStream} to protect specific terms.
 * </p>
 * @see BengaliNormalizer
 */
 public final class BengaliNormalizationFilter extends TokenFilter {
  private final BengaliNormalizer bengaliNormalizer = new BengaliNormalizer();
  private final CharTermAttribute term = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keyword = addAttribute(KeywordAttribute.class);
  /**
   * Creates a filter that normalizes the terms produced by {@code input}.
   */
  public BengaliNormalizationFilter(TokenStream input) {
    super(input);
  }
  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    if (keyword.isKeyword()) {
      // keyword tokens are emitted exactly as received
      return true;
    }
    // the normalizer edits the term buffer in place and reports the new length
    final int normalizedLength = bengaliNormalizer.normalize(term.buffer(), term.length());
    term.setLength(normalizedLength);
    return true;
  }
 }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizationFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizationFilterFactory.java
@ -0,0 +1,55 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.analysis.bn;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
 import org.apache.lucene.analysis.util.MultiTermAwareComponent;
 import org.apache.lucene.analysis.util.TokenFilterFactory;
 import java.util.Map;
 /**
 * Factory for {@link BengaliNormalizationFilter}.
 * <pre class="prettyprint">
 * &lt;fieldType name="text_bnnormal" class="solr.TextField" positionIncrementGap="100"&gt;
 *   &lt;analyzer&gt;
 *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
 *     &lt;filter class="solr.BengaliNormalizationFilterFactory"/&gt;
 *   &lt;/analyzer&gt;
 * &lt;/fieldType&gt;</pre>
 */
 public class BengaliNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
  /**
   * Creates a new BengaliNormalizationFilterFactory.
   *
   * @param args factory arguments; any entry still present after the superclass
   *        constructor has run is reported as unknown
   * @throws IllegalArgumentException if {@code args} is not empty after the superclass
   *         constructor has run
   */
  public BengaliNormalizationFilterFactory(Map<String,String> args) {
    super(args);
    final boolean hasLeftoverArgs = !args.isEmpty();
    if (hasLeftoverArgs) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }
  /** Wraps {@code input} in a new {@link BengaliNormalizationFilter}. */
  @Override
  public TokenStream create(TokenStream input) {
    final TokenStream normalized = new BengaliNormalizationFilter(input);
    return normalized;
  }
  /** Returns this factory itself as the multi-term component. */
  @Override
  public AbstractAnalysisFactory getMultiTermComponent() {
    return this;
  }
 }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizer.java
@ -0,0 +1,155 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.analysis.bn;
 import static org.apache.lucene.analysis.util.StemmerUtil.delete;
 /**
 * Normalizer for Bengali.
 * <p>
 * Implements the Bengali-language specific algorithm specified in:
 * <i>A Double Metaphone encoding for Bangla and its application in spelling checker</i>
 * Naushad UzZaman and Mumit Khan.
 * http://www.panl10n.net/english/final%20reports/pdf%20files/Bangladesh/BAN16.pdf
 * </p>
 */
 public class BengaliNormalizer {
  /**
   * Normalize an input buffer of Bengali text.
   * <p>
   * The buffer is rewritten in place: chars are replaced or deleted, and only the
   * first {@code len} chars are examined.
   * </p>
   *
   * @param s   input buffer
   * @param len length of input buffer
   * @return length of input buffer after normalization
   */
  public int normalize(char s[], int len) {
    for (int i = 0; i < len; i++) {
      switch (s[i]) {
        // delete Chandrabindu (U+0981)
        case '\u0981':
          len = delete(s, i, len);
          // re-examine the char that moved into position i
          i--;
          break;
        // DirghoI kar (U+09C0) -> RosshoI kar (U+09BF)
        case '\u09C0':
          s[i] = '\u09BF';
          break;
        // DirghoU kar (U+09C2) -> RosshoU kar (U+09C1)
        case '\u09C2':
          s[i] = '\u09C1';
          break;
        // Khio: Ka (U+0995) + Hoshonto (U+09CD) + Murdorno Sh (U+09B7)
        case '\u0995':
          if (i + 2 < len && s[i + 1] == '\u09CD' && s[i + 2] == '\u09B7') {
            if (i == 0) {
              // at the start of the buffer the conjunct collapses to a single Kha (U+0996)
              s[i] = '\u0996';
              len = delete(s, i + 2, len);
              len = delete(s, i + 1, len);
            } else {
              // elsewhere Ka is kept and Hoshonto + Murdorno Sh become Kha
              s[i + 1] = '\u0996';
              len = delete(s, i + 2, len);
            }
          }
          break;
        // Nga (U+0999) to Anusvara (U+0982)
        case '\u0999':
          s[i] = '\u0982';
          break;
        // Ja Phala: Ya (U+09AF) preceded by Hoshonto (U+09CD)
        case '\u09AF':
          if (i - 2 == 0 && s[i - 1] == '\u09CD') {
            // Ya at index 2: Hoshonto becomes E kar (U+09C7), a following
            // Aa kar (U+09BE) is dropped, and Ya itself is dropped
            s[i - 1] = '\u09C7';
            if (i + 1 < len && s[i + 1] == '\u09BE') {
              len = delete(s, i + 1, len);
            }
            len = delete(s, i, len);
            i--;
          } else if (i - 1 >= 0 && s[i - 1] == '\u09CD') {
            // anywhere else both Hoshonto and Ya are dropped
            len = delete(s, i, len);
            len = delete(s, i - 1, len);
            i -= 2;
          }
          break;
        // Ba Phalaa: Ba (U+09AC) preceded by Hoshonto (U+09CD)
        case '\u09AC':
          // a Ba that does not follow Hoshonto is left alone
          if ((i >= 1 && s[i - 1] != '\u09CD') || i == 0)
            break;
          if (i - 2 == 0) {
            // Ba at index 2: drop Hoshonto and Ba
            len = delete(s, i, len);
            len = delete(s, i - 1, len);
            i -= 2;
          } else if (i - 5 >= 0 && s[i - 3] == '\u09CD') {
            // another Hoshonto three positions back: drop Hoshonto and Ba
            len = delete(s, i, len);
            len = delete(s, i - 1, len);
            i -= 2;
          } else if (i - 2 >= 0) {
            // otherwise overwrite Hoshonto with the char before it and drop Ba
            s[i - 1] = s[i - 2];
            len = delete(s, i, len);
            i--;
          }
          break;
        // Visarga (U+0983)
        case '\u0983':
          if (i == len - 1) {
            if (len <= 3) {
              // final Visarga in a buffer of at most 3 chars becomes Ha (U+09B9)
              s[i] = '\u09B9';
            } else {
              // final Visarga in a longer buffer is dropped
              len = delete(s, i, len);
            }
          } else {
            // non-final Visarga is replaced by a copy of the following char
            s[i] = s[i + 1];
          }
          break;
        // All sh: Sha (U+09B6) and Murdorno Sh (U+09B7) -> Sa (U+09B8)
        case '\u09B6':
        case '\u09B7':
          s[i] = '\u09B8';
          break;
        // Nna (U+09A3) -> Na (U+09A8)
        case '\u09A3':
          s[i] = '\u09A8';
          break;
        // Rra (U+09DC) and Rha (U+09DD) -> Ra (U+09B0)
        case '\u09DC':
        case '\u09DD':
          s[i] = '\u09B0';
          break;
        // Khanda Ta (U+09CE) -> Ta (U+09A4)
        case '\u09CE':
          s[i] = '\u09A4';
          break;
        default:
          break;
      }
    }
    return len;
  }
 }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliStemFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliStemFilter.java
@ -0,0 +1,49 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.analysis.bn;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 import java.io.IOException;
 /**
 * A {@link TokenFilter} that stems Bengali words with {@link BengaliStemmer}.
 * <p>
 * Tokens whose {@link KeywordAttribute} is set are passed through without stemming.
 * </p>
 */
 public final class BengaliStemFilter extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
  private final BengaliStemmer stemmer = new BengaliStemmer();
  /**
   * Creates a filter that stems the terms produced by {@code input}.
   */
  public BengaliStemFilter(TokenStream input) {
    super(input);
  }
  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    if (keywordAtt.isKeyword()) {
      // keyword tokens keep their full length
      return true;
    }
    // stemming only shortens the term: the stemmer returns the new length
    final int stemmedLength = stemmer.stem(termAtt.buffer(), termAtt.length());
    termAtt.setLength(stemmedLength);
    return true;
  }
 }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliStemFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliStemFilterFactory.java
@ -0,0 +1,48 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.analysis.bn;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.util.TokenFilterFactory;
 import java.util.Map;
 /**
 * Factory for {@link BengaliStemFilter}.
 * <pre class="prettyprint">
 * &lt;fieldType name="text_bnstem" class="solr.TextField" positionIncrementGap="100"&gt;
 *   &lt;analyzer&gt;
 *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
 *     &lt;filter class="solr.BengaliStemFilterFactory"/&gt;
 *   &lt;/analyzer&gt;
 * &lt;/fieldType&gt;</pre>
 */
 public class BengaliStemFilterFactory extends TokenFilterFactory {
  /**
   * Creates a new BengaliStemFilterFactory.
   *
   * @param args factory arguments; any entry still present after the superclass
   *        constructor has run is reported as unknown
   * @throws IllegalArgumentException if {@code args} is not empty after the superclass
   *         constructor has run
   */
  public BengaliStemFilterFactory(Map<String,String> args) {
    super(args);
    final boolean hasLeftoverArgs = !args.isEmpty();
    if (hasLeftoverArgs) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }
  /** Wraps {@code input} in a new {@link BengaliStemFilter}. */
  @Override
  public TokenStream create(TokenStream input) {
    final TokenStream stemmed = new BengaliStemFilter(input);
    return stemmed;
  }
 }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliStemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliStemmer.java
@ -0,0 +1,183 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.analysis.bn;
 import static org.apache.lucene.analysis.util.StemmerUtil.endsWith;
 /**
 * Stemmer for Bengali.
 * <p>
 * The algorithm is based on the report in:
 * <i>Natural Language Processing in an Indian Language (Bengali)-I: Verb Phrase Analysis</i>
 * P Sengupta and B B Chaudhuri
 * </p>
 *
 * <p>
 *   Few Stemmer criteria are taken from:
 *   <i>http://members.unine.ch/jacques.savoy/clef/BengaliStemmerLight.java.txt</i>
 * </p>
 */
 public class BengaliStemmer {
  public int stem(char buffer[], int len) {
    // 8
    if (len > 9 && (endsWith(buffer, len, "িয়াছিলাম")
        || endsWith(buffer, len, "িতেছিলাম")
        || endsWith(buffer, len, "িতেছিলেন")
        || endsWith(buffer, len, "ইতেছিলেন")
        || endsWith(buffer, len, "িয়াছিলেন")
        || endsWith(buffer, len, "ইয়াছিলেন")
    ))
      return len - 8;
    // 7
    if ((len > 8) && (endsWith(buffer, len, "িতেছিলি")
        || endsWith(buffer, len, "িতেছিলে")
        || endsWith(buffer, len, "িয়াছিলা")
        || endsWith(buffer, len, "িয়াছিলে")
        || endsWith(buffer, len, "িতেছিলা")
        || endsWith(buffer, len, "িয়াছিলি")
        || endsWith(buffer, len, "য়েদেরকে")
    ))
      return len - 7;
    // 6
    if ((len > 7) && (endsWith(buffer, len, "িতেছিস")
        || endsWith(buffer, len, "িতেছেন")
        || endsWith(buffer, len, "িয়াছিস")
        || endsWith(buffer, len, "িয়াছেন")
        || endsWith(buffer, len, "েছিলাম")
        || endsWith(buffer, len, "েছিলেন")
        || endsWith(buffer, len, "েদেরকে")
    ))
      return len - 6;
    // 5
    if ((len > 6) && (endsWith(buffer, len, "িতেছি")
        || endsWith(buffer, len, "িতেছা")
        || endsWith(buffer, len, "িতেছে")
        || endsWith(buffer, len, "ছিলাম")
        || endsWith(buffer, len, "ছিলেন")
        || endsWith(buffer, len, "িয়াছি")
        || endsWith(buffer, len, "িয়াছা")
        || endsWith(buffer, len, "িয়াছে")
        || endsWith(buffer, len, "েছিলে")
        || endsWith(buffer, len, "েছিলা")
        || endsWith(buffer, len, "য়েদের")
        || endsWith(buffer, len, "দেরকে")
    ))
      return len - 5;
    // 4
    if ((len > 5) && (endsWith(buffer, len, "িলাম")
        || endsWith(buffer, len, "িলেন")
        || endsWith(buffer, len, "িতাম")
        || endsWith(buffer, len, "িতেন")
        || endsWith(buffer, len, "িবেন")
        || endsWith(buffer, len, "ছিলি")
        || endsWith(buffer, len, "ছিলে")
        || endsWith(buffer, len, "ছিলা")
        || endsWith(buffer, len, "তেছে")
        || endsWith(buffer, len, "িতেছ")
        || endsWith(buffer, len, "খানা")
        || endsWith(buffer, len, "খানি")
        || endsWith(buffer, len, "গুলো")
        || endsWith(buffer, len, "গুলি")
        || endsWith(buffer, len, "য়েরা")
        || endsWith(buffer, len, "েদের")
    ))
      return len - 4;
    // 3
    if ((len > 4) && (endsWith(buffer, len, "লাম")
        || endsWith(buffer, len, "িলি")
        || endsWith(buffer, len, "ইলি")
        || endsWith(buffer, len, "িলে")
        || endsWith(buffer, len, "ইলে")
        || endsWith(buffer, len, "লেন")
        || endsWith(buffer, len, "িলা")
        || endsWith(buffer, len, "ইলা")
        || endsWith(buffer, len, "তাম")
        || endsWith(buffer, len, "িতি")
        || endsWith(buffer, len, "ইতি")
        || endsWith(buffer, len, "িতে")
        || endsWith(buffer, len, "ইতে")
        || endsWith(buffer, len, "তেন")
        || endsWith(buffer, len, "িতা")
        || endsWith(buffer, len, "িবা")
        || endsWith(buffer, len, "ইবা")
        || endsWith(buffer, len, "িবি")
        || endsWith(buffer, len, "ইবি")
        || endsWith(buffer, len, "বেন")
        || endsWith(buffer, len, "িবে")
        || endsWith(buffer, len, "ইবে")
        || endsWith(buffer, len, "ছেন")
        || endsWith(buffer, len, "য়োন")
        || endsWith(buffer, len, "য়ের")
        || endsWith(buffer, len, "েরা")
        || endsWith(buffer, len, "দের")
    ))
      return len - 3;
    // 2
    if ((len > 3) && (endsWith(buffer, len, "িস")
        || endsWith(buffer, len, "েন")
        || endsWith(buffer, len, "লি")
        || endsWith(buffer, len, "লে")
        || endsWith(buffer, len, "লা")
        || endsWith(buffer, len, "তি")
        || endsWith(buffer, len, "তে")
        || endsWith(buffer, len, "তা")
        || endsWith(buffer, len, "বি")
        || endsWith(buffer, len, "বে")
        || endsWith(buffer, len, "বা")
        || endsWith(buffer, len, "ছি")
        || endsWith(buffer, len, "ছা")
        || endsWith(buffer, len, "ছে")
        || endsWith(buffer, len, "ুন")
        || endsWith(buffer, len, "ুক")
        || endsWith(buffer, len, "টা")
        || endsWith(buffer, len, "টি")
        || endsWith(buffer, len, "নি")
        || endsWith(buffer, len, "ের")
        || endsWith(buffer, len, "তে")
        || endsWith(buffer, len, "রা")
        || endsWith(buffer, len, "কে")
    ))
      return len - 2;
    // 1
    if ((len > 2) && (endsWith(buffer, len, "ি")
        || endsWith(buffer, len, "ী")
        || endsWith(buffer, len, "া")
        || endsWith(buffer, len, "ো")
        || endsWith(buffer, len, "ে")
        || endsWith(buffer, len, "ব")
        || endsWith(buffer, len, "ত")
    ))
      return len - 1;
    return len;
  }
 }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/package-info.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/package-info.java
@ -0,0 +1,21 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /**
 * Analyzer for Bengali Language.
 */
 package org.apache.lucene.analysis.bn;
--- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
+++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
@ -17,6 +17,8 @@ org.apache.lucene.analysis.tr.ApostropheFilterFactory
 org.apache.lucene.analysis.ar.ArabicNormalizationFilterFactory
 org.apache.lucene.analysis.ar.ArabicStemFilterFactory
 org.apache.lucene.analysis.bg.BulgarianStemFilterFactory
 org.apache.lucene.analysis.bn.BengaliNormalizationFilterFactory
 org.apache.lucene.analysis.bn.BengaliStemFilterFactory
 org.apache.lucene.analysis.br.BrazilianStemFilterFactory
 org.apache.lucene.analysis.cjk.CJKBigramFilterFactory
 org.apache.lucene.analysis.cjk.CJKWidthFilterFactory
--- a/lucene/analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
+++ b/lucene/analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
@ -0,0 +1,121 @@
 # See http://members.unine.ch/jacques.savoy/clef/index.html.
 # This file was created by Jacques Savoy and is distributed under the BSD license
 এই
 ও
 থেকে
 করে
 এ
 না
 ওই
 এক্
 নিয়ে
 করা
 বলেন
 সঙ্গে
 যে
 এব
 তা
 আর
 কোনো
 বলে
 সেই
 দিন
 হয়
 কি
 দু
 পরে
 সব
 দেওয়া
 মধ্যে
 এর
 সি
 শুরু
 কাজ
 কিছু
 কাছে
 সে
 তবে
 বা
 বন
 আগে
 জ্নজন
 পি
 পর
 তো
 ছিল
 এখন
 আমরা
 প্রায়
 দুই
 আমাদের
 তাই
 অন্য
 গিয়ে
 প্রযন্ত
 মনে
 নতুন
 মতো
 কেখা
 প্রথম
 আজ
 টি
 ধামার
 অনেক
 বিভিন্ন
 র
 হাজার
 জানা
 নয়
 অবশ্য
 বেশি
 এস
 করে
 কে
 হতে
 বি
 কয়েক
 সহ
 বেশ
 এমন
 এমনি
 কেন
 কেউ
 নেওয়া
 চেষ্টা
 লক্ষ
 বলা
 কারণ
 আছে
 শুধু
 তখন
 যা
 এসে
 চার
 ছিল
 যদি
 আবার
 কোটি
 উত্তর
 সামনে
 উপর
 বক্তব্য
 এত
 প্রাথমিক
 উপরে
 আছে
 প্রতি
 কাজে
 যখন
 খুব
 বহু
 গেল
 পেয়্র্
 চালু
 ই
 নাগাদ
 থাকা
 পাচ
 যাওয়া
 রকম
 সাধারণ
 কমনে
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliAnalyzer.java
@ -0,0 +1,53 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.analysis.bn;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 /**
 * Tests the BengaliAnalyzer
 */
 public class TestBengaliAnalyzer extends BaseTokenStreamTestCase {
  /**
   * The no-arg constructor uses the default stop word set, so this fails
   * if the bundled stop word resource cannot be loaded.
   */
  public void testResourcesAvailable() {
    new BengaliAnalyzer().close();
  }
  /**
   * Two spellings must be reduced to the same term by the full chain.
   */
  public void testBasics() throws Exception {
    // try-with-resources closes the analyzer even when an assertion fails
    try (Analyzer a = new BengaliAnalyzer()) {
      checkOneTerm(a, "বাড়ী", "বার");
      checkOneTerm(a, "বারী", "বার");
    }
  }
  /**
   * test Digits
   */
  public void testDigits() throws Exception {
    try (BengaliAnalyzer a = new BengaliAnalyzer()) {
      checkOneTerm(a, "১২৩৪৫৬৭৮৯০", "1234567890");
    }
  }
  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    try (Analyzer analyzer = new BengaliAnalyzer()) {
      checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
    }
  }
 }
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliFilters.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliFilters.java
@ -0,0 +1,80 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.analysis.bn;
 import java.io.Reader;
 import java.io.StringReader;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
 /**
 * Exercises the token filter factories used for Bengali analysis by looking
 * them up by name and running small inputs through the resulting chains.
 */
 public class TestBengaliFilters extends BaseTokenStreamFactoryTestCase {
  /**
   * IndicNormalizationFilterFactory: the first token is rewritten to a single
   * letter, the second token passes through unchanged.
   */
  public void testIndicNormalizer() throws Exception {
    TokenStream normalized = analyze("ত্‍ আমি", "IndicNormalization");
    assertTokenStreamContents(normalized, new String[] { "ৎ", "আমি" });
  }
  /**
   * BengaliNormalizationFilterFactory, applied after Indic normalization.
   */
  public void testBengaliNormalizer() throws Exception {
    TokenStream normalized = analyze("বাড়ী", "IndicNormalization", "BengaliNormalization");
    assertTokenStreamContents(normalized, new String[] {"বারি"});
  }
  /**
   * BengaliStemFilterFactory, applied after both normalization steps.
   */
  public void testStemmer() throws Exception {
    TokenStream stemmed = analyze("বাড়ী", "IndicNormalization", "BengaliNormalization", "BengaliStem");
    assertTokenStreamContents(stemmed, new String[] {"বার"});
  }
  /** Every factory must reject an argument it does not know about. */
  public void testBogusArguments() throws Exception {
    assertRejectsBogusArgument("IndicNormalization");
    assertRejectsBogusArgument("BengaliNormalization");
    assertRejectsBogusArgument("BengaliStem");
  }
  /**
   * Splits {@code text} with the whitespace mock tokenizer and wraps the result
   * with the named filter factories, applied in the order given.
   */
  private TokenStream analyze(String text, String... factoryNames) throws Exception {
    Reader input = new StringReader(text);
    TokenStream chain = whitespaceMockTokenizer(input);
    for (String factoryName : factoryNames) {
      chain = tokenFilterFactory(factoryName).create(chain);
    }
    return chain;
  }
  /**
   * Asserts that creating the named factory with an unknown key/value pair
   * fails with an IllegalArgumentException mentioning "Unknown parameters".
   */
  private void assertRejectsBogusArgument(String factoryName) {
    IllegalArgumentException failure = expectThrows(IllegalArgumentException.class, () -> {
      tokenFilterFactory(factoryName, "bogusArg", "bogusValue");
    });
    assertTrue(failure.getMessage().contains("Unknown parameters"));
  }
 }
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliNormalizer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliNormalizer.java
@ -0,0 +1,110 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.analysis.bn;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.util.TestUtil;
 import java.io.IOException;
 /**
 * Test BengaliNormalizer, mostly through BengaliNormalizationFilter: each case
 * feeds one word through the filter and compares it with the expected
 * normalized form.
 */
 public class TestBengaliNormalizer extends BaseTokenStreamTestCase {
  /**
   * The expected output is the input word without its nasalization mark.
   */
  public void testChndrobindu() throws IOException {
    check("চাঁদ", "চাদ");
  }
  /** Words written with the long "i" vowel sign come out with the short one. */
  public void testRosshoIKar() throws IOException {
    check("বাড়ী", "বারি");
    check("তীর", "তির");
  }
  /** Words written with the long "u" vowel sign come out with the short one. */
  public void testRosshoUKar() throws IOException {
    check("ভূল", "ভুল");
    check("অনূপ", "অনুপ");
  }
  /** The nga letter is expected to be replaced by anusvara. */
  public void testNga() throws IOException {
    check("বাঙলা", "বাংলা");
  }
  /** Conjuncts written with ya-phala. */
  public void testJaPhaala() throws IOException {
    check("ব্যাক্তি", "বেক্তি");
    check("সন্ধ্যা", "সন্ধা");
  }
  /** Conjuncts written with ba-phala. */
  public void testBaPhalaa() throws IOException {
    check("স্বদেশ", "সদেস");
    check("তত্ত্ব", "তত্ত");
    check("বিশ্ব", "বিসস");
  }
  /** Words containing visarga, in the middle and at the end of a word. */
  public void testVisarga() throws IOException {
    check("দুঃখ", "দুখখ");
    check("উঃ", "উহ");
    check("পুনঃ", "পুন");
  }
  /** A few everyday words combining several of the rules. */
  public void testBasics() throws IOException {
    check("কণা", "কনা");
    check("শরীর", "সরির");
    check("বাড়ি", "বারি");
  }
  /** creates random strings in the bengali block and ensures the normalizer doesn't trip up on them */
  public void testRandom() throws IOException {
    BengaliNormalizer normalizer = new BengaliNormalizer();
    for (int i = 0; i < 100000; i++) {
      String randomBengali = TestUtil.randomSimpleStringRange(random(), '\u0980', '\u09FF', 7);
      try {
        int newLen = normalizer.normalize(randomBengali.toCharArray(), randomBengali.length());
        assertTrue(newLen >= 0); // should not return negative length
        assertTrue(newLen <= randomBengali.length()); // should not increase length of string
      } catch (Exception | AssertionError e) {
        // AssertionError is caught as well so the offending input is reported
        // when one of the length checks fails, not only when normalize() throws
        System.err.println("normalizer failed on input: '" + randomBengali + "' (" + escape(randomBengali) + ")");
        throw e;
      }
    }
  }
  /**
   * Runs {@code input} through the whitespace mock tokenizer and the
   * normalization filter and asserts that exactly one token, {@code output},
   * is produced.
   */
  private void check(String input, String output) throws IOException {
    Tokenizer tokenizer = whitespaceMockTokenizer(input);
    TokenFilter tf = new BengaliNormalizationFilter(tokenizer);
    assertTokenStreamContents(tf, new String[] { output });
  }
  /** The filter must pass an empty term through unchanged. */
  public void testEmptyTerm() throws IOException {
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new KeywordTokenizer();
        return new TokenStreamComponents(tokenizer, new BengaliNormalizationFilter(tokenizer));
      }
    };
    try {
      checkOneTerm(a, "", "");
    } finally {
      // release the analyzer even when the assertion above fails
      a.close();
    }
  }
 }
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliStemmer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliStemmer.java
@ -0,0 +1,79 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.analysis.bn;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
 import java.io.IOException;
 /**
 * Tests for BengaliStemFilter: each case is an (inflected word, expected stem)
 * pair that is pushed through the filter on its own.
 */
 public class TestBengaliStemmer extends BaseTokenStreamTestCase {
  /**
   * Verb forms in the "shadhu" register.
   */
  public void testVerbsInShadhuForm() throws IOException {
    checkAll(new String[][] {
        {"করেছিলাম", "কর"},
        {"করিতেছিলে", "কর"},
        {"খাইতাম", "খাই"},
        {"যাইবে", "যা"},
    });
  }
  /**
   * Verb forms in the "cholito" register.
   */
  public void testVerbsInCholitoForm() throws IOException {
    checkAll(new String[][] {
        {"করছিলাম", "কর"},
        {"করছিলে", "কর"},
        {"করতাম", "কর"},
        {"যাব", "যা"},
        {"যাবে", "যা"},
        {"করি", "কর"},
        {"করো", "কর"},
    });
  }
  /**
   * Nouns carrying various suffixes.
   */
  public void testNouns() throws IOException {
    checkAll(new String[][] {
        {"মেয়েরা", "মে"},
        {"মেয়েদেরকে", "মে"},
        {"মেয়েদের", "মে"},
        {"একটি", "এক"},
        {"মানুষগুলি", "মানুষ"},
    });
  }
  /** Runs {@link #check} on every {input, expected} pair, in array order. */
  private void checkAll(String[][] cases) throws IOException {
    for (String[] pair : cases) {
      check(pair[0], pair[1]);
    }
  }
  /**
   * Stems {@code input} and asserts that the filter emits exactly one token
   * equal to {@code output}.
   */
  private void check(String input, String output) throws IOException {
    TokenFilter stemmed = new BengaliStemFilter(whitespaceMockTokenizer(input));
    assertTokenStreamContents(stemmed, new String[] { output });
  }
  /** An empty term must come out of the stem filter as an empty term. */
  public void testEmptyTerm() throws IOException {
    Analyzer keywordAnalyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer source = new KeywordTokenizer();
        BengaliStemFilter sink = new BengaliStemFilter(source);
        return new TokenStreamComponents(source, sink);
      }
    };
    checkOneTerm(keywordAnalyzer, "", "");
    keywordAnalyzer.close();
  }
 }