diff --git a/lucene/NOTICE.txt b/lucene/NOTICE.txt index 1903adc743d..7e0c54e2995 100644 --- a/lucene/NOTICE.txt +++ b/lucene/NOTICE.txt @@ -54,13 +54,14 @@ The KStem stemmer in was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst) under the BSD-license. -The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default +The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default stopword list that is BSD-licensed created by Jacques Savoy. These files reside in: analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt, analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt, analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt, analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt, -analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt +analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt See http://members.unine.ch/jacques.savoy/clef/index.html. The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliAnalyzer.java new file mode 100644 index 00000000000..912c4dd125c --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliAnalyzer.java @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.bn; + + +import org.apache.lucene.analysis.*; +import org.apache.lucene.analysis.core.DecimalDigitFilter; +import org.apache.lucene.analysis.in.IndicNormalizationFilter; +import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; +import org.apache.lucene.analysis.standard.StandardFilter; +import org.apache.lucene.analysis.standard.StandardTokenizer; + +import java.io.IOException; +import java.io.Reader; + +/** + * Analyzer for Bengali. + */ +public final class BengaliAnalyzer extends StopwordAnalyzerBase { + private final CharArraySet stemExclusionSet; + + /** + * File containing default Bengali stopwords. + * + * Default stopword list is from http://members.unine.ch/jacques.savoy/clef/bengaliST.txt + * The stopword list is BSD-Licensed. + */ + public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt"; + private static final String STOPWORDS_COMMENT = "#"; + + /** + * Returns an unmodifiable instance of the default stop-words set. + * @return an unmodifiable instance of the default stop-words set. 
+   */
+  public static CharArraySet getDefaultStopSet(){
+    return DefaultSetHolder.DEFAULT_STOP_SET;
+  }
+
+  /**
+   * Holder idiom: DEFAULT_STOP_SET is loaded by this nested class's static initializer,
+   * i.e. only once something first touches DefaultSetHolder.
+   */
+  private static class DefaultSetHolder {
+    static final CharArraySet DEFAULT_STOP_SET;
+
+    static {
+      try {
+        DEFAULT_STOP_SET = loadStopwordSet(false, BengaliAnalyzer.class, DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
+      } catch (IOException ex) {
+        throw new RuntimeException("Unable to load default stopword set", ex); // keep the cause for diagnosis
+      }
+    }
+  }
+
+  /**
+   * Builds an analyzer with the given stop words and stem exclusion set.
+   *
+   * @param stopwords a stopword set
+   * @param stemExclusionSet terms that must not be stemmed; an unmodifiable copy is stored
+   */
+  public BengaliAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
+    super(stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
+  }
+
+  /**
+   * Builds an analyzer with the given stop words and an empty stem exclusion set.
+   *
+   * @param stopwords a stopword set
+   */
+  public BengaliAnalyzer(CharArraySet stopwords) {
+    this(stopwords, CharArraySet.EMPTY_SET);
+  }
+
+  /**
+   * Builds an analyzer with the default stop words:
+   * {@link #DEFAULT_STOPWORD_FILE}.
+   */
+  public BengaliAnalyzer() {
+    this(DefaultSetHolder.DEFAULT_STOP_SET);
+  }
+
+  /**
+   * Creates
+   * {@link TokenStreamComponents}
+   * used to tokenize all the text in the provided {@link Reader}.
+   *
+   * @return {@link TokenStreamComponents}
+   *         built from a {@link StandardTokenizer} filtered, in this order, with
+   *         {@link LowerCaseFilter}, {@link DecimalDigitFilter}, {@link SetKeywordMarkerFilter}
+   *         (only if the stem exclusion set is non-empty), {@link IndicNormalizationFilter},
+   *         {@link BengaliNormalizationFilter}, {@link StopFilter} using the configured
+   *         stop words, and {@link BengaliStemFilter}
+   */
+  @Override
+  protected TokenStreamComponents createComponents(String fieldName) {
+    final Tokenizer source = new StandardTokenizer();
+    TokenStream result = new LowerCaseFilter(source);
+    result = new DecimalDigitFilter(result);
+    if (!stemExclusionSet.isEmpty()) // mark excluded terms as keywords before the normalizers and the stemmer run
+      result = new SetKeywordMarkerFilter(result, stemExclusionSet);
+    result = new IndicNormalizationFilter(result);
+    result = new BengaliNormalizationFilter(result);
+    result = new StopFilter(result, stopwords); // applied to normalized, not yet stemmed, terms
+    result = new BengaliStemFilter(result);
+    return new TokenStreamComponents(source, result);
+  }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    TokenStream result = new StandardFilter(in); // character-level filters only: no stop-word removal, no stemming
+    result = new LowerCaseFilter(result);
+    result = new DecimalDigitFilter(result);
+    result = new IndicNormalizationFilter(result);
+    result = new BengaliNormalizationFilter(result);
+    return result;
+  }
+}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizationFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizationFilter.java
new file mode 100644
index 00000000000..46874b5b588
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizationFilter.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.bn; + + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; + +import java.io.IOException; + +/** + * A {@link TokenFilter} that applies {@link BengaliNormalizer} to normalize the + * orthography. + *

+ * In some cases the normalization may cause unrelated terms to conflate, so + * to prevent terms from being normalized use an instance of + * {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets + * the {@link KeywordAttribute} before this {@link TokenStream}. + *

+ * @see BengaliNormalizer
+ */
+public final class BengaliNormalizationFilter extends TokenFilter {
+
+  private final BengaliNormalizer normalizer = new BengaliNormalizer();
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
+
+  public BengaliNormalizationFilter(TokenStream input) { // wraps the stream whose terms get normalized
+    super(input);
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) {
+      return false; // wrapped stream has no further token
+    }
+    if (!keywordAtt.isKeyword()) { // keyword-marked terms pass through unchanged
+      termAtt.setLength(normalizer.normalize(termAtt.buffer(), termAtt.length()));
+    }
+    return true;
+  }
+}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizationFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizationFilterFactory.java
new file mode 100644
index 00000000000..43618d6dbb3
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizationFilterFactory.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +package org.apache.lucene.analysis.bn; + + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.util.AbstractAnalysisFactory; +import org.apache.lucene.analysis.util.MultiTermAwareComponent; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +import java.util.Map; + +/** + * Factory for {@link BengaliNormalizationFilter}. + *
+ * <pre class="prettyprint">&lt;fieldType name="text_bnnormal" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.BengaliNormalizationFilterFactory"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ */
+public class BengaliNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
+  /** Creates a new BengaliNormalizationFilterFactory; it accepts no filter-specific arguments. */
+  public BengaliNormalizationFilterFactory(Map<String, String> args) {
+    super(args);
+    if (!args.isEmpty()) { // anything still present after the superclass constructor ran is rejected
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+
+  @Override
+  public TokenStream create(TokenStream input) {
+    return new BengaliNormalizationFilter(input);
+  }
+
+  @Override
+  public AbstractAnalysisFactory getMultiTermComponent() {
+    return this; // the same factory is reused as the multi-term component
+  }
+}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizer.java
new file mode 100644
index 00000000000..b416d1a365c
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizer.java
@@ -0,0 +1,155 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.bn;
+
+
+import static org.apache.lucene.analysis.util.StemmerUtil.delete;
+
+/**
+ * Normalizer for Bengali.
+ *

+ * Implements the Bengali-language specific algorithm specified in: + * A Double Metaphone encoding for Bangla and its application in spelling checker + * Naushad UzZaman and Mumit Khan. + * http://www.panl10n.net/english/final%20reports/pdf%20files/Bangladesh/BAN16.pdf + *

+ */
+public class BengaliNormalizer {
+  /**
+   * Normalizes the first {@code len} chars of {@code s} in place (chars are rewritten or deleted).
+   *
+   * @param s input buffer; modified in place
+   * @param len number of valid chars in {@code s}
+   * @return number of valid chars left in {@code s} after normalization
+   */
+  public int normalize(char s[], int len) {
+
+    for (int i = 0; i < len; i++) {
+      switch (s[i]) {
+        // delete Chandrabindu
+        case '\u0981':
+          len = delete(s, i, len);
+          i--; // the next char moved into slot i; visit it on the next iteration
+          break;
+
+        // DirghoI kar -> RosshoI kar
+        case '\u09C0':
+          s[i] = '\u09BF';
+          break;
+
+        // DirghoU kar -> RosshoU kar
+        case '\u09C2':
+          s[i] = '\u09C1';
+          break;
+
+        // Khio (Ka + Hoshonto + Murdhonno Sh, U+09B7): Kha at word start, Ka + Kha elsewhere
+        case '\u0995':
+          if(i + 2 < len && s[i+1] == '\u09CD' && s[i+2] == '\u09B7') { // was U+09BF (i-kar), which never follows a Hoshonto
+            if (i == 0) {
+              s[i] = '\u0996';
+              len = delete(s, i + 2, len);
+              len = delete(s, i + 1, len);
+            } else {
+              s[i+1] = '\u0996';
+              len = delete(s, i + 2, len);
+            }
+          }
+          break;
+
+        // Nga to Anusvara
+        case '\u0999':
+          s[i] = '\u0982';
+          break;
+
+        // Ja Phala (Hoshonto + Ya); a Ya that is not preceded by Hoshonto is left alone
+        case '\u09AF':
+          if(i - 2 == 0 && s[i-1] == '\u09CD') {
+            s[i - 1] = '\u09C7';
+
+            if(i + 1 < len && s[i+1] == '\u09BE') { // bound check: never inspect chars at or past len
+              len = delete(s, i+1, len);
+            }
+            len = delete(s, i, len);
+            i --;
+          } else if(i - 1 >= 0 && s[i-1] == '\u09CD') { // only a real Ja Phala is dropped; also guards i == 0
+            len = delete(s, i, len);
+            len = delete(s, i-1, len);
+            i -=2;
+          }
+          break;
+
+        // Ba Phalaa (Hoshonto + Ba)
+        case '\u09AC':
+          if((i >= 1 && s[i-1] != '\u09CD') || i == 0)
+            break;
+          if(i - 2 == 0) {
+            len = delete(s, i, len);
+            len = delete(s, i - 1, len);
+            i -= 2;
+          } else if(i - 5 >= 0 && s[i - 3] == '\u09CD') {
+            len = delete(s, i, len);
+            len = delete(s, i-1, len);
+            i -=2;
+          } else if(i - 2 >= 0) { // at i == 1 there is no char before the Hoshonto to double
+            s[i - 1] = s[i - 2];
+            len = delete(s, i, len);
+            i --;
+          }
+          break;
+
+        // Visarga
+        case '\u0983':
+          if(i == len -1) {
+            if(len <= 3) {
+              s[i] = '\u09B9';
+            } else {
+              len = delete(s, i, len);
+            }
+          } else {
+            s[i] = s[i+1]; // safe: i < len - 1 in this branch
+          }
+          break;
+
+        // Talobbo Sh (U+09B6) and Murdhonno Sh (U+09B7) -> Donto S
+        case '\u09B6':
+        case '\u09B7':
+          s[i] = '\u09B8';
+          break;
+
+        // Murdhonno N (U+09A3) -> Donto N
+        case '\u09A3':
+          s[i] = '\u09A8';
+          break;
+
+        // Rra (U+09DC) and Rha (U+09DD) -> Ra
+        case '\u09DC':
+          case '\u09DD':
+          s[i] = '\u09B0';
+          break;
+
+        case '\u09CE': // Khanda Ta -> Ta
+          s[i] = '\u09A4';
+          break;
+
+        default:
+          break;
+      }
+    }
+
+    return len;
+  }
+}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliStemFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliStemFilter.java
new file mode 100644
index 00000000000..97870272136
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliStemFilter.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.bn;
+
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+import java.io.IOException;
+
+/**
+ * A {@link TokenFilter} that applies {@link BengaliStemmer} to stem Bengali words.
+ */
+public final class BengaliStemFilter extends TokenFilter {
+  private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
+  private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class);
+  private final BengaliStemmer bengaliStemmer = new BengaliStemmer();
+
+  public BengaliStemFilter(TokenStream input) { // wraps the stream whose terms get stemmed
+    super(input);
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) {
+      return false; // wrapped stream has no further token
+    }
+    if (!keywordAttribute.isKeyword()) { // keyword-marked terms are never stemmed
+      termAttribute.setLength(bengaliStemmer.stem(termAttribute.buffer(), termAttribute.length()));
+    }
+    return true;
+  }
+}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliStemFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliStemFilterFactory.java
new file mode 100644
index 00000000000..b082d9e5b77
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliStemFilterFactory.java
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +package org.apache.lucene.analysis.bn; + + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +import java.util.Map; + +/** + * Factory for {@link BengaliStemFilter}. + *
+ * <pre class="prettyprint">&lt;fieldType name="text_bnstem" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.BengaliStemFilterFactory"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ */
+public class BengaliStemFilterFactory extends TokenFilterFactory {
+  /** Creates a new BengaliStemFilterFactory; it accepts no filter-specific arguments. */
+  public BengaliStemFilterFactory(Map<String, String> args) {
+    super(args);
+    if (!args.isEmpty()) { // anything still present after the superclass constructor ran is rejected
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+
+  @Override
+  public TokenStream create(TokenStream input) {
+    return new BengaliStemFilter(input);
+  }
+}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliStemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliStemmer.java
new file mode 100644
index 00000000000..8bc555a440d
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliStemmer.java
@@ -0,0 +1,183 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.bn;
+
+
+import static org.apache.lucene.analysis.util.StemmerUtil.endsWith;
+
+/**
+ * Stemmer for Bengali.
+ *

+ * The algorithm is based on the report in: + * Natural Language Processing in an Indian Language (Bengali)-I: Verb Phrase Analysis + * P Sengupta and B B Chaudhuri + *

+ * + *

+ * Few Stemmer criteria are taken from: + * http://members.unine.ch/jacques.savoy/clef/BengaliStemmerLight.java.txt + *

+ */
+public class BengaliStemmer {
+  public int stem(char buffer[], int len) { // returns the stemmed length; this method itself never writes to buffer
+
+    // groups are tried longest-first; the first matching group wins: strip 8 chars if len > 9
+    if (len > 9 && (endsWith(buffer, len, "িয়াছিলাম")
+        || endsWith(buffer, len, "িতেছিলাম")
+        || endsWith(buffer, len, "িতেছিলেন")
+        || endsWith(buffer, len, "ইতেছিলেন")
+        || endsWith(buffer, len, "িয়াছিলেন")
+        || endsWith(buffer, len, "ইয়াছিলেন")
+    ))
+      return len - 8;
+
+    // strip 7 chars if len > 8 and the term ends with one of these suffixes
+    if ((len > 8) && (endsWith(buffer, len, "িতেছিলি")
+        || endsWith(buffer, len, "িতেছিলে")
+        || endsWith(buffer, len, "িয়াছিলা")
+        || endsWith(buffer, len, "িয়াছিলে")
+        || endsWith(buffer, len, "িতেছিলা")
+        || endsWith(buffer, len, "িয়াছিলি")
+
+        || endsWith(buffer, len, "য়েদেরকে")
+    ))
+      return len - 7;
+
+    // strip 6 chars if len > 7 and the term ends with one of these suffixes
+    if ((len > 7) && (endsWith(buffer, len, "িতেছিস")
+        || endsWith(buffer, len, "িতেছেন")
+        || endsWith(buffer, len, "িয়াছিস")
+        || endsWith(buffer, len, "িয়াছেন")
+        || endsWith(buffer, len, "েছিলাম")
+        || endsWith(buffer, len, "েছিলেন")
+
+        || endsWith(buffer, len, "েদেরকে")
+    ))
+      return len - 6;
+
+    // strip 5 chars if len > 6 and the term ends with one of these suffixes
+    if ((len > 6) && (endsWith(buffer, len, "িতেছি")
+        || endsWith(buffer, len, "িতেছা")
+        || endsWith(buffer, len, "িতেছে")
+        || endsWith(buffer, len, "ছিলাম")
+        || endsWith(buffer, len, "ছিলেন")
+        || endsWith(buffer, len, "িয়াছি")
+        || endsWith(buffer, len, "িয়াছা")
+        || endsWith(buffer, len, "িয়াছে")
+        || endsWith(buffer, len, "েছিলে")
+        || endsWith(buffer, len, "েছিলা")
+
+        || endsWith(buffer, len, "য়েদের")
+        || endsWith(buffer, len, "দেরকে")
+    ))
+      return len - 5;
+
+    // strip 4 chars if len > 5 and the term ends with one of these suffixes
+    if ((len > 5) && (endsWith(buffer, len, "িলাম")
+        || endsWith(buffer, len, "িলেন")
+        || endsWith(buffer, len, "িতাম")
+        || endsWith(buffer, len, "িতেন")
+        || endsWith(buffer, len, "িবেন")
+        || endsWith(buffer, len, "ছিলি")
+        || endsWith(buffer, len, "ছিলে")
+        || endsWith(buffer, len, "ছিলা")
+        || endsWith(buffer, len, "তেছে")
+        || endsWith(buffer, len, "িতেছ")
+
+        || endsWith(buffer, len, "খানা")
+        || endsWith(buffer, len, "খানি")
+        || endsWith(buffer, len, "গুলো")
+        || endsWith(buffer, len, "গুলি")
+        || endsWith(buffer, len, "য়েরা")
+        || endsWith(buffer, len, "েদের")
+    ))
+      return len - 4;
+
+    // strip 3 chars if len > 4 and the term ends with one of these suffixes
+    if ((len > 4) && (endsWith(buffer, len, "লাম")
+        || endsWith(buffer, len, "িলি")
+        || endsWith(buffer, len, "ইলি")
+        || endsWith(buffer, len, "িলে")
+        || endsWith(buffer, len, "ইলে")
+        || endsWith(buffer, len, "লেন")
+        || endsWith(buffer, len, "িলা")
+        || endsWith(buffer, len, "ইলা")
+        || endsWith(buffer, len, "তাম")
+        || endsWith(buffer, len, "িতি")
+        || endsWith(buffer, len, "ইতি")
+        || endsWith(buffer, len, "িতে")
+        || endsWith(buffer, len, "ইতে")
+        || endsWith(buffer, len, "তেন")
+        || endsWith(buffer, len, "িতা")
+        || endsWith(buffer, len, "িবা")
+        || endsWith(buffer, len, "ইবা")
+        || endsWith(buffer, len, "িবি")
+        || endsWith(buffer, len, "ইবি")
+        || endsWith(buffer, len, "বেন")
+        || endsWith(buffer, len, "িবে")
+        || endsWith(buffer, len, "ইবে")
+        || endsWith(buffer, len, "ছেন")
+
+        || endsWith(buffer, len, "য়োন")
+        || endsWith(buffer, len, "য়ের")
+        || endsWith(buffer, len, "েরা")
+        || endsWith(buffer, len, "দের")
+    ))
+      return len - 3;
+
+    // strip 2 chars if len > 3 and the term ends with one of these suffixes
+    if ((len > 3) && (endsWith(buffer, len, "িস")
+        || endsWith(buffer, len, "েন")
+        || endsWith(buffer, len, "লি")
+        || endsWith(buffer, len, "লে")
+        || endsWith(buffer, len, "লা")
+        || endsWith(buffer, len, "তি")
+        || endsWith(buffer, len, "তে")
+        || endsWith(buffer, len, "তা")
+        || endsWith(buffer, len, "বি")
+        || endsWith(buffer, len, "বে")
+        || endsWith(buffer, len, "বা")
+        || endsWith(buffer, len, "ছি")
+        || endsWith(buffer, len, "ছা")
+        || endsWith(buffer, len, "ছে")
+        || endsWith(buffer, len, "ুন")
+        || endsWith(buffer, len, "ুক")
+
+        || endsWith(buffer, len, "টা")
+        || endsWith(buffer, len, "টি")
+        || endsWith(buffer, len, "নি")
+        || endsWith(buffer, len, "ের")
+        || endsWith(buffer, len, "তে") // NOTE(review): looks identical to a suffix tested earlier in this group; redundant if so
+        || endsWith(buffer, len, "রা")
+        || endsWith(buffer, len, "কে")
+    ))
+      return len - 2;
+
+    // strip 1 char if len > 2 and the term ends with one of these suffixes
+    if ((len > 2) && (endsWith(buffer, len, "ি")
+        || endsWith(buffer, len, "ী")
+        || endsWith(buffer, len, "া")
+        || endsWith(buffer, len, "ো")
+        || endsWith(buffer, len, "ে")
+        || endsWith(buffer, len, "ব")
+        || endsWith(buffer, len, "ত")
+    ))
+      return len - 1;
+
+    return len; // no rule matched: the term is returned at its original length
+  }
+}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/package-info.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/package-info.java
new file mode 100644
index 00000000000..eea39a9fdfb
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/package-info.java
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Analyzer for Bengali Language.
+ */ +package org.apache.lucene.analysis.bn; diff --git a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory index bc19c4ac320..d871ad649d1 100644 --- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory +++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory @@ -17,6 +17,8 @@ org.apache.lucene.analysis.tr.ApostropheFilterFactory org.apache.lucene.analysis.ar.ArabicNormalizationFilterFactory org.apache.lucene.analysis.ar.ArabicStemFilterFactory org.apache.lucene.analysis.bg.BulgarianStemFilterFactory +org.apache.lucene.analysis.bn.BengaliNormalizationFilterFactory +org.apache.lucene.analysis.bn.BengaliStemFilterFactory org.apache.lucene.analysis.br.BrazilianStemFilterFactory org.apache.lucene.analysis.cjk.CJKBigramFilterFactory org.apache.lucene.analysis.cjk.CJKWidthFilterFactory diff --git a/lucene/analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt b/lucene/analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt new file mode 100644 index 00000000000..84d1d2ad732 --- /dev/null +++ b/lucene/analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt @@ -0,0 +1,121 @@ +# See http://members.unine.ch/jacques.savoy/clef/index.html. 
+# This file was created by Jacques Savoy and is distributed under the BSD license +এই +ও +থেকে +করে +এ +না +ওই +এক্ +নিয়ে +করা +বলেন +সঙ্গে +যে +এব +তা +আর +কোনো +বলে +সেই +দিন +হয় +কি +দু +পরে +সব +দেওয়া +মধ্যে +এর +সি +শুরু +কাজ +কিছু +কাছে +সে +তবে +বা +বন +আগে +জ্নজন +পি +পর +তো +ছিল +এখন +আমরা +প্রায় +দুই +আমাদের +তাই +অন্য +গিয়ে +প্রযন্ত +মনে +নতুন +মতো +কেখা +প্রথম +আজ +টি +ধামার +অনেক +বিভিন্ন +র +হাজার +জানা +নয় +অবশ্য +বেশি +এস +করে +কে +হতে +বি +কয়েক +সহ +বেশ +এমন +এমনি +কেন +কেউ +নেওয়া +চেষ্টা +লক্ষ +বলা +কারণ +আছে +শুধু +তখন +যা +এসে +চার +ছিল +যদি +আবার +কোটি +উত্তর +সামনে +উপর +বক্তব্য +এত +প্রাথমিক +উপরে +আছে +প্রতি +কাজে +যখন +খুব +বহু +গেল +পেয়্র্ +চালু +ই +নাগাদ +থাকা +পাচ +যাওয়া +রকম +সাধারণ +কমনে \ No newline at end of file diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliAnalyzer.java new file mode 100644 index 00000000000..898480a73cc --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliAnalyzer.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package org.apache.lucene.analysis.bn;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
+
+
+/**
+ * Tests the BengaliAnalyzer; every analyzer is closed even when an assertion fails.
+ */
+public class TestBengaliAnalyzer extends BaseTokenStreamTestCase {
+
+  public void testResourcesAvailable() {
+    new BengaliAnalyzer().close();
+  }
+
+  public void testBasics() throws Exception {
+    try (Analyzer a = new BengaliAnalyzer()) {
+      // both inputs are expected to come out as the same term
+      checkOneTerm(a, "বাড়ী", "বার");
+      checkOneTerm(a, "বারী", "বার");
+    }
+  }
+  /**
+   * test Digits: the input digits must come out as the ASCII digits "1234567890"
+   */
+  public void testDigits() throws Exception {
+    try (BengaliAnalyzer a = new BengaliAnalyzer()) {
+      checkOneTerm(a, "১২৩৪৫৬৭৮৯০", "1234567890");
+    }
+  }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    try (Analyzer analyzer = new BengaliAnalyzer()) {
+      checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
+    }
+  }
+}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliFilters.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliFilters.java
new file mode 100644
index 00000000000..3ed1a07e14f
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliFilters.java
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.bn;
+
+
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
+
+/**
+ * Tests the factories used for Bengali, looked up by name through tokenFilterFactory(...)
+ */
+public class TestBengaliFilters extends BaseTokenStreamFactoryTestCase {
+  /**
+   * Test IndicNormalizationFilterFactory on whitespace-separated Bengali input
+   */
+  public void testIndicNormalizer() throws Exception {
+    Reader reader = new StringReader("ত্‍ আমি");
+    TokenStream stream = whitespaceMockTokenizer(reader);
+    stream = tokenFilterFactory("IndicNormalization").create(stream);
+    assertTokenStreamContents(stream, new String[] { "ৎ", "আমি" });
+  }
+
+  /**
+   * Test BengaliNormalizationFilterFactory chained after IndicNormalization
+   */
+  public void testBengaliNormalizer() throws Exception {
+    Reader reader = new StringReader("বাড়ী");
+    TokenStream stream = whitespaceMockTokenizer(reader);
+    stream = tokenFilterFactory("IndicNormalization").create(stream);
+    stream = tokenFilterFactory("BengaliNormalization").create(stream);
+    assertTokenStreamContents(stream, new String[] {"বারি"});
+  }
+
+  /**
+   * Test BengaliStemFilterFactory chained after both normalization filters
+   */
+  public void testStemmer() throws Exception {
+    Reader reader = new StringReader("বাড়ী");
+    TokenStream stream = whitespaceMockTokenizer(reader);
+    stream = tokenFilterFactory("IndicNormalization").create(stream);
+    stream = tokenFilterFactory("BengaliNormalization").create(stream);
+    stream = tokenFilterFactory("BengaliStem").create(stream);
+    assertTokenStreamContents(stream, new String[] {"বার"});
+  }
+
+  /** Test that bogus arguments result in exception for each of the three factories */
+  public void testBogusArguments() throws Exception {
+    IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
+      tokenFilterFactory("IndicNormalization", "bogusArg", "bogusValue");
+    });
+    assertTrue(expected.getMessage().contains("Unknown parameters"));
+
+    expected = expectThrows(IllegalArgumentException.class, () -> {
+      tokenFilterFactory("BengaliNormalization", "bogusArg", "bogusValue");
+    });
+    assertTrue(expected.getMessage().contains("Unknown parameters"));
+
+    expected = expectThrows(IllegalArgumentException.class, () -> {
+      tokenFilterFactory("BengaliStem", "bogusArg", "bogusValue");
+    });
+    assertTrue(expected.getMessage().contains("Unknown parameters"));
+  }
+}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliNormalizer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliNormalizer.java
new file mode 100644
index 00000000000..ecd11ae4ba2
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliNormalizer.java
@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +package org.apache.lucene.analysis.bn; + + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.KeywordTokenizer; + +import java.io.IOException; + +/** + * Tests BengaliNormalizationFilter: each case feeds one word through the filter and compares the single emitted token. + */ +public class TestBengaliNormalizer extends BaseTokenStreamTestCase { + /** + * Test that the chandrabindu sign is dropped during normalization, with an example from the paper. + */ + public void testChndrobindu() throws IOException { + check("চাঁদ", "চাদ"); + } + + public void testRosshoIKar() throws IOException { + check("বাড়ী", "বারি"); + check("তীর", "তির"); + } + + public void testRosshoUKar() throws IOException { + check("ভূল", "ভুল"); + check("অনূপ", "অনুপ"); + } + + public void testNga() throws IOException { + check("বাঙলা", "বাংলা"); + } + + public void testJaPhaala() throws IOException { + check("ব্যাক্তি", "বেক্তি"); + check( "সন্ধ্যা", "সন্ধা"); + } + + public void testBaPhalaa() throws IOException { + check("স্বদেশ", "সদেস"); + check("তত্ত্ব", "তত্ত"); + check("বিশ্ব", "বিসস"); + } + + public void testVisarga() throws IOException { + check("দুঃখ", "দুখখ"); + check("উঃ", "উহ"); + check("পুনঃ", "পুন"); + } + + public void testBasics() throws IOException { + check("কণা", "কনা"); + check("শরীর", "সরির"); + check("বাড়ি", "বারি"); + } + + private void check(String input, String output) throws IOException { + Tokenizer tokenizer = whitespaceMockTokenizer(input); + TokenFilter tf = new BengaliNormalizationFilter(tokenizer); + assertTokenStreamContents(tf, new String[] { output }); + } + + public void testEmptyTerm() throws IOException { + Analyzer a = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new KeywordTokenizer(); + return new TokenStreamComponents(tokenizer, new BengaliNormalizationFilter(tokenizer)); + } + }; + checkOneTerm(a, "", ""); + 
a.close(); + } +} diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliStemmer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliStemmer.java new file mode 100644 index 00000000000..4f7617236f3 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliStemmer.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.analysis.bn; + + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.KeywordTokenizer; + +import java.io.IOException; + +/** + * Tests for BengaliStemFilter: each case stems one word and compares the single emitted token. + */ +public class TestBengaliStemmer extends BaseTokenStreamTestCase { + + /** + * Tests a few verbs in shadhu form; every input is expected to reduce to its listed stem. + */ + public void testVerbsInShadhuForm() throws IOException { + check("করেছিলাম", "কর"); + check("করিতেছিলে", "কর"); + check("খাইতাম", "খাই"); + check("যাইবে", "যা"); + } + + public void testVerbsInCholitoForm() throws IOException { + check("করছিলাম", "কর"); + check("করছিলে", "কর"); + check("করতাম", "কর"); + check("যাব", "যা"); + check("যাবে", "যা"); + check("করি", "কর"); + check("করো", "কর"); + } + + public void testNouns() throws IOException { + check("মেয়েরা", "মে"); + check("মেয়েদেরকে", "মে"); + check("মেয়েদের", "মে"); + + check("একটি", "এক"); + check("মানুষগুলি", "মানুষ"); + } + + private void check(String input, String output) throws IOException { + Tokenizer tokenizer = whitespaceMockTokenizer(input); + TokenFilter tf = new BengaliStemFilter(tokenizer); + assertTokenStreamContents(tf, new String[] { output }); + } + + public void testEmptyTerm() throws IOException { + Analyzer a = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new KeywordTokenizer(); + return new TokenStreamComponents(tokenizer, new BengaliStemFilter(tokenizer)); + } + }; + checkOneTerm(a, "", ""); + a.close(); + } +}