LUCENE-7490: Added bengali language analyzer

This commit is contained in:
Md. Abdulla-Al-Sun 2017-08-24 18:05:22 +06:00
parent 7760b35645
commit 1bca06b8a9
15 changed files with 1135 additions and 2 deletions

View File

@ -54,13 +54,14 @@ The KStem stemmer in
was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
under the BSD-license.
The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default
The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default
stopword list that is BSD-licensed created by Jacques Savoy. These files reside in:
analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt,
analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
See http://members.unine.ch/jacques.savoy/clef/index.html.
The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers

View File

@ -0,0 +1,132 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.bn;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.core.DecimalDigitFilter;
import org.apache.lucene.analysis.in.IndicNormalizationFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import java.io.IOException;
import java.io.Reader;
/**
 * Analyzer for Bengali.
 * <p>
 * Tokens produced by a {@link StandardTokenizer} are passed through
 * {@link LowerCaseFilter}, {@link DecimalDigitFilter}, {@link IndicNormalizationFilter},
 * {@link BengaliNormalizationFilter}, a {@link StopFilter} and {@link BengaliStemFilter}.
 * </p>
 */
public final class BengaliAnalyzer extends StopwordAnalyzerBase {
  /** Terms in this set are marked as keywords, so normalization and stemming skip them. */
  private final CharArraySet stemExclusionSet;

  /**
   * File containing default Bengali stopwords.
   *
   * Default stopword list is from http://members.unine.ch/jacques.savoy/clef/bengaliST.txt
   * The stopword list is BSD-Licensed.
   */
  public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";

  /** Comment marker handed to {@code loadStopwordSet} when reading the stopword file. */
  private static final String STOPWORDS_COMMENT = "#";

  /**
   * Returns an unmodifiable instance of the default stop-words set.
   * @return an unmodifiable instance of the default stop-words set.
   */
  public static CharArraySet getDefaultStopSet(){
    return DefaultSetHolder.DEFAULT_STOP_SET;
  }

  /**
   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
   * accesses the static final set the first time.
   */
  private static class DefaultSetHolder {
    static final CharArraySet DEFAULT_STOP_SET;

    static {
      try {
        DEFAULT_STOP_SET = loadStopwordSet(false, BengaliAnalyzer.class, DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
      } catch (IOException ex) {
        // Keep the IOException as the cause so the underlying failure is not lost.
        throw new RuntimeException("Unable to load default stopword set", ex);
      }
    }
  }

  /**
   * Builds an analyzer with the given stop words and stem exclusion set.
   * The exclusion set is copied and wrapped as unmodifiable, so later changes
   * to the argument do not affect this analyzer.
   *
   * @param stopwords a stopword set
   * @param stemExclusionSet a stemming exclusion set
   */
  public BengaliAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
    super(stopwords);
    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
  }

  /**
   * Builds an analyzer with the given stop words and an empty stem exclusion set.
   *
   * @param stopwords a stopword set
   */
  public BengaliAnalyzer(CharArraySet stopwords) {
    this(stopwords, CharArraySet.EMPTY_SET);
  }

  /**
   * Builds an analyzer with the default stop words:
   * {@link #DEFAULT_STOPWORD_FILE}.
   */
  public BengaliAnalyzer() {
    this(DefaultSetHolder.DEFAULT_STOP_SET);
  }

  /**
   * Creates
   * {@link TokenStreamComponents}
   * used to tokenize all the text in the provided {@link Reader}.
   *
   * @return {@link TokenStreamComponents}
   *         built from a {@link StandardTokenizer} filtered with
   *         {@link LowerCaseFilter}, {@link DecimalDigitFilter}, {@link SetKeywordMarkerFilter}
   *         if a stem exclusion set is provided, {@link IndicNormalizationFilter},
   *         {@link BengaliNormalizationFilter}, a {@link StopFilter} using the Bengali
   *         stop words, and {@link BengaliStemFilter}.
   */
  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new LowerCaseFilter(source);
    result = new DecimalDigitFilter(result);
    // Keywords must be marked before the normalizer and stemmer, which both honor the flag.
    if (!stemExclusionSet.isEmpty()) {
      result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    }
    result = new IndicNormalizationFilter(result);
    result = new BengaliNormalizationFilter(result);
    result = new StopFilter(result, stopwords);
    result = new BengaliStemFilter(result);
    return new TokenStreamComponents(source, result);
  }

  /**
   * Wraps the given stream with {@link StandardFilter}, {@link LowerCaseFilter},
   * {@link DecimalDigitFilter}, {@link IndicNormalizationFilter} and
   * {@link BengaliNormalizationFilter}. No stop word removal or stemming is applied here.
   */
  @Override
  protected TokenStream normalize(String fieldName, TokenStream in) {
    TokenStream result = new StandardFilter(in);
    result = new LowerCaseFilter(result);
    result = new DecimalDigitFilter(result);
    result = new IndicNormalizationFilter(result);
    result = new BengaliNormalizationFilter(result);
    return result;
  }
}

View File

@ -0,0 +1,59 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.bn;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import java.io.IOException;
/**
 * A {@link TokenFilter} that runs every non-keyword term through
 * {@link BengaliNormalizer} to normalize its orthography.
 * <p>
 * Normalization can conflate unrelated terms. To protect a term, mark it with
 * a {@link SetKeywordMarkerFilter} (or any custom {@link TokenFilter} that sets
 * the {@link KeywordAttribute}) placed before this {@link TokenStream}.
 * </p>
 * @see BengaliNormalizer
 */
public final class BengaliNormalizationFilter extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
  private final BengaliNormalizer normalizer = new BengaliNormalizer();

  /**
   * Creates a filter that normalizes the terms of {@code input}.
   *
   * @param input the stream to read tokens from
   */
  public BengaliNormalizationFilter(TokenStream input) {
    super(input);
  }

  /**
   * Advances the wrapped stream and normalizes the current term in place
   * unless it is flagged as a keyword.
   *
   * @return {@code false} when the wrapped stream is exhausted, {@code true} otherwise
   */
  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    if (!keywordAtt.isKeyword()) {
      // The normalizer edits the term buffer directly and reports the new length.
      final int normalizedLength = normalizer.normalize(termAtt.buffer(), termAtt.length());
      termAtt.setLength(normalizedLength);
    }
    return true;
  }
}

View File

@ -0,0 +1,55 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.bn;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import java.util.Map;
/**
 * Factory for {@link BengaliNormalizationFilter}.
 * <pre class="prettyprint">
 * &lt;fieldType name="text_bnnormal" class="solr.TextField" positionIncrementGap="100"&gt;
 *   &lt;analyzer&gt;
 *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
 *     &lt;filter class="solr.BengaliNormalizationFilterFactory"/&gt;
 *   &lt;/analyzer&gt;
 * &lt;/fieldType&gt;</pre>
 */
public class BengaliNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {

  /**
   * Creates a new BengaliNormalizationFilterFactory.
   *
   * @param args factory arguments
   * @throws IllegalArgumentException if {@code args} is not empty after the
   *         superclass constructor has run
   */
  public BengaliNormalizationFilterFactory(Map<String,String> args) {
    super(args);
    if (args.isEmpty()) {
      return;
    }
    throw new IllegalArgumentException("Unknown parameters: " + args);
  }

  /** Wraps {@code input} in a new {@link BengaliNormalizationFilter}. */
  @Override
  public TokenStream create(TokenStream input) {
    final TokenStream normalized = new BengaliNormalizationFilter(input);
    return normalized;
  }

  /** Returns this factory itself as the multi-term component. */
  @Override
  public AbstractAnalysisFactory getMultiTermComponent() {
    return this;
  }
}

View File

@ -0,0 +1,155 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.bn;
import static org.apache.lucene.analysis.util.StemmerUtil.delete;
/**
 * Normalizer for Bengali.
 * <p>
 * Implements the Bengali-language specific algorithm specified in:
 * <i>A Double Metaphone encoding for Bangla and its application in spelling checker</i>
 * Naushad UzZaman and Mumit Khan.
 * http://www.panl10n.net/english/final%20reports/pdf%20files/Bangladesh/BAN16.pdf
 * </p>
 */
public class BengaliNormalizer {
  /**
   * Normalize an input buffer of Bengali text.
   * <p>
   * The buffer is modified in place; characters at or beyond the returned
   * length are stale and must be ignored by the caller.
   * </p>
   *
   * @param s input buffer
   * @param len length of input buffer
   * @return length of input buffer after normalization
   */
  public int normalize(char s[], int len) {

    for (int i = 0; i < len; i++) {
      switch (s[i]) {
        // delete Chandrabindu
        case '\u0981':
          len = delete(s, i, len);
          // re-examine the character that moved into position i
          i--;
          break;

        // DirghoI kar -> RosshoI kar
        case '\u09C0':
          s[i] = '\u09BF';
          break;

        // DirghoU kar -> RosshoU kar
        case '\u09C2':
          s[i] = '\u09C1';
          break;

        // Khio (Ka + Hoshonto + Murdorno Sh)
        case '\u0995':
          // Murdorno Sh is U+09B7. The conjunct was previously tested against
          // U+09BF (vowel sign I), so it was never recognized.
          if (i + 2 < len && s[i + 1] == '\u09CD' && s[i + 2] == '\u09B7') {
            if (i == 0) {
              // word-initial: the whole conjunct becomes Kha
              s[i] = '\u0996';
              len = delete(s, i + 2, len);
              len = delete(s, i + 1, len);
            } else {
              // elsewhere: Ka is kept and followed by Kha
              s[i + 1] = '\u0996';
              len = delete(s, i + 2, len);
            }
          }
          break;

        // Nga to Anusvara
        case '\u0999':
          s[i] = '\u0982';
          break;

        // Ja Phala
        case '\u09AF':
          if (i - 2 == 0 && s[i - 1] == '\u09CD') {
            // after the first consonant: Hoshonto + Ja (+ Aa kar) becomes E kar
            s[i - 1] = '\u09C7';

            // only look at s[i + 1] when it is still inside the term
            if (i + 1 < len && s[i + 1] == '\u09BE') {
              len = delete(s, i + 1, len);
            }
            len = delete(s, i, len);
            i--;
          } else if (i - 1 >= 0 && s[i - 1] == '\u09CD') {
            // Only a real Ja Phala (Ja preceded by Hoshonto) is removed. A plain Ja
            // is left alone; at i == 0 the old unconditional branch called
            // delete() with index -1.
            len = delete(s, i, len);
            len = delete(s, i - 1, len);
            i -= 2;
          }
          break;

        // Ba Phalaa
        case '\u09AC':
          // not a Ba Phalaa unless preceded by Hoshonto
          if ((i >= 1 && s[i - 1] != '\u09CD') || i == 0) {
            break;
          }

          if (i - 2 == 0) {
            // after the first consonant: drop Hoshonto + Ba
            len = delete(s, i, len);
            len = delete(s, i - 1, len);
            i -= 2;
          } else if (i - 5 >= 0 && s[i - 3] == '\u09CD') {
            // the preceding consonant is itself part of a conjunct: drop Hoshonto + Ba
            len = delete(s, i, len);
            len = delete(s, i - 1, len);
            i -= 2;
          } else if (i - 2 >= 0) {
            // double the preceding consonant; the guard avoids reading s[-1]
            // when the term starts with Hoshonto + Ba
            s[i - 1] = s[i - 2];
            len = delete(s, i, len);
            i--;
          }
          break;

        // Visarga
        case '\u0983':
          if (i == len - 1) {
            if (len <= 3) {
              s[i] = '\u09B9';
            } else {
              len = delete(s, i, len);
            }
          } else {
            // inside a word: double the following character
            s[i] = s[i + 1];
          }
          break;

        //All sh
        case '\u09B6':
        case '\u09B7':
          s[i] = '\u09B8';
          break;

        //check na
        case '\u09A3':
          s[i] = '\u09A8';
          break;

        //check ra
        case '\u09DC':
        case '\u09DD':
          s[i] = '\u09B0';
          break;

        // Khanda Ta -> Ta
        case '\u09CE':
          s[i] = '\u09A4';
          break;

        default:
          break;
      }
    }

    return len;
  }
}

View File

@ -0,0 +1,49 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.bn;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import java.io.IOException;
/**
 * A {@link TokenFilter} that stems Bengali words with {@link BengaliStemmer}.
 * Terms flagged through the {@link KeywordAttribute} are passed on unchanged.
 */
public final class BengaliStemFilter extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
  private final BengaliStemmer stemmer = new BengaliStemmer();

  /**
   * Creates a filter that stems the terms of {@code input}.
   *
   * @param input the stream to read tokens from
   */
  public BengaliStemFilter(TokenStream input) {
    super(input);
  }

  /**
   * Advances the wrapped stream and truncates the current term to the length
   * reported by the stemmer, unless the term is a keyword.
   *
   * @return {@code false} when the wrapped stream is exhausted, {@code true} otherwise
   */
  @Override
  public boolean incrementToken() throws IOException {
    final boolean hasToken = input.incrementToken();
    if (hasToken && !keywordAtt.isKeyword()) {
      final int stemmedLength = stemmer.stem(termAtt.buffer(), termAtt.length());
      termAtt.setLength(stemmedLength);
    }
    return hasToken;
  }
}

View File

@ -0,0 +1,48 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.bn;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import java.util.Map;
/**
 * Factory for {@link BengaliStemFilter}.
 * <pre class="prettyprint">
 * &lt;fieldType name="text_bnstem" class="solr.TextField" positionIncrementGap="100"&gt;
 *   &lt;analyzer&gt;
 *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
 *     &lt;filter class="solr.BengaliStemFilterFactory"/&gt;
 *   &lt;/analyzer&gt;
 * &lt;/fieldType&gt;</pre>
 */
public class BengaliStemFilterFactory extends TokenFilterFactory {

  /**
   * Creates a new BengaliStemFilterFactory.
   *
   * @param args factory arguments
   * @throws IllegalArgumentException if {@code args} is not empty after the
   *         superclass constructor has run
   */
  public BengaliStemFilterFactory(Map<String,String> args) {
    super(args);
    if (args.isEmpty()) {
      return;
    }
    throw new IllegalArgumentException("Unknown parameters: " + args);
  }

  /** Wraps {@code input} in a new {@link BengaliStemFilter}. */
  @Override
  public TokenStream create(TokenStream input) {
    final TokenStream stemmed = new BengaliStemFilter(input);
    return stemmed;
  }
}

View File

@ -0,0 +1,183 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.bn;
import static org.apache.lucene.analysis.util.StemmerUtil.endsWith;
/**
* Stemmer for Bengali.
* <p>
* The algorithm is based on the report in:
* <i>Natural Language Processing in an Indian Language (Bengali)-I: Verb Phrase Analysis</i>
* P Sengupta and B B Chaudhuri
* </p>
*
* <p>
* Few Stemmer criteria are taken from:
* <i>http://members.unine.ch/jacques.savoy/clef/BengaliStemmerLight.java.txt</i>
* </p>
*/
public class BengaliStemmer {
public int stem(char buffer[], int len) {
// 8
if (len > 9 && (endsWith(buffer, len, "িয়াছিলাম")
|| endsWith(buffer, len, "িতেছিলাম")
|| endsWith(buffer, len, "িতেছিলেন")
|| endsWith(buffer, len, "ইতেছিলেন")
|| endsWith(buffer, len, "িয়াছিলেন")
|| endsWith(buffer, len, "ইয়াছিলেন")
))
return len - 8;
// 7
if ((len > 8) && (endsWith(buffer, len, "িতেছিলি")
|| endsWith(buffer, len, "িতেছিলে")
|| endsWith(buffer, len, "িয়াছিলা")
|| endsWith(buffer, len, "িয়াছিলে")
|| endsWith(buffer, len, "িতেছিলা")
|| endsWith(buffer, len, "িয়াছিলি")
|| endsWith(buffer, len, "য়েদেরকে")
))
return len - 7;
// 6
if ((len > 7) && (endsWith(buffer, len, "িতেছিস")
|| endsWith(buffer, len, "িতেছেন")
|| endsWith(buffer, len, "িয়াছিস")
|| endsWith(buffer, len, "িয়াছেন")
|| endsWith(buffer, len, "েছিলাম")
|| endsWith(buffer, len, "েছিলেন")
|| endsWith(buffer, len, "েদেরকে")
))
return len - 6;
// 5
if ((len > 6) && (endsWith(buffer, len, "িতেছি")
|| endsWith(buffer, len, "িতেছা")
|| endsWith(buffer, len, "িতেছে")
|| endsWith(buffer, len, "ছিলাম")
|| endsWith(buffer, len, "ছিলেন")
|| endsWith(buffer, len, "িয়াছি")
|| endsWith(buffer, len, "িয়াছা")
|| endsWith(buffer, len, "িয়াছে")
|| endsWith(buffer, len, "েছিলে")
|| endsWith(buffer, len, "েছিলা")
|| endsWith(buffer, len, "য়েদের")
|| endsWith(buffer, len, "দেরকে")
))
return len - 5;
// 4
if ((len > 5) && (endsWith(buffer, len, "িলাম")
|| endsWith(buffer, len, "িলেন")
|| endsWith(buffer, len, "িতাম")
|| endsWith(buffer, len, "িতেন")
|| endsWith(buffer, len, "িবেন")
|| endsWith(buffer, len, "ছিলি")
|| endsWith(buffer, len, "ছিলে")
|| endsWith(buffer, len, "ছিলা")
|| endsWith(buffer, len, "তেছে")
|| endsWith(buffer, len, "িতেছ")
|| endsWith(buffer, len, "খানা")
|| endsWith(buffer, len, "খানি")
|| endsWith(buffer, len, "গুলো")
|| endsWith(buffer, len, "গুলি")
|| endsWith(buffer, len, "য়েরা")
|| endsWith(buffer, len, "েদের")
))
return len - 4;
// 3
if ((len > 4) && (endsWith(buffer, len, "লাম")
|| endsWith(buffer, len, "িলি")
|| endsWith(buffer, len, "ইলি")
|| endsWith(buffer, len, "িলে")
|| endsWith(buffer, len, "ইলে")
|| endsWith(buffer, len, "লেন")
|| endsWith(buffer, len, "িলা")
|| endsWith(buffer, len, "ইলা")
|| endsWith(buffer, len, "তাম")
|| endsWith(buffer, len, "িতি")
|| endsWith(buffer, len, "ইতি")
|| endsWith(buffer, len, "িতে")
|| endsWith(buffer, len, "ইতে")
|| endsWith(buffer, len, "তেন")
|| endsWith(buffer, len, "িতা")
|| endsWith(buffer, len, "িবা")
|| endsWith(buffer, len, "ইবা")
|| endsWith(buffer, len, "িবি")
|| endsWith(buffer, len, "ইবি")
|| endsWith(buffer, len, "বেন")
|| endsWith(buffer, len, "িবে")
|| endsWith(buffer, len, "ইবে")
|| endsWith(buffer, len, "ছেন")
|| endsWith(buffer, len, "য়োন")
|| endsWith(buffer, len, "য়ের")
|| endsWith(buffer, len, "েরা")
|| endsWith(buffer, len, "দের")
))
return len - 3;
// 2
if ((len > 3) && (endsWith(buffer, len, "িস")
|| endsWith(buffer, len, "েন")
|| endsWith(buffer, len, "লি")
|| endsWith(buffer, len, "লে")
|| endsWith(buffer, len, "লা")
|| endsWith(buffer, len, "তি")
|| endsWith(buffer, len, "তে")
|| endsWith(buffer, len, "তা")
|| endsWith(buffer, len, "বি")
|| endsWith(buffer, len, "বে")
|| endsWith(buffer, len, "বা")
|| endsWith(buffer, len, "ছি")
|| endsWith(buffer, len, "ছা")
|| endsWith(buffer, len, "ছে")
|| endsWith(buffer, len, "ুন")
|| endsWith(buffer, len, "ুক")
|| endsWith(buffer, len, "টা")
|| endsWith(buffer, len, "টি")
|| endsWith(buffer, len, "নি")
|| endsWith(buffer, len, "ের")
|| endsWith(buffer, len, "তে")
|| endsWith(buffer, len, "রা")
|| endsWith(buffer, len, "কে")
))
return len - 2;
// 1
if ((len > 2) && (endsWith(buffer, len, "ি")
|| endsWith(buffer, len, "")
|| endsWith(buffer, len, "")
|| endsWith(buffer, len, "")
|| endsWith(buffer, len, "")
|| endsWith(buffer, len, "")
|| endsWith(buffer, len, "")
))
return len - 1;
return len;
}
}

View File

@ -0,0 +1,21 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Analyzer for the Bengali language.
*/
package org.apache.lucene.analysis.bn;

View File

@ -17,6 +17,8 @@ org.apache.lucene.analysis.tr.ApostropheFilterFactory
org.apache.lucene.analysis.ar.ArabicNormalizationFilterFactory
org.apache.lucene.analysis.ar.ArabicStemFilterFactory
org.apache.lucene.analysis.bg.BulgarianStemFilterFactory
org.apache.lucene.analysis.bn.BengaliNormalizationFilterFactory
org.apache.lucene.analysis.bn.BengaliStemFilterFactory
org.apache.lucene.analysis.br.BrazilianStemFilterFactory
org.apache.lucene.analysis.cjk.CJKBigramFilterFactory
org.apache.lucene.analysis.cjk.CJKWidthFilterFactory

View File

@ -0,0 +1,121 @@
# See http://members.unine.ch/jacques.savoy/clef/index.html.
# This file was created by Jacques Savoy and is distributed under the BSD license
এই
থেকে
করে
না
ওই
এক্
নিয়ে
করা
বলেন
সঙ্গে
যে
এব
তা
আর
কোনো
বলে
সেই
দিন
হয়
কি
দু
পরে
সব
দেওয়া
মধ্যে
এর
সি
শুরু
কাজ
কিছু
কাছে
সে
তবে
বা
বন
আগে
জ্নজন
পি
পর
তো
ছিল
এখন
আমরা
প্রায়
দুই
আমাদের
তাই
অন্য
গিয়ে
প্রযন্ত
মনে
নতুন
মতো
কেখা
প্রথম
আজ
টি
ধামার
অনেক
বিভিন্ন
হাজার
জানা
নয়
অবশ্য
বেশি
এস
করে
কে
হতে
বি
কয়েক
সহ
বেশ
এমন
এমনি
কেন
কেউ
নেওয়া
চেষ্টা
লক্ষ
বলা
কারণ
আছে
শুধু
তখন
যা
এসে
চার
ছিল
যদি
আবার
কোটি
উত্তর
সামনে
উপর
বক্তব্য
এত
প্রাথমিক
উপরে
আছে
প্রতি
কাজে
যখন
খুব
বহু
গেল
পেয়্র্
চালু
নাগাদ
থাকা
পাচ
যাওয়া
রকম
সাধারণ
কমনে

View File

@ -0,0 +1,55 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.bn;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharArraySet;
/**
 * Tests the BengaliAnalyzer
 */
public class TestBengaliAnalyzer extends BaseTokenStreamTestCase {

  /** Constructing the default analyzer forces the default stopword set to load. */
  public void testResourcesAvailable() {
    new BengaliAnalyzer().close();
  }

  /** Two spellings of the same word must end up as the same term. */
  public void testBasics() throws Exception {
    // try-with-resources closes the analyzer even when an assertion fails
    try (Analyzer a = new BengaliAnalyzer()) {
      checkOneTerm(a, "বাড়ী", "বার");
      checkOneTerm(a, "বারী", "বার");
    }
  }

  /**
   * test Digits
   */
  public void testDigits() throws Exception {
    try (BengaliAnalyzer a = new BengaliAnalyzer()) {
      checkOneTerm(a, "১২৩৪৫৬৭৮৯০", "1234567890");
    }
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    try (Analyzer analyzer = new BengaliAnalyzer()) {
      checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
    }
  }
}

View File

@ -0,0 +1,80 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.bn;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
/**
 * Test Bengali Filter Factory
 */
public class TestBengaliFilters extends BaseTokenStreamFactoryTestCase {
  /**
   * Test IndicNormalizationFilterFactory
   */
  public void testIndicNormalizer() throws Exception {
    // TA (U+09A4) + HASANTA (U+09CD) + ZERO WIDTH JOINER (U+200D), spelled with
    // escapes so the invisible joiner cannot be lost when the file is edited.
    Reader reader = new StringReader("\u09A4\u09CD\u200D আমি");
    TokenStream stream = whitespaceMockTokenizer(reader);
    stream = tokenFilterFactory("IndicNormalization").create(stream);
    // The sequence is expected to compose into KHANDA TA (U+09CE). A whitespace
    // tokenizer never emits an empty token, so "" cannot be the expected value.
    assertTokenStreamContents(stream, new String[] { "\u09CE", "আমি" });
  }

  /**
   * Test BengaliNormalizationFilterFactory
   */
  public void testBengaliNormalizer() throws Exception {
    Reader reader = new StringReader("বাড়ী");
    TokenStream stream = whitespaceMockTokenizer(reader);
    stream = tokenFilterFactory("IndicNormalization").create(stream);
    stream = tokenFilterFactory("BengaliNormalization").create(stream);
    assertTokenStreamContents(stream, new String[] {"বারি"});
  }

  /**
   * Test BengaliStemFilterFactory
   */
  public void testStemmer() throws Exception {
    Reader reader = new StringReader("বাড়ী");
    TokenStream stream = whitespaceMockTokenizer(reader);
    stream = tokenFilterFactory("IndicNormalization").create(stream);
    stream = tokenFilterFactory("BengaliNormalization").create(stream);
    stream = tokenFilterFactory("BengaliStem").create(stream);
    assertTokenStreamContents(stream, new String[] {"বার"});
  }

  /** Test that bogus arguments result in exception */
  public void testBogusArguments() throws Exception {
    IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
      tokenFilterFactory("IndicNormalization", "bogusArg", "bogusValue");
    });
    assertTrue(expected.getMessage().contains("Unknown parameters"));

    expected = expectThrows(IllegalArgumentException.class, () -> {
      tokenFilterFactory("BengaliNormalization", "bogusArg", "bogusValue");
    });
    assertTrue(expected.getMessage().contains("Unknown parameters"));

    expected = expectThrows(IllegalArgumentException.class, () -> {
      tokenFilterFactory("BengaliStem", "bogusArg", "bogusValue");
    });
    assertTrue(expected.getMessage().contains("Unknown parameters"));
  }
}

View File

@ -0,0 +1,93 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.bn;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import java.io.IOException;
/**
 * Test BengaliNormalizer
 */
public class TestBengaliNormalizer extends BaseTokenStreamTestCase {
  /**
   * Test some basic normalization, with an example from the paper.
   */
  public void testChndrobindu() throws IOException {
    check("চাঁদ", "চাদ");
  }

  public void testRosshoIKar() throws IOException {
    check("বাড়ী", "বারি");
    check("তীর", "তির");
  }

  public void testRosshoUKar() throws IOException {
    check("ভূল", "ভুল");
    check("অনূপ", "অনুপ");
  }

  public void testNga() throws IOException {
    check("বাঙলা", "বাংলা");
  }

  public void testJaPhaala() throws IOException {
    check("ব্যাক্তি", "বেক্তি");
    check( "সন্ধ্যা", "সন্ধা");
  }

  public void testBaPhalaa() throws IOException {
    check("স্বদেশ", "সদেস");
    check("তত্ত্ব", "তত্ত");
    check("বিশ্ব", "বিসস");
  }

  public void testVisarga() throws IOException {
    check("দুঃখ", "দুখখ");
    check("উঃ", "উহ");
    check("পুনঃ", "পুন");
  }

  public void testBasics() throws IOException {
    check("কণা", "কনা");
    check("শরীর", "সরির");
    check("বাড়ি", "বারি");
  }

  /** Runs {@code input} through a BengaliNormalizationFilter and expects the single token {@code output}. */
  private void check(String input, String output) throws IOException {
    Tokenizer tokenizer = whitespaceMockTokenizer(input);
    TokenFilter tf = new BengaliNormalizationFilter(tokenizer);
    assertTokenStreamContents(tf, new String[] { output });
  }

  /** An empty term must pass through the filter unchanged. */
  public void testEmptyTerm() throws IOException {
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new KeywordTokenizer();
        return new TokenStreamComponents(tokenizer, new BengaliNormalizationFilter(tokenizer));
      }
    };
    try {
      checkOneTerm(a, "", "");
    } finally {
      // close the analyzer even when the assertion above fails
      a.close();
    }
  }
}

View File

@ -0,0 +1,79 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.bn;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import java.io.IOException;
/**
 * Test Codes for BengaliStemmer
 */
public class TestBengaliStemmer extends BaseTokenStreamTestCase {

  /**
   * Testing few verbal words
   */
  public void testVerbsInShadhuForm() throws IOException {
    check("করেছিলাম", "কর");
    check("করিতেছিলে", "কর");
    check("খাইতাম", "খাই");
    check("যাইবে", "যা");
  }

  public void testVerbsInCholitoForm() throws IOException {
    check("করছিলাম", "কর");
    check("করছিলে", "কর");
    check("করতাম", "কর");
    check("যাব", "যা");
    check("যাবে", "যা");
    check("করি", "কর");
    check("করো", "কর");
  }

  public void testNouns() throws IOException {
    check("মেয়েরা", "মে");
    check("মেয়েদেরকে", "মে");
    check("মেয়েদের", "মে");
    check("একটি", "এক");
    check("মানুষগুলি", "মানুষ");
  }

  /** Runs {@code input} through a BengaliStemFilter and expects the single token {@code output}. */
  private void check(String input, String output) throws IOException {
    Tokenizer tokenizer = whitespaceMockTokenizer(input);
    TokenFilter tf = new BengaliStemFilter(tokenizer);
    assertTokenStreamContents(tf, new String[] { output });
  }

  /** An empty term must pass through the filter unchanged. */
  public void testEmptyTerm() throws IOException {
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new KeywordTokenizer();
        return new TokenStreamComponents(tokenizer, new BengaliStemFilter(tokenizer));
      }
    };
    try {
      checkOneTerm(a, "", "");
    } finally {
      // close the analyzer even when the assertion above fails
      a.close();
    }
  }
}