mirror of https://github.com/apache/lucene.git
LUCENE-7490: Added bengali language analyzer
This commit is contained in:
parent
7760b35645
commit
1bca06b8a9
|
@ -54,13 +54,14 @@ The KStem stemmer in
|
|||
was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
|
||||
under the BSD-license.
|
||||
|
||||
The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default
|
||||
The Arabic, Persian, Romanian, Bulgarian, Hindi and Bengali analyzers (common) come with a default
|
||||
stopword list that is BSD-licensed created by Jacques Savoy. These files reside in:
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt,
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
|
||||
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||
|
||||
The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
|
||||
|
|
|
@ -0,0 +1,132 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.bn;
|
||||
|
||||
|
||||
import org.apache.lucene.analysis.*;
|
||||
import org.apache.lucene.analysis.core.DecimalDigitFilter;
|
||||
import org.apache.lucene.analysis.in.IndicNormalizationFilter;
|
||||
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
|
||||
/**
|
||||
* Analyzer for Bengali.
|
||||
*/
|
||||
public final class BengaliAnalyzer extends StopwordAnalyzerBase {
|
||||
private final CharArraySet stemExclusionSet;
|
||||
|
||||
/**
|
||||
* File containing default Bengali stopwords.
|
||||
*
|
||||
* Default stopword list is from http://members.unine.ch/jacques.savoy/clef/bengaliST.txt
|
||||
* The stopword list is BSD-Licensed.
|
||||
*/
|
||||
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
||||
private static final String STOPWORDS_COMMENT = "#";
|
||||
|
||||
/**
|
||||
* Returns an unmodifiable instance of the default stop-words set.
|
||||
* @return an unmodifiable instance of the default stop-words set.
|
||||
*/
|
||||
public static CharArraySet getDefaultStopSet(){
|
||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
/**
|
||||
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
|
||||
* accesses the static final set the first time.;
|
||||
*/
|
||||
private static class DefaultSetHolder {
|
||||
static final CharArraySet DEFAULT_STOP_SET;
|
||||
|
||||
static {
|
||||
try {
|
||||
DEFAULT_STOP_SET = loadStopwordSet(false, BengaliAnalyzer.class, DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
|
||||
} catch (IOException ex) {
|
||||
throw new RuntimeException("Unable to load default stopword set");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words
|
||||
*
|
||||
* @param stopwords a stopword set
|
||||
* @param stemExclusionSet a stemming exclusion set
|
||||
*/
|
||||
public BengaliAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||
super(stopwords);
|
||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words
|
||||
*
|
||||
* @param stopwords a stopword set
|
||||
*/
|
||||
public BengaliAnalyzer(CharArraySet stopwords) {
|
||||
this(stopwords, CharArraySet.EMPTY_SET);
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the default stop words:
|
||||
* {@link #DEFAULT_STOPWORD_FILE}.
|
||||
*/
|
||||
public BengaliAnalyzer() {
|
||||
this(DefaultSetHolder.DEFAULT_STOP_SET);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates
|
||||
* {@link TokenStreamComponents}
|
||||
* used to tokenize all the text in the provided {@link Reader}.
|
||||
*
|
||||
* @return {@link TokenStreamComponents}
|
||||
* built from a {@link StandardTokenizer} filtered with
|
||||
* {@link LowerCaseFilter}, {@link DecimalDigitFilter}, {@link IndicNormalizationFilter},
|
||||
* {@link BengaliNormalizationFilter}, {@link SetKeywordMarkerFilter}
|
||||
* if a stem exclusion set is provided, {@link BengaliStemFilter}, and
|
||||
* Bengali Stop words
|
||||
*/
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName) {
|
||||
final Tokenizer source = new StandardTokenizer();
|
||||
TokenStream result = new LowerCaseFilter(source);
|
||||
result = new DecimalDigitFilter(result);
|
||||
if (!stemExclusionSet.isEmpty())
|
||||
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
|
||||
result = new IndicNormalizationFilter(result);
|
||||
result = new BengaliNormalizationFilter(result);
|
||||
result = new StopFilter(result, stopwords);
|
||||
result = new BengaliStemFilter(result);
|
||||
return new TokenStreamComponents(source, result);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||
TokenStream result = new StandardFilter(in);
|
||||
result = new LowerCaseFilter(result);
|
||||
result = new DecimalDigitFilter(result);
|
||||
result = new IndicNormalizationFilter(result);
|
||||
result = new BengaliNormalizationFilter(result);
|
||||
return result;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,59 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.bn;
|
||||
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* A {@link TokenFilter} that applies {@link BengaliNormalizer} to normalize the
|
||||
* orthography.
|
||||
* <p>
|
||||
* In some cases the normalization may cause unrelated terms to conflate, so
|
||||
* to prevent terms from being normalized use an instance of
|
||||
* {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
|
||||
* the {@link KeywordAttribute} before this {@link TokenStream}.
|
||||
* </p>
|
||||
* @see BengaliNormalizer
|
||||
*/
|
||||
public final class BengaliNormalizationFilter extends TokenFilter {
|
||||
|
||||
private final BengaliNormalizer normalizer = new BengaliNormalizer();
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
|
||||
|
||||
public BengaliNormalizationFilter(TokenStream input) {
|
||||
super(input);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (input.incrementToken()) {
|
||||
if (!keywordAtt.isKeyword())
|
||||
termAtt.setLength(normalizer.normalize(termAtt.buffer(),
|
||||
termAtt.length()));
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,55 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.bn;
|
||||
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
|
||||
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Factory for {@link BengaliNormalizationFilter}.
|
||||
* <pre class="prettyprint">
|
||||
* <fieldType name="text_bnnormal" class="solr.TextField" positionIncrementGap="100">
|
||||
* <analyzer>
|
||||
* <tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
* <filter class="solr.BengaliNormalizationFilterFactory"/>
|
||||
* </analyzer>
|
||||
* </fieldType></pre>
|
||||
*/
|
||||
public class BengaliNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
|
||||
|
||||
public BengaliNormalizationFilterFactory(Map<String,String> args) {
|
||||
super(args);
|
||||
if (!args.isEmpty()) {
|
||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new BengaliNormalizationFilter(input);
|
||||
}
|
||||
|
||||
@Override
|
||||
public AbstractAnalysisFactory getMultiTermComponent() {
|
||||
return this;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,155 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.bn;
|
||||
|
||||
|
||||
import static org.apache.lucene.analysis.util.StemmerUtil.delete;
|
||||
|
||||
/**
|
||||
* Normalizer for Bengali.
|
||||
* <p>
|
||||
* Implements the Bengali-language specific algorithm specified in:
|
||||
* <i>A Double Metaphone encoding for Bangla and its application in spelling checker</i>
|
||||
* Naushad UzZaman and Mumit Khan.
|
||||
* http://www.panl10n.net/english/final%20reports/pdf%20files/Bangladesh/BAN16.pdf
|
||||
* </p>
|
||||
*/
|
||||
public class BengaliNormalizer {
|
||||
/**
|
||||
* Normalize an input buffer of Bengali text
|
||||
*
|
||||
* @param s input buffer
|
||||
* @param len length of input buffer
|
||||
* @return length of input buffer after normalization
|
||||
*/
|
||||
public int normalize(char s[], int len) {
|
||||
|
||||
for (int i = 0; i < len; i++) {
|
||||
switch (s[i]) {
|
||||
// delete Chandrabindu
|
||||
case '\u0981':
|
||||
len = delete(s, i, len);
|
||||
i--;
|
||||
break;
|
||||
|
||||
// DirghoI kar -> RosshoI kar
|
||||
case '\u09C0':
|
||||
s[i] = '\u09BF';
|
||||
break;
|
||||
|
||||
// DirghoU kar -> RosshoU kar
|
||||
case '\u09C2':
|
||||
s[i] = '\u09C1';
|
||||
break;
|
||||
|
||||
// Khio (Ka + Hoshonto + Murdorno Sh)
|
||||
case '\u0995':
|
||||
if(i + 2 < len && s[i+1] == '\u09CD' && s[i+2] == '\u09BF') {
|
||||
if (i == 0) {
|
||||
s[i] = '\u0996';
|
||||
len = delete(s, i + 2, len);
|
||||
len = delete(s, i + 1, len);
|
||||
} else {
|
||||
s[i+1] = '\u0996';
|
||||
len = delete(s, i + 2, len);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
// Nga to Anusvara
|
||||
case '\u0999':
|
||||
s[i] = '\u0982';
|
||||
break;
|
||||
|
||||
// Ja Phala
|
||||
case '\u09AF':
|
||||
if(i - 2 == 0 && s[i-1] == '\u09CD') {
|
||||
s[i - 1] = '\u09C7';
|
||||
|
||||
if(s[i+1] == '\u09BE') {
|
||||
len = delete(s, i+1, len);
|
||||
}
|
||||
len = delete(s, i, len);
|
||||
i --;
|
||||
} else {
|
||||
len = delete(s, i, len);
|
||||
len = delete(s, i-1, len);
|
||||
i -=2;
|
||||
}
|
||||
break;
|
||||
|
||||
// Ba Phalaa
|
||||
case '\u09AC':
|
||||
if((i >= 1 && s[i-1] != '\u09CD') || i == 0)
|
||||
break;
|
||||
if(i - 2 == 0) {
|
||||
len = delete(s, i, len);
|
||||
len = delete(s, i - 1, len);
|
||||
i -= 2;
|
||||
} else if(i - 5 >= 0 && s[i - 3] == '\u09CD') {
|
||||
len = delete(s, i, len);
|
||||
len = delete(s, i-1, len);
|
||||
i -=2;
|
||||
} else {
|
||||
s[i - 1] = s[i - 2];
|
||||
len = delete(s, i, len);
|
||||
i --;
|
||||
}
|
||||
break;
|
||||
|
||||
// Visarga
|
||||
case '\u0983':
|
||||
if(i == len -1) {
|
||||
if(len <= 3) {
|
||||
s[i] = '\u09B9';
|
||||
} else {
|
||||
len = delete(s, i, len);
|
||||
}
|
||||
} else {
|
||||
s[i] = s[i+1];
|
||||
}
|
||||
break;
|
||||
|
||||
//All sh
|
||||
case '\u09B6':
|
||||
case '\u09B7':
|
||||
s[i] = '\u09B8';
|
||||
break;
|
||||
|
||||
//check na
|
||||
case '\u09A3':
|
||||
s[i] = '\u09A8';
|
||||
break;
|
||||
|
||||
//check ra
|
||||
case '\u09DC':
|
||||
case '\u09DD':
|
||||
s[i] = '\u09B0';
|
||||
break;
|
||||
|
||||
case '\u09CE':
|
||||
s[i] = '\u09A4';
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return len;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,49 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.bn;
|
||||
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* A {@link TokenFilter} that applies {@link BengaliStemmer} to stem Bengali words.
|
||||
*/
|
||||
public final class BengaliStemFilter extends TokenFilter {
|
||||
private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
|
||||
private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class);
|
||||
private final BengaliStemmer bengaliStemmer = new BengaliStemmer();
|
||||
|
||||
public BengaliStemFilter(TokenStream input) {
|
||||
super(input);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (input.incrementToken()) {
|
||||
if (!keywordAttribute.isKeyword())
|
||||
termAttribute.setLength(bengaliStemmer.stem(termAttribute.buffer(), termAttribute.length()));
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,48 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.bn;
|
||||
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Factory for {@link BengaliStemFilter}.
|
||||
* <pre class="prettyprint">
|
||||
* <fieldType name="text_histem" class="solr.TextField" positionIncrementGap="100">
|
||||
* <analyzer>
|
||||
* <tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
* <filter class="solr.BengaliStemFilterFactory"/>
|
||||
* </analyzer>
|
||||
* </fieldType></pre>
|
||||
*/
|
||||
public class BengaliStemFilterFactory extends TokenFilterFactory {
|
||||
|
||||
public BengaliStemFilterFactory(Map<String,String> args) {
|
||||
super(args);
|
||||
if (!args.isEmpty()) {
|
||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new BengaliStemFilter(input);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,183 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.bn;
|
||||
|
||||
|
||||
import static org.apache.lucene.analysis.util.StemmerUtil.endsWith;
|
||||
|
||||
/**
|
||||
* Stemmer for Bengali.
|
||||
* <p>
|
||||
* The algorithm is based on the report in:
|
||||
* <i>Natural Language Processing in an Indian Language (Bengali)-I: Verb Phrase Analysis</i>
|
||||
* P Sengupta and B B Chaudhuri
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* Few Stemmer criteria are taken from:
|
||||
* <i>http://members.unine.ch/jacques.savoy/clef/BengaliStemmerLight.java.txt</i>
|
||||
* </p>
|
||||
*/
|
||||
public class BengaliStemmer {
|
||||
public int stem(char buffer[], int len) {
|
||||
|
||||
// 8
|
||||
if (len > 9 && (endsWith(buffer, len, "িয়াছিলাম")
|
||||
|| endsWith(buffer, len, "িতেছিলাম")
|
||||
|| endsWith(buffer, len, "িতেছিলেন")
|
||||
|| endsWith(buffer, len, "ইতেছিলেন")
|
||||
|| endsWith(buffer, len, "িয়াছিলেন")
|
||||
|| endsWith(buffer, len, "ইয়াছিলেন")
|
||||
))
|
||||
return len - 8;
|
||||
|
||||
// 7
|
||||
if ((len > 8) && (endsWith(buffer, len, "িতেছিলি")
|
||||
|| endsWith(buffer, len, "িতেছিলে")
|
||||
|| endsWith(buffer, len, "িয়াছিলা")
|
||||
|| endsWith(buffer, len, "িয়াছিলে")
|
||||
|| endsWith(buffer, len, "িতেছিলা")
|
||||
|| endsWith(buffer, len, "িয়াছিলি")
|
||||
|
||||
|| endsWith(buffer, len, "য়েদেরকে")
|
||||
))
|
||||
return len - 7;
|
||||
|
||||
// 6
|
||||
if ((len > 7) && (endsWith(buffer, len, "িতেছিস")
|
||||
|| endsWith(buffer, len, "িতেছেন")
|
||||
|| endsWith(buffer, len, "িয়াছিস")
|
||||
|| endsWith(buffer, len, "িয়াছেন")
|
||||
|| endsWith(buffer, len, "েছিলাম")
|
||||
|| endsWith(buffer, len, "েছিলেন")
|
||||
|
||||
|| endsWith(buffer, len, "েদেরকে")
|
||||
))
|
||||
return len - 6;
|
||||
|
||||
// 5
|
||||
if ((len > 6) && (endsWith(buffer, len, "িতেছি")
|
||||
|| endsWith(buffer, len, "িতেছা")
|
||||
|| endsWith(buffer, len, "িতেছে")
|
||||
|| endsWith(buffer, len, "ছিলাম")
|
||||
|| endsWith(buffer, len, "ছিলেন")
|
||||
|| endsWith(buffer, len, "িয়াছি")
|
||||
|| endsWith(buffer, len, "িয়াছা")
|
||||
|| endsWith(buffer, len, "িয়াছে")
|
||||
|| endsWith(buffer, len, "েছিলে")
|
||||
|| endsWith(buffer, len, "েছিলা")
|
||||
|
||||
|| endsWith(buffer, len, "য়েদের")
|
||||
|| endsWith(buffer, len, "দেরকে")
|
||||
))
|
||||
return len - 5;
|
||||
|
||||
// 4
|
||||
if ((len > 5) && (endsWith(buffer, len, "িলাম")
|
||||
|| endsWith(buffer, len, "িলেন")
|
||||
|| endsWith(buffer, len, "িতাম")
|
||||
|| endsWith(buffer, len, "িতেন")
|
||||
|| endsWith(buffer, len, "িবেন")
|
||||
|| endsWith(buffer, len, "ছিলি")
|
||||
|| endsWith(buffer, len, "ছিলে")
|
||||
|| endsWith(buffer, len, "ছিলা")
|
||||
|| endsWith(buffer, len, "তেছে")
|
||||
|| endsWith(buffer, len, "িতেছ")
|
||||
|
||||
|| endsWith(buffer, len, "খানা")
|
||||
|| endsWith(buffer, len, "খানি")
|
||||
|| endsWith(buffer, len, "গুলো")
|
||||
|| endsWith(buffer, len, "গুলি")
|
||||
|| endsWith(buffer, len, "য়েরা")
|
||||
|| endsWith(buffer, len, "েদের")
|
||||
))
|
||||
return len - 4;
|
||||
|
||||
// 3
|
||||
if ((len > 4) && (endsWith(buffer, len, "লাম")
|
||||
|| endsWith(buffer, len, "িলি")
|
||||
|| endsWith(buffer, len, "ইলি")
|
||||
|| endsWith(buffer, len, "িলে")
|
||||
|| endsWith(buffer, len, "ইলে")
|
||||
|| endsWith(buffer, len, "লেন")
|
||||
|| endsWith(buffer, len, "িলা")
|
||||
|| endsWith(buffer, len, "ইলা")
|
||||
|| endsWith(buffer, len, "তাম")
|
||||
|| endsWith(buffer, len, "িতি")
|
||||
|| endsWith(buffer, len, "ইতি")
|
||||
|| endsWith(buffer, len, "িতে")
|
||||
|| endsWith(buffer, len, "ইতে")
|
||||
|| endsWith(buffer, len, "তেন")
|
||||
|| endsWith(buffer, len, "িতা")
|
||||
|| endsWith(buffer, len, "িবা")
|
||||
|| endsWith(buffer, len, "ইবা")
|
||||
|| endsWith(buffer, len, "িবি")
|
||||
|| endsWith(buffer, len, "ইবি")
|
||||
|| endsWith(buffer, len, "বেন")
|
||||
|| endsWith(buffer, len, "িবে")
|
||||
|| endsWith(buffer, len, "ইবে")
|
||||
|| endsWith(buffer, len, "ছেন")
|
||||
|
||||
|| endsWith(buffer, len, "য়োন")
|
||||
|| endsWith(buffer, len, "য়ের")
|
||||
|| endsWith(buffer, len, "েরা")
|
||||
|| endsWith(buffer, len, "দের")
|
||||
))
|
||||
return len - 3;
|
||||
|
||||
// 2
|
||||
if ((len > 3) && (endsWith(buffer, len, "িস")
|
||||
|| endsWith(buffer, len, "েন")
|
||||
|| endsWith(buffer, len, "লি")
|
||||
|| endsWith(buffer, len, "লে")
|
||||
|| endsWith(buffer, len, "লা")
|
||||
|| endsWith(buffer, len, "তি")
|
||||
|| endsWith(buffer, len, "তে")
|
||||
|| endsWith(buffer, len, "তা")
|
||||
|| endsWith(buffer, len, "বি")
|
||||
|| endsWith(buffer, len, "বে")
|
||||
|| endsWith(buffer, len, "বা")
|
||||
|| endsWith(buffer, len, "ছি")
|
||||
|| endsWith(buffer, len, "ছা")
|
||||
|| endsWith(buffer, len, "ছে")
|
||||
|| endsWith(buffer, len, "ুন")
|
||||
|| endsWith(buffer, len, "ুক")
|
||||
|
||||
|| endsWith(buffer, len, "টা")
|
||||
|| endsWith(buffer, len, "টি")
|
||||
|| endsWith(buffer, len, "নি")
|
||||
|| endsWith(buffer, len, "ের")
|
||||
|| endsWith(buffer, len, "তে")
|
||||
|| endsWith(buffer, len, "রা")
|
||||
|| endsWith(buffer, len, "কে")
|
||||
))
|
||||
return len - 2;
|
||||
|
||||
// 1
|
||||
if ((len > 2) && (endsWith(buffer, len, "ি")
|
||||
|| endsWith(buffer, len, "ী")
|
||||
|| endsWith(buffer, len, "া")
|
||||
|| endsWith(buffer, len, "ো")
|
||||
|| endsWith(buffer, len, "ে")
|
||||
|| endsWith(buffer, len, "ব")
|
||||
|| endsWith(buffer, len, "ত")
|
||||
))
|
||||
return len - 1;
|
||||
|
||||
return len;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,21 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Analyzer for Bengali Language.
|
||||
*/
|
||||
package org.apache.lucene.analysis.bn;
|
|
@ -17,6 +17,8 @@ org.apache.lucene.analysis.tr.ApostropheFilterFactory
|
|||
org.apache.lucene.analysis.ar.ArabicNormalizationFilterFactory
|
||||
org.apache.lucene.analysis.ar.ArabicStemFilterFactory
|
||||
org.apache.lucene.analysis.bg.BulgarianStemFilterFactory
|
||||
org.apache.lucene.analysis.bn.BengaliNormalizationFilterFactory
|
||||
org.apache.lucene.analysis.bn.BengaliStemFilterFactory
|
||||
org.apache.lucene.analysis.br.BrazilianStemFilterFactory
|
||||
org.apache.lucene.analysis.cjk.CJKBigramFilterFactory
|
||||
org.apache.lucene.analysis.cjk.CJKWidthFilterFactory
|
||||
|
|
|
@ -0,0 +1,121 @@
|
|||
# See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||
# This file was created by Jacques Savoy and is distributed under the BSD license
|
||||
এই
|
||||
ও
|
||||
থেকে
|
||||
করে
|
||||
এ
|
||||
না
|
||||
ওই
|
||||
এক্
|
||||
নিয়ে
|
||||
করা
|
||||
বলেন
|
||||
সঙ্গে
|
||||
যে
|
||||
এব
|
||||
তা
|
||||
আর
|
||||
কোনো
|
||||
বলে
|
||||
সেই
|
||||
দিন
|
||||
হয়
|
||||
কি
|
||||
দু
|
||||
পরে
|
||||
সব
|
||||
দেওয়া
|
||||
মধ্যে
|
||||
এর
|
||||
সি
|
||||
শুরু
|
||||
কাজ
|
||||
কিছু
|
||||
কাছে
|
||||
সে
|
||||
তবে
|
||||
বা
|
||||
বন
|
||||
আগে
|
||||
জ্নজন
|
||||
পি
|
||||
পর
|
||||
তো
|
||||
ছিল
|
||||
এখন
|
||||
আমরা
|
||||
প্রায়
|
||||
দুই
|
||||
আমাদের
|
||||
তাই
|
||||
অন্য
|
||||
গিয়ে
|
||||
প্রযন্ত
|
||||
মনে
|
||||
নতুন
|
||||
মতো
|
||||
কেখা
|
||||
প্রথম
|
||||
আজ
|
||||
টি
|
||||
ধামার
|
||||
অনেক
|
||||
বিভিন্ন
|
||||
র
|
||||
হাজার
|
||||
জানা
|
||||
নয়
|
||||
অবশ্য
|
||||
বেশি
|
||||
এস
|
||||
করে
|
||||
কে
|
||||
হতে
|
||||
বি
|
||||
কয়েক
|
||||
সহ
|
||||
বেশ
|
||||
এমন
|
||||
এমনি
|
||||
কেন
|
||||
কেউ
|
||||
নেওয়া
|
||||
চেষ্টা
|
||||
লক্ষ
|
||||
বলা
|
||||
কারণ
|
||||
আছে
|
||||
শুধু
|
||||
তখন
|
||||
যা
|
||||
এসে
|
||||
চার
|
||||
ছিল
|
||||
যদি
|
||||
আবার
|
||||
কোটি
|
||||
উত্তর
|
||||
সামনে
|
||||
উপর
|
||||
বক্তব্য
|
||||
এত
|
||||
প্রাথমিক
|
||||
উপরে
|
||||
আছে
|
||||
প্রতি
|
||||
কাজে
|
||||
যখন
|
||||
খুব
|
||||
বহু
|
||||
গেল
|
||||
পেয়্র্
|
||||
চালু
|
||||
ই
|
||||
নাগাদ
|
||||
থাকা
|
||||
পাচ
|
||||
যাওয়া
|
||||
রকম
|
||||
সাধারণ
|
||||
কমনে
|
|
@ -0,0 +1,55 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.bn;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
|
||||
|
||||
/**
|
||||
* Tests the BengaliAnalyzer
|
||||
*/
|
||||
public class TestBengaliAnalyzer extends BaseTokenStreamTestCase {
|
||||
|
||||
public void testResourcesAvailable() {
|
||||
new BengaliAnalyzer().close();
|
||||
}
|
||||
|
||||
public void testBasics() throws Exception {
|
||||
Analyzer a = new BengaliAnalyzer();
|
||||
|
||||
checkOneTerm(a, "বাড়ী", "বার");
|
||||
checkOneTerm(a, "বারী", "বার");
|
||||
a.close();
|
||||
}
|
||||
/**
|
||||
* test Digits
|
||||
*/
|
||||
public void testDigits() throws Exception {
|
||||
BengaliAnalyzer a = new BengaliAnalyzer();
|
||||
checkOneTerm(a, "১২৩৪৫৬৭৮৯০", "1234567890");
|
||||
a.close();
|
||||
}
|
||||
|
||||
/** blast some random strings through the analyzer */
|
||||
public void testRandomStrings() throws Exception {
|
||||
Analyzer analyzer = new BengaliAnalyzer();
|
||||
checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
|
||||
analyzer.close();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,80 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.bn;
|
||||
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
|
||||
|
||||
/**
|
||||
* Test Bengali Filter Factory
|
||||
*/
|
||||
public class TestBengaliFilters extends BaseTokenStreamFactoryTestCase {
|
||||
/**
|
||||
* Test IndicNormalizationFilterFactory
|
||||
*/
|
||||
public void testIndicNormalizer() throws Exception {
|
||||
Reader reader = new StringReader("ত্ আমি");
|
||||
TokenStream stream = whitespaceMockTokenizer(reader);
|
||||
stream = tokenFilterFactory("IndicNormalization").create(stream);
|
||||
assertTokenStreamContents(stream, new String[] { "ৎ", "আমি" });
|
||||
}
|
||||
|
||||
/**
|
||||
* Test BengaliNormalizationFilterFactory
|
||||
*/
|
||||
public void testBengaliNormalizer() throws Exception {
|
||||
Reader reader = new StringReader("বাড়ী");
|
||||
TokenStream stream = whitespaceMockTokenizer(reader);
|
||||
stream = tokenFilterFactory("IndicNormalization").create(stream);
|
||||
stream = tokenFilterFactory("BengaliNormalization").create(stream);
|
||||
assertTokenStreamContents(stream, new String[] {"বারি"});
|
||||
}
|
||||
|
||||
/**
|
||||
* Test BengaliStemFilterFactory
|
||||
*/
|
||||
public void testStemmer() throws Exception {
|
||||
Reader reader = new StringReader("বাড়ী");
|
||||
TokenStream stream = whitespaceMockTokenizer(reader);
|
||||
stream = tokenFilterFactory("IndicNormalization").create(stream);
|
||||
stream = tokenFilterFactory("BengaliNormalization").create(stream);
|
||||
stream = tokenFilterFactory("BengaliStem").create(stream);
|
||||
assertTokenStreamContents(stream, new String[] {"বার"});
|
||||
}
|
||||
|
||||
/** Test that bogus arguments result in exception */
|
||||
public void testBogusArguments() throws Exception {
|
||||
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
|
||||
tokenFilterFactory("IndicNormalization", "bogusArg", "bogusValue");
|
||||
});
|
||||
assertTrue(expected.getMessage().contains("Unknown parameters"));
|
||||
|
||||
expected = expectThrows(IllegalArgumentException.class, () -> {
|
||||
tokenFilterFactory("BengaliNormalization", "bogusArg", "bogusValue");
|
||||
});
|
||||
assertTrue(expected.getMessage().contains("Unknown parameters"));
|
||||
|
||||
expected = expectThrows(IllegalArgumentException.class, () -> {
|
||||
tokenFilterFactory("BengaliStem", "bogusArg", "bogusValue");
|
||||
});
|
||||
assertTrue(expected.getMessage().contains("Unknown parameters"));
|
||||
}
|
||||
}
|
|
@ -0,0 +1,93 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.bn;
|
||||
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Test BengaliNormalizer
|
||||
*/
|
||||
public class TestBengaliNormalizer extends BaseTokenStreamTestCase {
|
||||
/**
|
||||
* Test some basic normalization, with an example from the paper.
|
||||
*/
|
||||
public void testChndrobindu() throws IOException {
|
||||
check("চাঁদ", "চাদ");
|
||||
}
|
||||
|
||||
public void testRosshoIKar() throws IOException {
|
||||
check("বাড়ী", "বারি");
|
||||
check("তীর", "তির");
|
||||
}
|
||||
|
||||
public void testRosshoUKar() throws IOException {
|
||||
check("ভূল", "ভুল");
|
||||
check("অনূপ", "অনুপ");
|
||||
}
|
||||
|
||||
public void testNga() throws IOException {
|
||||
check("বাঙলা", "বাংলা");
|
||||
}
|
||||
|
||||
public void testJaPhaala() throws IOException {
|
||||
check("ব্যাক্তি", "বেক্তি");
|
||||
check( "সন্ধ্যা", "সন্ধা");
|
||||
}
|
||||
|
||||
public void testBaPhalaa() throws IOException {
|
||||
check("স্বদেশ", "সদেস");
|
||||
check("তত্ত্ব", "তত্ত");
|
||||
check("বিশ্ব", "বিসস");
|
||||
}
|
||||
|
||||
public void testVisarga() throws IOException {
|
||||
check("দুঃখ", "দুখখ");
|
||||
check("উঃ", "উহ");
|
||||
check("পুনঃ", "পুন");
|
||||
}
|
||||
|
||||
public void testBasics() throws IOException {
|
||||
check("কণা", "কনা");
|
||||
check("শরীর", "সরির");
|
||||
check("বাড়ি", "বারি");
|
||||
}
|
||||
|
||||
private void check(String input, String output) throws IOException {
|
||||
Tokenizer tokenizer = whitespaceMockTokenizer(input);
|
||||
TokenFilter tf = new BengaliNormalizationFilter(tokenizer);
|
||||
assertTokenStreamContents(tf, new String[] { output });
|
||||
}
|
||||
|
||||
public void testEmptyTerm() throws IOException {
|
||||
Analyzer a = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName) {
|
||||
Tokenizer tokenizer = new KeywordTokenizer();
|
||||
return new TokenStreamComponents(tokenizer, new BengaliNormalizationFilter(tokenizer));
|
||||
}
|
||||
};
|
||||
checkOneTerm(a, "", "");
|
||||
a.close();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,79 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.bn;
|
||||
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Test Codes for BengaliStemmer
|
||||
*/
|
||||
public class TestBengaliStemmer extends BaseTokenStreamTestCase {
|
||||
|
||||
/**
|
||||
* Testing few verbal words
|
||||
*/
|
||||
public void testVerbsInShadhuForm() throws IOException {
|
||||
check("করেছিলাম", "কর");
|
||||
check("করিতেছিলে", "কর");
|
||||
check("খাইতাম", "খাই");
|
||||
check("যাইবে", "যা");
|
||||
}
|
||||
|
||||
public void testVerbsInCholitoForm() throws IOException {
|
||||
check("করছিলাম", "কর");
|
||||
check("করছিলে", "কর");
|
||||
check("করতাম", "কর");
|
||||
check("যাব", "যা");
|
||||
check("যাবে", "যা");
|
||||
check("করি", "কর");
|
||||
check("করো", "কর");
|
||||
}
|
||||
|
||||
public void testNouns() throws IOException {
|
||||
check("মেয়েরা", "মে");
|
||||
check("মেয়েদেরকে", "মে");
|
||||
check("মেয়েদের", "মে");
|
||||
|
||||
check("একটি", "এক");
|
||||
check("মানুষগুলি", "মানুষ");
|
||||
}
|
||||
|
||||
private void check(String input, String output) throws IOException {
|
||||
Tokenizer tokenizer = whitespaceMockTokenizer(input);
|
||||
TokenFilter tf = new BengaliStemFilter(tokenizer);
|
||||
assertTokenStreamContents(tf, new String[] { output });
|
||||
}
|
||||
|
||||
public void testEmptyTerm() throws IOException {
|
||||
Analyzer a = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName) {
|
||||
Tokenizer tokenizer = new KeywordTokenizer();
|
||||
return new TokenStreamComponents(tokenizer, new BengaliStemFilter(tokenizer));
|
||||
}
|
||||
};
|
||||
checkOneTerm(a, "", "");
|
||||
a.close();
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue