mirror of
https://github.com/apache/lucene.git
synced 2025-02-13 13:35:37 +00:00
commit
08128f712f
@ -54,13 +54,14 @@ The KStem stemmer in
|
|||||||
was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
|
was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
|
||||||
under the BSD-license.
|
under the BSD-license.
|
||||||
|
|
||||||
The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default
|
The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default
|
||||||
stopword list that is BSD-licensed created by Jacques Savoy. These files reside in:
|
stopword list that is BSD-licensed created by Jacques Savoy. These files reside in:
|
||||||
analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
|
analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
|
||||||
analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
|
analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
|
||||||
analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
|
analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
|
||||||
analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
|
analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
|
||||||
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt
|
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt,
|
||||||
|
analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
|
||||||
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||||
|
|
||||||
The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
|
The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
|
||||||
|
@ -0,0 +1,132 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.bn;
|
||||||
|
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.*;
|
||||||
|
import org.apache.lucene.analysis.core.DecimalDigitFilter;
|
||||||
|
import org.apache.lucene.analysis.in.IndicNormalizationFilter;
|
||||||
|
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Analyzer for Bengali.
|
||||||
|
*/
|
||||||
|
public final class BengaliAnalyzer extends StopwordAnalyzerBase {
|
||||||
|
private final CharArraySet stemExclusionSet;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* File containing default Bengali stopwords.
|
||||||
|
*
|
||||||
|
* Default stopword list is from http://members.unine.ch/jacques.savoy/clef/bengaliST.txt
|
||||||
|
* The stopword list is BSD-Licensed.
|
||||||
|
*/
|
||||||
|
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
||||||
|
private static final String STOPWORDS_COMMENT = "#";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns an unmodifiable instance of the default stop-words set.
|
||||||
|
* @return an unmodifiable instance of the default stop-words set.
|
||||||
|
*/
|
||||||
|
public static CharArraySet getDefaultStopSet(){
|
||||||
|
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
|
||||||
|
* accesses the static final set the first time.;
|
||||||
|
*/
|
||||||
|
private static class DefaultSetHolder {
|
||||||
|
static final CharArraySet DEFAULT_STOP_SET;
|
||||||
|
|
||||||
|
static {
|
||||||
|
try {
|
||||||
|
DEFAULT_STOP_SET = loadStopwordSet(false, BengaliAnalyzer.class, DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
|
||||||
|
} catch (IOException ex) {
|
||||||
|
throw new RuntimeException("Unable to load default stopword set");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the given stop words
|
||||||
|
*
|
||||||
|
* @param stopwords a stopword set
|
||||||
|
* @param stemExclusionSet a stemming exclusion set
|
||||||
|
*/
|
||||||
|
public BengaliAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||||
|
super(stopwords);
|
||||||
|
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the given stop words
|
||||||
|
*
|
||||||
|
* @param stopwords a stopword set
|
||||||
|
*/
|
||||||
|
public BengaliAnalyzer(CharArraySet stopwords) {
|
||||||
|
this(stopwords, CharArraySet.EMPTY_SET);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the default stop words:
|
||||||
|
* {@link #DEFAULT_STOPWORD_FILE}.
|
||||||
|
*/
|
||||||
|
public BengaliAnalyzer() {
|
||||||
|
this(DefaultSetHolder.DEFAULT_STOP_SET);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates
|
||||||
|
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
|
||||||
|
* used to tokenize all the text in the provided {@link Reader}.
|
||||||
|
*
|
||||||
|
* @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
|
||||||
|
* built from a {@link StandardTokenizer} filtered with
|
||||||
|
* {@link LowerCaseFilter}, {@link DecimalDigitFilter}, {@link IndicNormalizationFilter},
|
||||||
|
* {@link BengaliNormalizationFilter}, {@link SetKeywordMarkerFilter}
|
||||||
|
* if a stem exclusion set is provided, {@link BengaliStemFilter}, and
|
||||||
|
* Bengali Stop words
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName) {
|
||||||
|
final Tokenizer source = new StandardTokenizer();
|
||||||
|
TokenStream result = new LowerCaseFilter(source);
|
||||||
|
result = new DecimalDigitFilter(result);
|
||||||
|
if (!stemExclusionSet.isEmpty())
|
||||||
|
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
|
||||||
|
result = new IndicNormalizationFilter(result);
|
||||||
|
result = new BengaliNormalizationFilter(result);
|
||||||
|
result = new StopFilter(result, stopwords);
|
||||||
|
result = new BengaliStemFilter(result);
|
||||||
|
return new TokenStreamComponents(source, result);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||||
|
TokenStream result = new StandardFilter(in);
|
||||||
|
result = new LowerCaseFilter(result);
|
||||||
|
result = new DecimalDigitFilter(result);
|
||||||
|
result = new IndicNormalizationFilter(result);
|
||||||
|
result = new BengaliNormalizationFilter(result);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,59 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.bn;
|
||||||
|
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A {@link TokenFilter} that applies {@link BengaliNormalizer} to normalize the
|
||||||
|
* orthography.
|
||||||
|
* <p>
|
||||||
|
* In some cases the normalization may cause unrelated terms to conflate, so
|
||||||
|
* to prevent terms from being normalized use an instance of
|
||||||
|
* {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
|
||||||
|
* the {@link KeywordAttribute} before this {@link TokenStream}.
|
||||||
|
* </p>
|
||||||
|
* @see BengaliNormalizer
|
||||||
|
*/
|
||||||
|
public final class BengaliNormalizationFilter extends TokenFilter {
|
||||||
|
|
||||||
|
private final BengaliNormalizer normalizer = new BengaliNormalizer();
|
||||||
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
|
private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
|
||||||
|
|
||||||
|
public BengaliNormalizationFilter(TokenStream input) {
|
||||||
|
super(input);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean incrementToken() throws IOException {
|
||||||
|
if (input.incrementToken()) {
|
||||||
|
if (!keywordAtt.isKeyword())
|
||||||
|
termAtt.setLength(normalizer.normalize(termAtt.buffer(),
|
||||||
|
termAtt.length()));
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,55 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.bn;
|
||||||
|
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
|
||||||
|
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
||||||
|
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Factory for {@link BengaliNormalizationFilter}.
|
||||||
|
* <pre class="prettyprint">
|
||||||
|
* <fieldType name="text_bnnormal" class="solr.TextField" positionIncrementGap="100">
|
||||||
|
* <analyzer>
|
||||||
|
* <tokenizer class="solr.StandardTokenizerFactory"/>
|
||||||
|
* <filter class="solr.BengaliNormalizationFilterFactory"/>
|
||||||
|
* </analyzer>
|
||||||
|
* </fieldType></pre>
|
||||||
|
*/
|
||||||
|
public class BengaliNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
|
||||||
|
|
||||||
|
public BengaliNormalizationFilterFactory(Map<String,String> args) {
|
||||||
|
super(args);
|
||||||
|
if (!args.isEmpty()) {
|
||||||
|
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TokenStream create(TokenStream input) {
|
||||||
|
return new BengaliNormalizationFilter(input);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public AbstractAnalysisFactory getMultiTermComponent() {
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,155 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.bn;
|
||||||
|
|
||||||
|
|
||||||
|
import static org.apache.lucene.analysis.util.StemmerUtil.delete;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Normalizer for Bengali.
|
||||||
|
* <p>
|
||||||
|
* Implements the Bengali-language specific algorithm specified in:
|
||||||
|
* <i>A Double Metaphone encoding for Bangla and its application in spelling checker</i>
|
||||||
|
* Naushad UzZaman and Mumit Khan.
|
||||||
|
* http://www.panl10n.net/english/final%20reports/pdf%20files/Bangladesh/BAN16.pdf
|
||||||
|
* </p>
|
||||||
|
*/
|
||||||
|
public class BengaliNormalizer {
|
||||||
|
/**
|
||||||
|
* Normalize an input buffer of Bengali text
|
||||||
|
*
|
||||||
|
* @param s input buffer
|
||||||
|
* @param len length of input buffer
|
||||||
|
* @return length of input buffer after normalization
|
||||||
|
*/
|
||||||
|
public int normalize(char s[], int len) {
|
||||||
|
|
||||||
|
for (int i = 0; i < len; i++) {
|
||||||
|
switch (s[i]) {
|
||||||
|
// delete Chandrabindu
|
||||||
|
case '\u0981':
|
||||||
|
len = delete(s, i, len);
|
||||||
|
i--;
|
||||||
|
break;
|
||||||
|
|
||||||
|
// DirghoI kar -> RosshoI kar
|
||||||
|
case '\u09C0':
|
||||||
|
s[i] = '\u09BF';
|
||||||
|
break;
|
||||||
|
|
||||||
|
// DirghoU kar -> RosshoU kar
|
||||||
|
case '\u09C2':
|
||||||
|
s[i] = '\u09C1';
|
||||||
|
break;
|
||||||
|
|
||||||
|
// Khio (Ka + Hoshonto + Murdorno Sh)
|
||||||
|
case '\u0995':
|
||||||
|
if(i + 2 < len && s[i+1] == '\u09CD' && s[i+2] == '\u09BF') {
|
||||||
|
if (i == 0) {
|
||||||
|
s[i] = '\u0996';
|
||||||
|
len = delete(s, i + 2, len);
|
||||||
|
len = delete(s, i + 1, len);
|
||||||
|
} else {
|
||||||
|
s[i+1] = '\u0996';
|
||||||
|
len = delete(s, i + 2, len);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
// Nga to Anusvara
|
||||||
|
case '\u0999':
|
||||||
|
s[i] = '\u0982';
|
||||||
|
break;
|
||||||
|
|
||||||
|
// Ja Phala
|
||||||
|
case '\u09AF':
|
||||||
|
if(i - 2 == 0 && s[i-1] == '\u09CD') {
|
||||||
|
s[i - 1] = '\u09C7';
|
||||||
|
|
||||||
|
if(i + 1 < len && s[i+1] == '\u09BE') {
|
||||||
|
len = delete(s, i+1, len);
|
||||||
|
}
|
||||||
|
len = delete(s, i, len);
|
||||||
|
i --;
|
||||||
|
} else if(i - 1 >= 0 && s[i-1] == '\u09CD' ){
|
||||||
|
len = delete(s, i, len);
|
||||||
|
len = delete(s, i-1, len);
|
||||||
|
i -=2;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
// Ba Phalaa
|
||||||
|
case '\u09AC':
|
||||||
|
if((i >= 1 && s[i-1] != '\u09CD') || i == 0)
|
||||||
|
break;
|
||||||
|
if(i - 2 == 0) {
|
||||||
|
len = delete(s, i, len);
|
||||||
|
len = delete(s, i - 1, len);
|
||||||
|
i -= 2;
|
||||||
|
} else if(i - 5 >= 0 && s[i - 3] == '\u09CD') {
|
||||||
|
len = delete(s, i, len);
|
||||||
|
len = delete(s, i-1, len);
|
||||||
|
i -=2;
|
||||||
|
} else if(i - 2 >= 0){
|
||||||
|
s[i - 1] = s[i - 2];
|
||||||
|
len = delete(s, i, len);
|
||||||
|
i --;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
// Visarga
|
||||||
|
case '\u0983':
|
||||||
|
if(i == len -1) {
|
||||||
|
if(len <= 3) {
|
||||||
|
s[i] = '\u09B9';
|
||||||
|
} else {
|
||||||
|
len = delete(s, i, len);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
s[i] = s[i+1];
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
//All sh
|
||||||
|
case '\u09B6':
|
||||||
|
case '\u09B7':
|
||||||
|
s[i] = '\u09B8';
|
||||||
|
break;
|
||||||
|
|
||||||
|
//check na
|
||||||
|
case '\u09A3':
|
||||||
|
s[i] = '\u09A8';
|
||||||
|
break;
|
||||||
|
|
||||||
|
//check ra
|
||||||
|
case '\u09DC':
|
||||||
|
case '\u09DD':
|
||||||
|
s[i] = '\u09B0';
|
||||||
|
break;
|
||||||
|
|
||||||
|
case '\u09CE':
|
||||||
|
s[i] = '\u09A4';
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,49 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.bn;
|
||||||
|
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A {@link TokenFilter} that applies {@link BengaliStemmer} to stem Bengali words.
|
||||||
|
*/
|
||||||
|
public final class BengaliStemFilter extends TokenFilter {
|
||||||
|
private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
|
||||||
|
private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class);
|
||||||
|
private final BengaliStemmer bengaliStemmer = new BengaliStemmer();
|
||||||
|
|
||||||
|
public BengaliStemFilter(TokenStream input) {
|
||||||
|
super(input);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean incrementToken() throws IOException {
|
||||||
|
if (input.incrementToken()) {
|
||||||
|
if (!keywordAttribute.isKeyword())
|
||||||
|
termAttribute.setLength(bengaliStemmer.stem(termAttribute.buffer(), termAttribute.length()));
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,48 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.bn;
|
||||||
|
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Factory for {@link BengaliStemFilter}.
|
||||||
|
* <pre class="prettyprint">
|
||||||
|
 * <fieldType name="text_bnstem" class="solr.TextField" positionIncrementGap="100">
|
||||||
|
* <analyzer>
|
||||||
|
* <tokenizer class="solr.StandardTokenizerFactory"/>
|
||||||
|
* <filter class="solr.BengaliStemFilterFactory"/>
|
||||||
|
* </analyzer>
|
||||||
|
* </fieldType></pre>
|
||||||
|
*/
|
||||||
|
public class BengaliStemFilterFactory extends TokenFilterFactory {
|
||||||
|
|
||||||
|
public BengaliStemFilterFactory(Map<String,String> args) {
|
||||||
|
super(args);
|
||||||
|
if (!args.isEmpty()) {
|
||||||
|
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TokenStream create(TokenStream input) {
|
||||||
|
return new BengaliStemFilter(input);
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,183 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.bn;
|
||||||
|
|
||||||
|
|
||||||
|
import static org.apache.lucene.analysis.util.StemmerUtil.endsWith;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Stemmer for Bengali.
|
||||||
|
* <p>
|
||||||
|
* The algorithm is based on the report in:
|
||||||
|
* <i>Natural Language Processing in an Indian Language (Bengali)-I: Verb Phrase Analysis</i>
|
||||||
|
* P Sengupta and B B Chaudhuri
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
 * A few stemmer criteria are taken from:
|
||||||
|
* <i>http://members.unine.ch/jacques.savoy/clef/BengaliStemmerLight.java.txt</i>
|
||||||
|
* </p>
|
||||||
|
*/
|
||||||
|
public class BengaliStemmer {
|
||||||
|
public int stem(char buffer[], int len) {
|
||||||
|
|
||||||
|
// 8
|
||||||
|
if (len > 9 && (endsWith(buffer, len, "িয়াছিলাম")
|
||||||
|
|| endsWith(buffer, len, "িতেছিলাম")
|
||||||
|
|| endsWith(buffer, len, "িতেছিলেন")
|
||||||
|
|| endsWith(buffer, len, "ইতেছিলেন")
|
||||||
|
|| endsWith(buffer, len, "িয়াছিলেন")
|
||||||
|
|| endsWith(buffer, len, "ইয়াছিলেন")
|
||||||
|
))
|
||||||
|
return len - 8;
|
||||||
|
|
||||||
|
// 7
|
||||||
|
if ((len > 8) && (endsWith(buffer, len, "িতেছিলি")
|
||||||
|
|| endsWith(buffer, len, "িতেছিলে")
|
||||||
|
|| endsWith(buffer, len, "িয়াছিলা")
|
||||||
|
|| endsWith(buffer, len, "িয়াছিলে")
|
||||||
|
|| endsWith(buffer, len, "িতেছিলা")
|
||||||
|
|| endsWith(buffer, len, "িয়াছিলি")
|
||||||
|
|
||||||
|
|| endsWith(buffer, len, "য়েদেরকে")
|
||||||
|
))
|
||||||
|
return len - 7;
|
||||||
|
|
||||||
|
// 6
|
||||||
|
if ((len > 7) && (endsWith(buffer, len, "িতেছিস")
|
||||||
|
|| endsWith(buffer, len, "িতেছেন")
|
||||||
|
|| endsWith(buffer, len, "িয়াছিস")
|
||||||
|
|| endsWith(buffer, len, "িয়াছেন")
|
||||||
|
|| endsWith(buffer, len, "েছিলাম")
|
||||||
|
|| endsWith(buffer, len, "েছিলেন")
|
||||||
|
|
||||||
|
|| endsWith(buffer, len, "েদেরকে")
|
||||||
|
))
|
||||||
|
return len - 6;
|
||||||
|
|
||||||
|
// 5
|
||||||
|
if ((len > 6) && (endsWith(buffer, len, "িতেছি")
|
||||||
|
|| endsWith(buffer, len, "িতেছা")
|
||||||
|
|| endsWith(buffer, len, "িতেছে")
|
||||||
|
|| endsWith(buffer, len, "ছিলাম")
|
||||||
|
|| endsWith(buffer, len, "ছিলেন")
|
||||||
|
|| endsWith(buffer, len, "িয়াছি")
|
||||||
|
|| endsWith(buffer, len, "িয়াছা")
|
||||||
|
|| endsWith(buffer, len, "িয়াছে")
|
||||||
|
|| endsWith(buffer, len, "েছিলে")
|
||||||
|
|| endsWith(buffer, len, "েছিলা")
|
||||||
|
|
||||||
|
|| endsWith(buffer, len, "য়েদের")
|
||||||
|
|| endsWith(buffer, len, "দেরকে")
|
||||||
|
))
|
||||||
|
return len - 5;
|
||||||
|
|
||||||
|
// 4
|
||||||
|
if ((len > 5) && (endsWith(buffer, len, "িলাম")
|
||||||
|
|| endsWith(buffer, len, "িলেন")
|
||||||
|
|| endsWith(buffer, len, "িতাম")
|
||||||
|
|| endsWith(buffer, len, "িতেন")
|
||||||
|
|| endsWith(buffer, len, "িবেন")
|
||||||
|
|| endsWith(buffer, len, "ছিলি")
|
||||||
|
|| endsWith(buffer, len, "ছিলে")
|
||||||
|
|| endsWith(buffer, len, "ছিলা")
|
||||||
|
|| endsWith(buffer, len, "তেছে")
|
||||||
|
|| endsWith(buffer, len, "িতেছ")
|
||||||
|
|
||||||
|
|| endsWith(buffer, len, "খানা")
|
||||||
|
|| endsWith(buffer, len, "খানি")
|
||||||
|
|| endsWith(buffer, len, "গুলো")
|
||||||
|
|| endsWith(buffer, len, "গুলি")
|
||||||
|
|| endsWith(buffer, len, "য়েরা")
|
||||||
|
|| endsWith(buffer, len, "েদের")
|
||||||
|
))
|
||||||
|
return len - 4;
|
||||||
|
|
||||||
|
// 3
|
||||||
|
if ((len > 4) && (endsWith(buffer, len, "লাম")
|
||||||
|
|| endsWith(buffer, len, "িলি")
|
||||||
|
|| endsWith(buffer, len, "ইলি")
|
||||||
|
|| endsWith(buffer, len, "িলে")
|
||||||
|
|| endsWith(buffer, len, "ইলে")
|
||||||
|
|| endsWith(buffer, len, "লেন")
|
||||||
|
|| endsWith(buffer, len, "িলা")
|
||||||
|
|| endsWith(buffer, len, "ইলা")
|
||||||
|
|| endsWith(buffer, len, "তাম")
|
||||||
|
|| endsWith(buffer, len, "িতি")
|
||||||
|
|| endsWith(buffer, len, "ইতি")
|
||||||
|
|| endsWith(buffer, len, "িতে")
|
||||||
|
|| endsWith(buffer, len, "ইতে")
|
||||||
|
|| endsWith(buffer, len, "তেন")
|
||||||
|
|| endsWith(buffer, len, "িতা")
|
||||||
|
|| endsWith(buffer, len, "িবা")
|
||||||
|
|| endsWith(buffer, len, "ইবা")
|
||||||
|
|| endsWith(buffer, len, "িবি")
|
||||||
|
|| endsWith(buffer, len, "ইবি")
|
||||||
|
|| endsWith(buffer, len, "বেন")
|
||||||
|
|| endsWith(buffer, len, "িবে")
|
||||||
|
|| endsWith(buffer, len, "ইবে")
|
||||||
|
|| endsWith(buffer, len, "ছেন")
|
||||||
|
|
||||||
|
|| endsWith(buffer, len, "য়োন")
|
||||||
|
|| endsWith(buffer, len, "য়ের")
|
||||||
|
|| endsWith(buffer, len, "েরা")
|
||||||
|
|| endsWith(buffer, len, "দের")
|
||||||
|
))
|
||||||
|
return len - 3;
|
||||||
|
|
||||||
|
// 2
|
||||||
|
if ((len > 3) && (endsWith(buffer, len, "িস")
|
||||||
|
|| endsWith(buffer, len, "েন")
|
||||||
|
|| endsWith(buffer, len, "লি")
|
||||||
|
|| endsWith(buffer, len, "লে")
|
||||||
|
|| endsWith(buffer, len, "লা")
|
||||||
|
|| endsWith(buffer, len, "তি")
|
||||||
|
|| endsWith(buffer, len, "তে")
|
||||||
|
|| endsWith(buffer, len, "তা")
|
||||||
|
|| endsWith(buffer, len, "বি")
|
||||||
|
|| endsWith(buffer, len, "বে")
|
||||||
|
|| endsWith(buffer, len, "বা")
|
||||||
|
|| endsWith(buffer, len, "ছি")
|
||||||
|
|| endsWith(buffer, len, "ছা")
|
||||||
|
|| endsWith(buffer, len, "ছে")
|
||||||
|
|| endsWith(buffer, len, "ুন")
|
||||||
|
|| endsWith(buffer, len, "ুক")
|
||||||
|
|
||||||
|
|| endsWith(buffer, len, "টা")
|
||||||
|
|| endsWith(buffer, len, "টি")
|
||||||
|
|| endsWith(buffer, len, "নি")
|
||||||
|
|| endsWith(buffer, len, "ের")
|
||||||
|
|| endsWith(buffer, len, "তে")
|
||||||
|
|| endsWith(buffer, len, "রা")
|
||||||
|
|| endsWith(buffer, len, "কে")
|
||||||
|
))
|
||||||
|
return len - 2;
|
||||||
|
|
||||||
|
// 1
|
||||||
|
if ((len > 2) && (endsWith(buffer, len, "ি")
|
||||||
|
|| endsWith(buffer, len, "ী")
|
||||||
|
|| endsWith(buffer, len, "া")
|
||||||
|
|| endsWith(buffer, len, "ো")
|
||||||
|
|| endsWith(buffer, len, "ে")
|
||||||
|
|| endsWith(buffer, len, "ব")
|
||||||
|
|| endsWith(buffer, len, "ত")
|
||||||
|
))
|
||||||
|
return len - 1;
|
||||||
|
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,21 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Analyzer for Bengali Language.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.bn;
|
@ -17,6 +17,8 @@ org.apache.lucene.analysis.tr.ApostropheFilterFactory
|
|||||||
org.apache.lucene.analysis.ar.ArabicNormalizationFilterFactory
|
org.apache.lucene.analysis.ar.ArabicNormalizationFilterFactory
|
||||||
org.apache.lucene.analysis.ar.ArabicStemFilterFactory
|
org.apache.lucene.analysis.ar.ArabicStemFilterFactory
|
||||||
org.apache.lucene.analysis.bg.BulgarianStemFilterFactory
|
org.apache.lucene.analysis.bg.BulgarianStemFilterFactory
|
||||||
|
org.apache.lucene.analysis.bn.BengaliNormalizationFilterFactory
|
||||||
|
org.apache.lucene.analysis.bn.BengaliStemFilterFactory
|
||||||
org.apache.lucene.analysis.br.BrazilianStemFilterFactory
|
org.apache.lucene.analysis.br.BrazilianStemFilterFactory
|
||||||
org.apache.lucene.analysis.cjk.CJKBigramFilterFactory
|
org.apache.lucene.analysis.cjk.CJKBigramFilterFactory
|
||||||
org.apache.lucene.analysis.cjk.CJKWidthFilterFactory
|
org.apache.lucene.analysis.cjk.CJKWidthFilterFactory
|
||||||
|
@ -0,0 +1,121 @@
|
|||||||
|
# See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||||
|
# This file was created by Jacques Savoy and is distributed under the BSD license
|
||||||
|
এই
|
||||||
|
ও
|
||||||
|
থেকে
|
||||||
|
করে
|
||||||
|
এ
|
||||||
|
না
|
||||||
|
ওই
|
||||||
|
এক্
|
||||||
|
নিয়ে
|
||||||
|
করা
|
||||||
|
বলেন
|
||||||
|
সঙ্গে
|
||||||
|
যে
|
||||||
|
এব
|
||||||
|
তা
|
||||||
|
আর
|
||||||
|
কোনো
|
||||||
|
বলে
|
||||||
|
সেই
|
||||||
|
দিন
|
||||||
|
হয়
|
||||||
|
কি
|
||||||
|
দু
|
||||||
|
পরে
|
||||||
|
সব
|
||||||
|
দেওয়া
|
||||||
|
মধ্যে
|
||||||
|
এর
|
||||||
|
সি
|
||||||
|
শুরু
|
||||||
|
কাজ
|
||||||
|
কিছু
|
||||||
|
কাছে
|
||||||
|
সে
|
||||||
|
তবে
|
||||||
|
বা
|
||||||
|
বন
|
||||||
|
আগে
|
||||||
|
জ্নজন
|
||||||
|
পি
|
||||||
|
পর
|
||||||
|
তো
|
||||||
|
ছিল
|
||||||
|
এখন
|
||||||
|
আমরা
|
||||||
|
প্রায়
|
||||||
|
দুই
|
||||||
|
আমাদের
|
||||||
|
তাই
|
||||||
|
অন্য
|
||||||
|
গিয়ে
|
||||||
|
প্রযন্ত
|
||||||
|
মনে
|
||||||
|
নতুন
|
||||||
|
মতো
|
||||||
|
কেখা
|
||||||
|
প্রথম
|
||||||
|
আজ
|
||||||
|
টি
|
||||||
|
ধামার
|
||||||
|
অনেক
|
||||||
|
বিভিন্ন
|
||||||
|
র
|
||||||
|
হাজার
|
||||||
|
জানা
|
||||||
|
নয়
|
||||||
|
অবশ্য
|
||||||
|
বেশি
|
||||||
|
এস
|
||||||
|
করে
|
||||||
|
কে
|
||||||
|
হতে
|
||||||
|
বি
|
||||||
|
কয়েক
|
||||||
|
সহ
|
||||||
|
বেশ
|
||||||
|
এমন
|
||||||
|
এমনি
|
||||||
|
কেন
|
||||||
|
কেউ
|
||||||
|
নেওয়া
|
||||||
|
চেষ্টা
|
||||||
|
লক্ষ
|
||||||
|
বলা
|
||||||
|
কারণ
|
||||||
|
আছে
|
||||||
|
শুধু
|
||||||
|
তখন
|
||||||
|
যা
|
||||||
|
এসে
|
||||||
|
চার
|
||||||
|
ছিল
|
||||||
|
যদি
|
||||||
|
আবার
|
||||||
|
কোটি
|
||||||
|
উত্তর
|
||||||
|
সামনে
|
||||||
|
উপর
|
||||||
|
বক্তব্য
|
||||||
|
এত
|
||||||
|
প্রাথমিক
|
||||||
|
উপরে
|
||||||
|
আছে
|
||||||
|
প্রতি
|
||||||
|
কাজে
|
||||||
|
যখন
|
||||||
|
খুব
|
||||||
|
বহু
|
||||||
|
গেল
|
||||||
|
পেয়্র্
|
||||||
|
চালু
|
||||||
|
ই
|
||||||
|
নাগাদ
|
||||||
|
থাকা
|
||||||
|
পাচ
|
||||||
|
যাওয়া
|
||||||
|
রকম
|
||||||
|
সাধারণ
|
||||||
|
কমনে
|
@ -0,0 +1,53 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.bn;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tests the BengaliAnalyzer
|
||||||
|
*/
|
||||||
|
public class TestBengaliAnalyzer extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
|
public void testResourcesAvailable() {
|
||||||
|
new BengaliAnalyzer().close();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testBasics() throws Exception {
|
||||||
|
Analyzer a = new BengaliAnalyzer();
|
||||||
|
|
||||||
|
checkOneTerm(a, "বাড়ী", "বার");
|
||||||
|
checkOneTerm(a, "বারী", "বার");
|
||||||
|
a.close();
|
||||||
|
}
|
||||||
|
/**
|
||||||
|
* test Digits
|
||||||
|
*/
|
||||||
|
public void testDigits() throws Exception {
|
||||||
|
BengaliAnalyzer a = new BengaliAnalyzer();
|
||||||
|
checkOneTerm(a, "১২৩৪৫৬৭৮৯০", "1234567890");
|
||||||
|
a.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** blast some random strings through the analyzer */
|
||||||
|
public void testRandomStrings() throws Exception {
|
||||||
|
Analyzer analyzer = new BengaliAnalyzer();
|
||||||
|
checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
|
||||||
|
analyzer.close();
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,80 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.bn;
|
||||||
|
|
||||||
|
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.io.StringReader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test Bengali Filter Factory
|
||||||
|
*/
|
||||||
|
public class TestBengaliFilters extends BaseTokenStreamFactoryTestCase {
|
||||||
|
/**
|
||||||
|
* Test IndicNormalizationFilterFactory
|
||||||
|
*/
|
||||||
|
public void testIndicNormalizer() throws Exception {
|
||||||
|
Reader reader = new StringReader("ত্ আমি");
|
||||||
|
TokenStream stream = whitespaceMockTokenizer(reader);
|
||||||
|
stream = tokenFilterFactory("IndicNormalization").create(stream);
|
||||||
|
assertTokenStreamContents(stream, new String[] { "ৎ", "আমি" });
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test BengaliNormalizationFilterFactory
|
||||||
|
*/
|
||||||
|
public void testBengaliNormalizer() throws Exception {
|
||||||
|
Reader reader = new StringReader("বাড়ী");
|
||||||
|
TokenStream stream = whitespaceMockTokenizer(reader);
|
||||||
|
stream = tokenFilterFactory("IndicNormalization").create(stream);
|
||||||
|
stream = tokenFilterFactory("BengaliNormalization").create(stream);
|
||||||
|
assertTokenStreamContents(stream, new String[] {"বারি"});
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test BengaliStemFilterFactory
|
||||||
|
*/
|
||||||
|
public void testStemmer() throws Exception {
|
||||||
|
Reader reader = new StringReader("বাড়ী");
|
||||||
|
TokenStream stream = whitespaceMockTokenizer(reader);
|
||||||
|
stream = tokenFilterFactory("IndicNormalization").create(stream);
|
||||||
|
stream = tokenFilterFactory("BengaliNormalization").create(stream);
|
||||||
|
stream = tokenFilterFactory("BengaliStem").create(stream);
|
||||||
|
assertTokenStreamContents(stream, new String[] {"বার"});
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Test that bogus arguments result in exception */
|
||||||
|
public void testBogusArguments() throws Exception {
|
||||||
|
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
|
||||||
|
tokenFilterFactory("IndicNormalization", "bogusArg", "bogusValue");
|
||||||
|
});
|
||||||
|
assertTrue(expected.getMessage().contains("Unknown parameters"));
|
||||||
|
|
||||||
|
expected = expectThrows(IllegalArgumentException.class, () -> {
|
||||||
|
tokenFilterFactory("BengaliNormalization", "bogusArg", "bogusValue");
|
||||||
|
});
|
||||||
|
assertTrue(expected.getMessage().contains("Unknown parameters"));
|
||||||
|
|
||||||
|
expected = expectThrows(IllegalArgumentException.class, () -> {
|
||||||
|
tokenFilterFactory("BengaliStem", "bogusArg", "bogusValue");
|
||||||
|
});
|
||||||
|
assertTrue(expected.getMessage().contains("Unknown parameters"));
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,110 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.bn;
|
||||||
|
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||||
|
import org.apache.lucene.util.TestUtil;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test BengaliNormalizer
|
||||||
|
*/
|
||||||
|
public class TestBengaliNormalizer extends BaseTokenStreamTestCase {
|
||||||
|
/**
|
||||||
|
* Test some basic normalization, with an example from the paper.
|
||||||
|
*/
|
||||||
|
public void testChndrobindu() throws IOException {
|
||||||
|
check("চাঁদ", "চাদ");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testRosshoIKar() throws IOException {
|
||||||
|
check("বাড়ী", "বারি");
|
||||||
|
check("তীর", "তির");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testRosshoUKar() throws IOException {
|
||||||
|
check("ভূল", "ভুল");
|
||||||
|
check("অনূপ", "অনুপ");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testNga() throws IOException {
|
||||||
|
check("বাঙলা", "বাংলা");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testJaPhaala() throws IOException {
|
||||||
|
check("ব্যাক্তি", "বেক্তি");
|
||||||
|
check( "সন্ধ্যা", "সন্ধা");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testBaPhalaa() throws IOException {
|
||||||
|
check("স্বদেশ", "সদেস");
|
||||||
|
check("তত্ত্ব", "তত্ত");
|
||||||
|
check("বিশ্ব", "বিসস");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testVisarga() throws IOException {
|
||||||
|
check("দুঃখ", "দুখখ");
|
||||||
|
check("উঃ", "উহ");
|
||||||
|
check("পুনঃ", "পুন");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testBasics() throws IOException {
|
||||||
|
check("কণা", "কনা");
|
||||||
|
check("শরীর", "সরির");
|
||||||
|
check("বাড়ি", "বারি");
|
||||||
|
}
|
||||||
|
|
||||||
|
/** creates random strings in the bengali block and ensures the normalizer doesn't trip up on them */
|
||||||
|
public void testRandom() throws IOException {
|
||||||
|
BengaliNormalizer normalizer = new BengaliNormalizer();
|
||||||
|
for (int i = 0; i < 100000; i++) {
|
||||||
|
String randomBengali = TestUtil.randomSimpleStringRange(random(), '\u0980', '\u09FF', 7);
|
||||||
|
try {
|
||||||
|
int newLen = normalizer.normalize(randomBengali.toCharArray(), randomBengali.length());
|
||||||
|
assertTrue(newLen >= 0); // should not return negative length
|
||||||
|
assertTrue(newLen <= randomBengali.length()); // should not increase length of string
|
||||||
|
} catch (Exception e) {
|
||||||
|
System.err.println("normalizer failed on input: '" + randomBengali + "' (" + escape(randomBengali) + ")");
|
||||||
|
throw e;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void check(String input, String output) throws IOException {
|
||||||
|
Tokenizer tokenizer = whitespaceMockTokenizer(input);
|
||||||
|
TokenFilter tf = new BengaliNormalizationFilter(tokenizer);
|
||||||
|
assertTokenStreamContents(tf, new String[] { output });
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testEmptyTerm() throws IOException {
|
||||||
|
Analyzer a = new Analyzer() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName) {
|
||||||
|
Tokenizer tokenizer = new KeywordTokenizer();
|
||||||
|
return new TokenStreamComponents(tokenizer, new BengaliNormalizationFilter(tokenizer));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
checkOneTerm(a, "", "");
|
||||||
|
a.close();
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,79 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.bn;
|
||||||
|
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
 * Tests for BengaliStemmer
|
||||||
|
*/
|
||||||
|
public class TestBengaliStemmer extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Testing few verbal words
|
||||||
|
*/
|
||||||
|
public void testVerbsInShadhuForm() throws IOException {
|
||||||
|
check("করেছিলাম", "কর");
|
||||||
|
check("করিতেছিলে", "কর");
|
||||||
|
check("খাইতাম", "খাই");
|
||||||
|
check("যাইবে", "যা");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testVerbsInCholitoForm() throws IOException {
|
||||||
|
check("করছিলাম", "কর");
|
||||||
|
check("করছিলে", "কর");
|
||||||
|
check("করতাম", "কর");
|
||||||
|
check("যাব", "যা");
|
||||||
|
check("যাবে", "যা");
|
||||||
|
check("করি", "কর");
|
||||||
|
check("করো", "কর");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testNouns() throws IOException {
|
||||||
|
check("মেয়েরা", "মে");
|
||||||
|
check("মেয়েদেরকে", "মে");
|
||||||
|
check("মেয়েদের", "মে");
|
||||||
|
|
||||||
|
check("একটি", "এক");
|
||||||
|
check("মানুষগুলি", "মানুষ");
|
||||||
|
}
|
||||||
|
|
||||||
|
private void check(String input, String output) throws IOException {
|
||||||
|
Tokenizer tokenizer = whitespaceMockTokenizer(input);
|
||||||
|
TokenFilter tf = new BengaliStemFilter(tokenizer);
|
||||||
|
assertTokenStreamContents(tf, new String[] { output });
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testEmptyTerm() throws IOException {
|
||||||
|
Analyzer a = new Analyzer() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName) {
|
||||||
|
Tokenizer tokenizer = new KeywordTokenizer();
|
||||||
|
return new TokenStreamComponents(tokenizer, new BengaliStemFilter(tokenizer));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
checkOneTerm(a, "", "");
|
||||||
|
a.close();
|
||||||
|
}
|
||||||
|
}
|
Loading…
x
Reference in New Issue
Block a user