Merge pull request #1 from rmuir/bengali_fixes

LUCENE-7940: minor bengali fixes
This commit is contained in:
Md.Abdulla-Al-Sun 2017-09-02 19:58:09 +06:00 committed by GitHub
commit 7d468c45a9
3 changed files with 19 additions and 4 deletions

View File

@@ -96,10 +96,10 @@ public final class BengaliAnalyzer extends StopwordAnalyzerBase {
/**
* Creates
* {@link TokenStreamComponents}
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* used to tokenize all the text in the provided {@link Reader}.
*
* @return {@link TokenStreamComponents}
* @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from a {@link StandardTokenizer} filtered with
* {@link LowerCaseFilter}, {@link DecimalDigitFilter}, {@link IndicNormalizationFilter},
* {@link BengaliNormalizationFilter}, {@link SetKeywordMarkerFilter}

View File

@@ -18,8 +18,6 @@ package org.apache.lucene.analysis.bn;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharArraySet;
/**
* Tests the BengaliAnalyzer

View File

@@ -22,6 +22,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.util.TestUtil;
import java.io.IOException;
@@ -73,6 +74,22 @@ public class TestBengaliNormalizer extends BaseTokenStreamTestCase {
check("বাড়ি", "বারি");
}
/**
 * Fuzz test: feeds random strings drawn from the Bengali Unicode block
 * (U+0980–U+09FF) to the normalizer and verifies it never misbehaves.
 */
public void testRandom() throws IOException {
  final int iterations = 100000;
  BengaliNormalizer normalizer = new BengaliNormalizer();
  for (int iter = 0; iter < iterations; iter++) {
    String input = TestUtil.randomSimpleStringRange(random(), '\u0980', '\u09FF', 7);
    char[] buffer = input.toCharArray();
    try {
      int newLen = normalizer.normalize(buffer, input.length());
      // normalization may shorten the text but must never report a
      // negative length or claim the result grew past the input.
      assertTrue(newLen >= 0);
      assertTrue(newLen <= input.length());
    } catch (Exception e) {
      // Surface the failing input (raw and escaped) before rethrowing,
      // so a random failure is reproducible from the test log.
      System.err.println("normalizer failed on input: '" + input + "' (" + escape(input) + ")");
      throw e;
    }
  }
}
private void check(String input, String output) throws IOException {
Tokenizer tokenizer = whitespaceMockTokenizer(input);
TokenFilter tf = new BengaliNormalizationFilter(tokenizer);