mirror of https://github.com/apache/lucene.git
Merge pull request #1 from rmuir/bengali_fixes
LUCENE-7940: minor bengali fixes
This commit is contained in:
commit
7d468c45a9
|
@ -96,10 +96,10 @@ public final class BengaliAnalyzer extends StopwordAnalyzerBase {
|
|||
|
||||
/**
|
||||
* Creates
|
||||
* {@link TokenStreamComponents}
|
||||
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
|
||||
* used to tokenize all the text in the provided {@link Reader}.
|
||||
*
|
||||
* @return {@link TokenStreamComponents}
|
||||
* @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
|
||||
* built from a {@link StandardTokenizer} filtered with
|
||||
* {@link LowerCaseFilter}, {@link DecimalDigitFilter}, {@link IndicNormalizationFilter},
|
||||
* {@link BengaliNormalizationFilter}, {@link SetKeywordMarkerFilter}
|
||||
|
|
|
@ -18,8 +18,6 @@ package org.apache.lucene.analysis.bn;
|
|||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
|
||||
|
||||
/**
|
||||
* Tests the BengaliAnalyzer
|
||||
|
|
|
@ -22,6 +22,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
|||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||
import org.apache.lucene.util.TestUtil;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
|
@ -73,6 +74,22 @@ public class TestBengaliNormalizer extends BaseTokenStreamTestCase {
|
|||
check("বাড়ি", "বারি");
|
||||
}
|
||||
|
||||
/**
 * Creates random strings in the Bengali block and ensures the normalizer doesn't trip up on them.
 *
 * <p>Each random string is passed to {@code BengaliNormalizer.normalize} as a fresh char array, and
 * the returned length is checked to be non-negative and no larger than the input length. If the
 * normalizer throws, the offending input is printed to stderr before the exception is rethrown.
 *
 * @throws IOException declared by the test signature
 */
|
||||
public void testRandom() throws IOException {
|
||||
BengaliNormalizer normalizer = new BengaliNormalizer();
|
||||
// Exercise the same normalizer instance with 100000 random inputs.
for (int i = 0; i < 100000; i++) {
|
||||
// Characters are drawn from the range U+0980..U+09FF.
// NOTE(review): the final argument (7) is presumably the maximum string length -- confirm against TestUtil.
String randomBengali = TestUtil.randomSimpleStringRange(random(), '\u0980', '\u09FF', 7);
|
||||
try {
|
||||
// normalize() receives a fresh copy of the characters plus the input length and returns the new length.
int newLen = normalizer.normalize(randomBengali.toCharArray(), randomBengali.length());
|
||||
assertTrue(newLen >= 0); // should not return negative length
|
||||
assertTrue(newLen <= randomBengali.length()); // should not increase length of string
|
||||
} catch (Exception e) {
|
||||
// Report the failing input both raw and via escape() (helper defined outside this view), then rethrow.
System.err.println("normalizer failed on input: '" + randomBengali + "' (" + escape(randomBengali) + ")");
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void check(String input, String output) throws IOException {
|
||||
Tokenizer tokenizer = whitespaceMockTokenizer(input);
|
||||
TokenFilter tf = new BengaliNormalizationFilter(tokenizer);
|
||||
|
|
Loading…
Reference in New Issue