mirror of https://github.com/apache/lucene.git
Merge pull request #1 from rmuir/bengali_fixes
LUCENE-7940: minor bengali fixes
This commit is contained in:
commit
7d468c45a9
|
@ -96,10 +96,10 @@ public final class BengaliAnalyzer extends StopwordAnalyzerBase {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates
|
* Creates
|
||||||
* {@link TokenStreamComponents}
|
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
|
||||||
* used to tokenize all the text in the provided {@link Reader}.
|
* used to tokenize all the text in the provided {@link Reader}.
|
||||||
*
|
*
|
||||||
* @return {@link TokenStreamComponents}
|
* @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
|
||||||
* built from a {@link StandardTokenizer} filtered with
|
* built from a {@link StandardTokenizer} filtered with
|
||||||
* {@link LowerCaseFilter}, {@link DecimalDigitFilter}, {@link IndicNormalizationFilter},
|
* {@link LowerCaseFilter}, {@link DecimalDigitFilter}, {@link IndicNormalizationFilter},
|
||||||
* {@link BengaliNormalizationFilter}, {@link SetKeywordMarkerFilter}
|
* {@link BengaliNormalizationFilter}, {@link SetKeywordMarkerFilter}
|
||||||
|
|
|
@ -18,8 +18,6 @@ package org.apache.lucene.analysis.bn;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
import org.apache.lucene.analysis.CharArraySet;
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Tests the BengaliAnalyzer
|
* Tests the BengaliAnalyzer
|
||||||
|
|
|
@ -22,6 +22,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
import org.apache.lucene.analysis.TokenFilter;
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||||
|
import org.apache.lucene.util.TestUtil;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
|
@ -73,6 +74,22 @@ public class TestBengaliNormalizer extends BaseTokenStreamTestCase {
|
||||||
check("বাড়ি", "বারি");
|
check("বাড়ি", "বারি");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** creates random strings in the bengali block and ensures the normalizer doesn't trip up on them */
|
||||||
|
public void testRandom() throws IOException {
|
||||||
|
BengaliNormalizer normalizer = new BengaliNormalizer();
|
||||||
|
for (int i = 0; i < 100000; i++) {
|
||||||
|
String randomBengali = TestUtil.randomSimpleStringRange(random(), '\u0980', '\u09FF', 7);
|
||||||
|
try {
|
||||||
|
int newLen = normalizer.normalize(randomBengali.toCharArray(), randomBengali.length());
|
||||||
|
assertTrue(newLen >= 0); // should not return negative length
|
||||||
|
assertTrue(newLen <= randomBengali.length()); // should not increase length of string
|
||||||
|
} catch (Exception e) {
|
||||||
|
System.err.println("normalizer failed on input: '" + randomBengali + "' (" + escape(randomBengali) + ")");
|
||||||
|
throw e;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private void check(String input, String output) throws IOException {
|
private void check(String input, String output) throws IOException {
|
||||||
Tokenizer tokenizer = whitespaceMockTokenizer(input);
|
Tokenizer tokenizer = whitespaceMockTokenizer(input);
|
||||||
TokenFilter tf = new BengaliNormalizationFilter(tokenizer);
|
TokenFilter tf = new BengaliNormalizationFilter(tokenizer);
|
||||||
|
|
Loading…
Reference in New Issue