LUCENE-7940: add more efficient (failing) test for BengaliNormalizer

Robert Muir 2017-09-01 01:22:52 -04:00
parent 755f6cc6a8
commit 1fbb400e6f
1 changed file with 17 additions and 0 deletions


@@ -22,6 +22,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.util.TestUtil;
 import java.io.IOException;
@@ -73,6 +74,22 @@ public class TestBengaliNormalizer extends BaseTokenStreamTestCase {
     check("বাড়ি", "বারি");
   }
 
+  /** creates random strings in the bengali block and ensures the normalizer doesn't trip up on them */
+  public void testRandom() throws IOException {
+    BengaliNormalizer normalizer = new BengaliNormalizer();
+    for (int i = 0; i < 100000; i++) {
+      String randomBengali = TestUtil.randomSimpleStringRange(random(), '\u0980', '\u09FF', 7);
+      try {
+        int newLen = normalizer.normalize(randomBengali.toCharArray(), randomBengali.length());
+        assertTrue(newLen >= 0); // should not return negative length
+        assertTrue(newLen <= randomBengali.length()); // should not increase length of string
+      } catch (Exception e) {
+        System.err.println("normalizer failed on input: '" + randomBengali + "' (" + escape(randomBengali) + ")");
+        throw e;
+      }
+    }
+  }
+
   private void check(String input, String output) throws IOException {
     Tokenizer tokenizer = whitespaceMockTokenizer(input);
     TokenFilter tf = new BengaliNormalizationFilter(tokenizer);
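
Note: the error message in the added testRandom calls an escape helper that does not appear in this hunk, so it presumably exists elsewhere in the test class. A minimal, hypothetical sketch of such a helper follows, assuming its only job is to render non-ASCII characters as \uXXXX escapes so a failing random Bengali input can be reported and reproduced unambiguously; the actual implementation in TestBengaliNormalizer may differ.

    // Hypothetical sketch, not the committed code: escape non-ASCII characters
    // so a failing input survives copy/paste through logs and terminals.
    static String escape(String s) {
      StringBuilder sb = new StringBuilder();
      for (int i = 0; i < s.length(); i++) {
        char ch = s.charAt(i);
        if (ch >= 0x20 && ch < 0x7f) {
          sb.append(ch); // printable ASCII: emit as-is
        } else {
          // everything else (e.g. the Bengali block U+0980..U+09FF) as \uXXXX
          sb.append(String.format(java.util.Locale.ROOT, "\\u%04x", (int) ch));
        }
      }
      return sb.toString();
    }

Because the test draws its inputs from random(), the seeded RandomizedTesting source, a reported failure can typically be replayed deterministically by re-running the suite with the printed tests.seed system property.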