diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliAnalyzer.java
index 912c4dd125c..4f8ec06742d 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliAnalyzer.java
@@ -96,10 +96,10 @@ public final class BengaliAnalyzer extends StopwordAnalyzerBase {
 
   /**
    * Creates
-   * {@link TokenStreamComponents}
+   * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
    * used to tokenize all the text in the provided {@link Reader}.
    *
-   * @return {@link TokenStreamComponents}
+   * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
    *         built from a {@link StandardTokenizer} filtered with
    *         {@link LowerCaseFilter}, {@link DecimalDigitFilter}, {@link IndicNormalizationFilter},
    *         {@link BengaliNormalizationFilter}, {@link SetKeywordMarkerFilter}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliAnalyzer.java
index 898480a73cc..e04f209746e 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliAnalyzer.java
@@ -18,8 +18,6 @@ package org.apache.lucene.analysis.bn;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.CharArraySet;
-
 
 /**
  * Tests the BengaliAnalyzer
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliNormalizer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliNormalizer.java
index ecd11ae4ba2..b8073c9dda4 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliNormalizer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliNormalizer.java
@@ -22,6 +22,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.util.TestUtil;
 
 import java.io.IOException;
 
@@ -73,6 +74,22 @@ public class TestBengaliNormalizer extends BaseTokenStreamTestCase {
     check("বাড়ি", "বারি");
   }
 
+  /** creates random strings in the bengali block and ensures the normalizer doesn't trip up on them */
+  public void testRandom() throws IOException {
+    BengaliNormalizer normalizer = new BengaliNormalizer();
+    for (int i = 0; i < 100000; i++) {
+      String randomBengali = TestUtil.randomSimpleStringRange(random(), '\u0980', '\u09FF', 7);
+      try {
+        int newLen = normalizer.normalize(randomBengali.toCharArray(), randomBengali.length());
+        assertTrue(newLen >= 0); // should not return negative length
+        assertTrue(newLen <= randomBengali.length()); // should not increase length of string
+      } catch (Exception e) {
+        System.err.println("normalizer failed on input: '" + randomBengali + "' (" + escape(randomBengali) + ")");
+        throw e;
+      }
+    }
+  }
+
   private void check(String input, String output) throws IOException {
     Tokenizer tokenizer = whitespaceMockTokenizer(input);
     TokenFilter tf = new BengaliNormalizationFilter(tokenizer);
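Not part of the patch: the javadoc touched in the first hunk describes the analysis chain that BengaliAnalyzer builds (StandardTokenizer filtered by LowerCaseFilter, DecimalDigitFilter, IndicNormalizationFilter, BengaliNormalizationFilter, SetKeywordMarkerFilter, and the corrected links point at the nested Analyzer.TokenStreamComponents class that createComponents returns). A minimal sketch of exercising that chain through the public Analyzer API follows; the demo class, the "body" field name, and the sample text (taken from the test above) are placeholders, not anything defined in this change.

```java
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.bn.BengaliAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class BengaliAnalyzerDemo {
  public static void main(String[] args) throws IOException {
    // BengaliAnalyzer.createComponents wires up the chain described in the javadoc:
    // StandardTokenizer -> LowerCaseFilter -> DecimalDigitFilter
    //   -> IndicNormalizationFilter -> BengaliNormalizationFilter -> ...
    try (Analyzer analyzer = new BengaliAnalyzer();
         // "body" and the sample text are arbitrary placeholders
         TokenStream ts = analyzer.tokenStream("body", "বাড়ি")) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();                       // standard TokenStream workflow: reset, consume, end
      while (ts.incrementToken()) {
        System.out.println(term.toString());
      }
      ts.end();
    }
  }
}
```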