From 755f6cc6a80f6060e240b715b9f22b480f70d8e1 Mon Sep 17 00:00:00 2001
From: Robert Muir
Date: Thu, 31 Aug 2017 23:49:03 -0400
Subject: [PATCH 1/2] LUCENE-7940: removed unused import and javadocs fix so that ant precommit succeeds

---
 .../java/org/apache/lucene/analysis/bn/BengaliAnalyzer.java | 4 ++--
 .../org/apache/lucene/analysis/bn/TestBengaliAnalyzer.java  | 2 --
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliAnalyzer.java
index 912c4dd125c..4f8ec06742d 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliAnalyzer.java
@@ -96,10 +96,10 @@ public final class BengaliAnalyzer extends StopwordAnalyzerBase {
 
   /**
    * Creates
-   * {@link TokenStreamComponents}
+   * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
    * used to tokenize all the text in the provided {@link Reader}.
    *
-   * @return {@link TokenStreamComponents}
+   * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
    *         built from a {@link StandardTokenizer} filtered with
    *         {@link LowerCaseFilter}, {@link DecimalDigitFilter}, {@link IndicNormalizationFilter},
    *         {@link BengaliNormalizationFilter}, {@link SetKeywordMarkerFilter}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliAnalyzer.java
index 898480a73cc..e04f209746e 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliAnalyzer.java
@@ -18,8 +18,6 @@ package org.apache.lucene.analysis.bn;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.CharArraySet;
-
 
 /**
  * Tests the BengaliAnalyzer

From 1fbb400e6f02c1443cd84b186c9d9169c2d17e53 Mon Sep 17 00:00:00 2001
From: Robert Muir
Date: Fri, 1 Sep 2017 01:22:52 -0400
Subject: [PATCH 2/2] LUCENE-7940: add more efficient (failing) test for BengaliNormalizer

---
 .../analysis/bn/TestBengaliNormalizer.java | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliNormalizer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliNormalizer.java
index ecd11ae4ba2..b8073c9dda4 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliNormalizer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/bn/TestBengaliNormalizer.java
@@ -22,6 +22,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.util.TestUtil;
 
 import java.io.IOException;
 
@@ -73,6 +74,22 @@ public class TestBengaliNormalizer extends BaseTokenStreamTestCase {
     check("বাড়ি", "বারি");
   }
 
+  /** creates random strings in the bengali block and ensures the normalizer doesn't trip up on them */
+  public void testRandom() throws IOException {
+    BengaliNormalizer normalizer = new BengaliNormalizer();
+    for (int i = 0; i < 100000; i++) {
+      String randomBengali = TestUtil.randomSimpleStringRange(random(), '\u0980', '\u09FF', 7);
+      try {
+        int newLen = normalizer.normalize(randomBengali.toCharArray(), randomBengali.length());
+        assertTrue(newLen >= 0); // should not return negative length
+        assertTrue(newLen <= randomBengali.length()); // should not increase length of string
+      } catch (Exception e) {
+        System.err.println("normalizer failed on input: '" + randomBengali + "' (" + escape(randomBengali) + ")");
+        throw e;
+      }
+    }
+  }
+
   private void check(String input, String output) throws IOException {
     Tokenizer tokenizer = whitespaceMockTokenizer(input);
     TokenFilter tf = new BengaliNormalizationFilter(tokenizer);
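
For readers following the patch, the new testRandom is essentially a fuzz check on BengaliNormalizer.normalize(char[], int): it feeds short random strings drawn from the Bengali Unicode block (U+0980..U+09FF) and asserts that the returned length is never negative and never larger than the input. The sketch below is an illustrative standalone version of that check, not part of the patch: the class name BengaliNormalizerFuzz, the java.util.Random loop standing in for TestUtil.randomSimpleStringRange, and the fixed seed are assumptions; the normalize(char[], int) call and the length bounds mirror the test above.

package org.apache.lucene.analysis.bn;

import java.util.Random;

// Hypothetical standalone driver; it mirrors what the patched testRandom() exercises
// and assumes BengaliNormalizer (same package) exposes int normalize(char[], int) as used in the test.
public class BengaliNormalizerFuzz {
  public static void main(String[] args) {
    Random random = new Random(42); // fixed seed so any failure is reproducible
    BengaliNormalizer normalizer = new BengaliNormalizer();
    for (int i = 0; i < 100000; i++) {
      // build a string of up to 7 chars from the Bengali block (U+0980..U+09FF),
      // roughly what TestUtil.randomSimpleStringRange(random(), '\u0980', '\u09FF', 7) produces
      int len = random.nextInt(8);
      char[] chars = new char[len];
      for (int j = 0; j < len; j++) {
        chars[j] = (char) ('\u0980' + random.nextInt('\u09FF' - '\u0980' + 1));
      }
      String input = new String(chars);
      int newLen = normalizer.normalize(input.toCharArray(), input.length());
      if (newLen < 0 || newLen > input.length()) {
        throw new AssertionError("unexpected length " + newLen + " for input: " + input);
      }
    }
    System.out.println("normalized 100000 random Bengali strings without errors");
  }
}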