From dd7bfc78d96b5e31675e894b21324969fe3cbdcc Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Wed, 21 Mar 2012 02:54:07 +0000 Subject: [PATCH] LUCENE-3894: for tokenizers, add some tests for larger documents git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1303258 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene/analysis/BaseTokenStreamTestCase.java | 7 ++++++- .../analysis/icu/segmentation/TestICUTokenizer.java | 5 +++++ .../lucene/analysis/kuromoji/TestExtendedMode.java | 10 ++++++++++ .../lucene/analysis/kuromoji/TestKuromojiAnalyzer.java | 8 ++++++++ .../analysis/kuromoji/TestKuromojiTokenizer.java | 8 ++++++++ .../analysis/cn/smart/TestSmartChineseAnalyzer.java | 5 +++++ 6 files changed, 42 insertions(+), 1 deletion(-) diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java index fc43980e248..b1960d7bc05 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java @@ -295,7 +295,12 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase { /** utility method for blasting tokenstreams with data to make sure they don't do anything crazy */ public static void checkRandomData(Random random, Analyzer a, int iterations) throws IOException { - checkRandomData(random, a, iterations, false); + checkRandomData(random, a, iterations, 20, false); + } + + /** utility method for blasting tokenstreams with data to make sure they don't do anything crazy */ + public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength) throws IOException { + checkRandomData(random, a, iterations, maxWordLength, false); } /** diff --git a/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java b/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java index c768e0fbf52..74a0856d5b3 100644 --- a/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java +++ b/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java @@ -236,4 +236,9 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase { public void testRandomStrings() throws Exception { checkRandomData(random, a, 10000*RANDOM_MULTIPLIER); } + + /** blast some random large strings through the analyzer */ + public void testRandomHugeStrings() throws Exception { + checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192); + } } diff --git a/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestExtendedMode.java b/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestExtendedMode.java index e66556e7976..32cf2de8fc8 100644 --- a/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestExtendedMode.java +++ b/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestExtendedMode.java @@ -59,4 +59,14 @@ public class TestExtendedMode extends BaseTokenStreamTestCase { } } } + + /** blast some random strings through the analyzer */ + public void testRandomStrings() throws Exception { + checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER); + } + + /** blast some random large strings through the analyzer */ + public void testRandomHugeStrings() throws Exception { + checkRandomData(random, analyzer, 200*RANDOM_MULTIPLIER, 8192); + } } diff --git a/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java b/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java index a42d0df637d..f98b4e163d9 100644 --- a/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java +++ b/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java @@ -127,6 +127,14 @@ public class TestKuromojiAnalyzer extends BaseTokenStreamTestCase { KuromojiAnalyzer.getDefaultStopTags()); checkRandomData(random, a, atLeast(10000)); } + + /** blast some random large strings through the analyzer */ + public void testRandomHugeStrings() throws Exception { + final Analyzer a = new KuromojiAnalyzer(TEST_VERSION_CURRENT, null, Mode.SEARCH, + KuromojiAnalyzer.getDefaultStopSet(), + KuromojiAnalyzer.getDefaultStopTags()); + checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192); + } // Copied from TestKuromojiTokenizer, to make sure passing // user dict to analyzer works: diff --git a/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java b/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java index bc884efaa14..36ecc795e71 100644 --- a/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java +++ b/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java @@ -41,6 +41,7 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util._TestUtil; +import org.junit.Ignore; public class TestKuromojiTokenizer extends BaseTokenStreamTestCase { @@ -190,6 +191,13 @@ public class TestKuromojiTokenizer extends BaseTokenStreamTestCase { checkRandomData(random, analyzerNoPunct, 10000*RANDOM_MULTIPLIER); } + /** blast some random large strings through the analyzer */ + @Ignore("FIXME: see LUCENE-3897") + public void testRandomHugeStrings() throws Exception { + checkRandomData(random, analyzer, 200*RANDOM_MULTIPLIER, 8192); + checkRandomData(random, analyzerNoPunct, 200*RANDOM_MULTIPLIER, 8192); + } + public void testLargeDocReliability() throws Exception { for (int i = 0; i < 100; i++) { String s = _TestUtil.randomUnicodeString(random, 10000); diff --git a/modules/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java b/modules/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java index 78fe87f8691..77489f440c6 100644 --- a/modules/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java +++ b/modules/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java @@ -223,4 +223,9 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase { public void testRandomStrings() throws Exception { checkRandomData(random, new SmartChineseAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER); } + + /** blast some random large strings through the analyzer */ + public void testRandomHugeStrings() throws Exception { + checkRandomData(random, new SmartChineseAnalyzer(TEST_VERSION_CURRENT), 200*RANDOM_MULTIPLIER, 8192); + } }