From ecd795c5855b2829d1a8bafc9af05ced4a1a0c57 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Thu, 14 Apr 2011 15:15:31 +0000 Subject: [PATCH] LUCENE-3026: SmartChineseAnalyzer's WordTokenFilter threw NullPointerException on sentences longer than 32,767 characters git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1092328 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/contrib/CHANGES.txt | 5 +++- .../analysis/cn/smart/hhmm/SegGraph.java | 2 +- .../cn/smart/TestSmartChineseAnalyzer.java | 29 +++++++++++++++++++ 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt index a4a474b512a..ee5eddd829f 100644 --- a/lucene/contrib/CHANGES.txt +++ b/lucene/contrib/CHANGES.txt @@ -45,7 +45,10 @@ API Changes ======================= Lucene 3.x (not yet released) ======================= -(No changes) +Bug fixes + + * LUCENE-3026: SmartChineseAnalyzer's WordTokenFilter threw NullPointerException + on sentences longer than 32,767 characters. (wangzhenghang via Robert Muir) ======================= Lucene 3.1.0 ======================= diff --git a/modules/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegGraph.java b/modules/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegGraph.java index 5434fe4aa73..f7b40d54c5b 100644 --- a/modules/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegGraph.java +++ b/modules/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegGraph.java @@ -75,7 +75,7 @@ class SegGraph { List result = new ArrayList(); int s = -1, count = 0, size = tokenListTable.size(); List tokenList; - short index = 0; + int index = 0; while (count < size) { if (isStartExist(s)) { tokenList = tokenListTable.get(s); diff --git a/modules/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java b/modules/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java index 01b2d94c894..6c61d45d27b 100644 --- a/modules/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java +++ b/modules/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java @@ -17,8 +17,11 @@ package org.apache.lucene.analysis.cn.smart; +import java.io.StringReader; + import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.util.Version; public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase { @@ -166,4 +169,30 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase { new int[] { 0, 1, 3, 4, 6, 7 }, new int[] { 1, 3, 4, 6, 7, 9 }); } + + // LUCENE-3026 + public void testLargeDocument() throws Exception { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < 5000; i++) { + sb.append("我购买了道具和服装。"); + } + Analyzer analyzer = new SmartChineseAnalyzer(TEST_VERSION_CURRENT); + TokenStream stream = analyzer.reusableTokenStream("", new StringReader(sb.toString())); + stream.reset(); + while (stream.incrementToken()) { + } + } + + // LUCENE-3026 + public void testLargeSentence() throws Exception { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < 5000; i++) { + sb.append("我购买了道具和服装"); + } + Analyzer analyzer = new SmartChineseAnalyzer(TEST_VERSION_CURRENT); + TokenStream stream = analyzer.reusableTokenStream("", new StringReader(sb.toString())); + stream.reset(); + while (stream.incrementToken()) { + } + } }