LUCENE-2014: SmartChineseAnalyzer position increment bug

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@830871 13f79535-47bb-0310-9956-ffa450edef68
2025-03-02 22:39:29 +00:00 · 2009-10-29 09:22:37 +00:00 · 2009-10-29 09:22:37 +00:00 · 1b38f9c24d
commit 1b38f9c24d
parent 5c40eb4715
3 changed files with 21 additions and 1 deletions
--- a/contrib/CHANGES.txt
+++ b/contrib/CHANGES.txt
@ -55,6 +55,11 @@ Bug fixes
   
 * LUCENE-2003: Highlighter doesn't respect position increments other than 1 with 
   PhraseQuerys. (Uwe Schindler, Mark Miller)
+   
+ * LUCENE-2014: SmartChineseAnalyzer did not properly clear attributes
+   in WordTokenFilter. If enablePositionIncrements is set for StopFilter,
+   then this could create invalid position increments, causing IndexWriter
+   to crash.  (Robert Muir, Uwe Schindler)

 New features

--- a/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordTokenFilter.java
+++ b/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordTokenFilter.java
@ -78,7 +78,8 @@ public final class WordTokenFilter extends TokenFilter {
        return false; // no more sentences, end of stream!
      }
    } 
-    
+    // WordTokenFilter must clear attributes, as it is creating new tokens.
+    clearAttributes();
    // There are remaining tokens from the current sentence, return the next one. 
    SegToken nextWord = (SegToken) tokenIter.next();
    termAtt.setTermBuffer(nextWord.charArray, 0, nextWord.charArray.length);
--- a/contrib/analyzers/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java
+++ b/contrib/analyzers/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java
@ -80,6 +80,20 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
    }
  }
  
+  /*
+   * Check that position increments after stopwords are correct,
+   * when stopfilter is configured with enablePositionIncrements
+   */
+  public void testChineseStopWords2() throws Exception {
+    Analyzer ca = new SmartChineseAnalyzer(Version.LUCENE_CURRENT); /* will load stopwords */
+    String sentence = "Title:San"; // : is a stopword
+    String result[] = { "titl", "san"};
+    int startOffsets[] = { 0, 6 };
+    int endOffsets[] = { 5, 9 };
+    int posIncr[] = { 1, 2 };
+    assertAnalyzesTo(ca, sentence, result, startOffsets, endOffsets, posIncr);
+  }
+  
  public void testChineseAnalyzer() throws Exception {
    Analyzer ca = new SmartChineseAnalyzer(Version.LUCENE_CURRENT, true);
    String sentence = "我购买了道具和服装。";