mirror of https://github.com/apache/lucene.git

LUCENE-3717: add better offsets testing to BaseTokenStreamTestCase, fix offsets bugs in ThaiWordFilter and ICUTokenizer

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1234652 13f79535-47bb-0310-9956-ffa450edef68

commit c754c1c9c8
parent f7a474d603

@@ -814,10 +814,10 @@ Bug fixes
 * LUCENE-3641: Fixed MultiReader to correctly propagate readerFinishedListeners
   to clones/reopened readers. (Uwe Schindler)

-* LUCENE-3642, SOLR-2891: Fixed bugs in CharTokenizer, n-gram filters,
-  compound token filters, and smart chinese where they would create invalid
-  offsets in some situations, leading to problems in highlighting.
-  (Max Beutel, Edwin Steiner via Robert Muir)
+* LUCENE-3642, SOLR-2891, LUCENE-3717: Fixed bugs in CharTokenizer, n-gram filters,
+  compound token filters, thai word filter, icutokenizer, and smart chinese
+  where they would create invalid offsets in some situations, leading to problems
+  in highlighting. (Max Beutel, Edwin Steiner via Robert Muir)

 * LUCENE-3639: TopDocs.merge was incorrectly setting TopDocs.maxScore to
   Float.MIN_VALUE when it should be Float.NaN, when there were 0

@@ -17,6 +17,7 @@ package org.apache.lucene.analysis;
  * limitations under the License.
  */

+import java.io.Reader;
 import java.io.StringReader;
 import java.io.IOException;
 import java.util.ArrayList;

@@ -289,8 +290,12 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
       }
     }
   };

   public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength) throws IOException {
+    checkRandomData(random, a, iterations, maxWordLength, random.nextBoolean());
+  }
+
+  public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter) throws IOException {
     for (int i = 0; i < iterations; i++) {
       String text;
       switch(_TestUtil.nextInt(random, 0, 4)) {

@@ -311,7 +316,9 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
         System.out.println("NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
       }

-      TokenStream ts = a.tokenStream("dummy", new StringReader(text));
+      int remainder = random.nextInt(10);
+      Reader reader = new StringReader(text);
+      TokenStream ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
       assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
       CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
       OffsetAttribute offsetAtt = ts.hasAttribute(OffsetAttribute.class) ? ts.getAttribute(OffsetAttribute.class) : null;

@@ -339,30 +346,38 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
       if (VERBOSE) {
         System.out.println("NOTE: BaseTokenStreamTestCase: re-run analysis");
       }
+      reader = new StringReader(text);
+      ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
       if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
         // offset + pos + type
-        assertAnalyzesToReuse(a, text,
+        assertTokenStreamContents(ts,
           tokens.toArray(new String[tokens.size()]),
           toIntArray(startOffsets),
           toIntArray(endOffsets),
           types.toArray(new String[types.size()]),
-          toIntArray(positions));
+          toIntArray(positions),
+          text.length());
       } else if (posIncAtt != null && offsetAtt != null) {
         // offset + pos
-        assertAnalyzesToReuse(a, text,
+        assertTokenStreamContents(ts,
           tokens.toArray(new String[tokens.size()]),
           toIntArray(startOffsets),
           toIntArray(endOffsets),
-          toIntArray(positions));
+          null,
+          toIntArray(positions),
+          text.length());
       } else if (offsetAtt != null) {
         // offset
-        assertAnalyzesToReuse(a, text,
+        assertTokenStreamContents(ts,
           tokens.toArray(new String[tokens.size()]),
           toIntArray(startOffsets),
-          toIntArray(endOffsets));
+          toIntArray(endOffsets),
+          null,
+          null,
+          text.length());
       } else {
         // terms only
-        assertAnalyzesToReuse(a, text,
+        assertTokenStreamContents(ts,
           tokens.toArray(new String[tokens.size()]));
       }
     }

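Note (not part of the commit): a rough sketch of how a test would drive the two overloads above. The test class name, MockAnalyzer, and the test framework's random field are assumptions for illustration, not shown in this diff.

package org.apache.lucene.analysis;

public class TestMyAnalyzerOffsets extends BaseTokenStreamTestCase {
  public void testRandomStrings() throws Exception {
    Analyzer a = new MockAnalyzer(random);
    // 4-arg overload: flips a coin once per call, so either every iteration wraps the
    // input in a MockCharFilter or none of them do.
    checkRandomData(random, a, 1000, 20);
    // 5-arg overload: force the char filter on, so offset math that skips
    // correctOffset() produces offsets outside the original text and fails the checks.
    checkRandomData(random, a, 1000, 20, true);
  }
}
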
@@ -0,0 +1,100 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.SortedMap;
+import java.util.TreeMap;
+
+// the purpose of this charfilter is to send offsets out of bounds
+// if the analyzer doesn't use correctOffset or does incorrect offset math.
+class MockCharFilter extends CharStream {
+  final Reader in;
+  final int remainder;
+
+  // for testing only
+  public MockCharFilter(Reader in, int remainder) {
+    this.in = in;
+    this.remainder = remainder;
+    assert remainder >= 0 && remainder < 10 : "invalid parameter";
+  }
+
+  @Override
+  public void close() throws IOException {
+    in.close();
+  }
+
+  int currentOffset = -1;
+  int delta = 0;
+  int bufferedCh = -1;
+
+  @Override
+  public int read() throws IOException {
+    // we have a buffered character, add an offset correction and return it
+    if (bufferedCh >= 0) {
+      int ch = bufferedCh;
+      bufferedCh = -1;
+      currentOffset++;
+
+      addOffCorrectMap(currentOffset+delta, delta-1);
+      delta--;
+      return ch;
+    }
+
+    // otherwise actually read one
+    int ch = in.read();
+    if (ch < 0)
+      return ch;
+
+    currentOffset++;
+    if ((ch % 10) != remainder || Character.isHighSurrogate((char)ch) || Character.isLowSurrogate((char)ch)) {
+      return ch;
+    }
+
+    // we will double this character, so buffer it.
+    bufferedCh = ch;
+    return ch;
+  }
+
+  @Override
+  public int read(char[] cbuf, int off, int len) throws IOException {
+    int numRead = 0;
+    for (int i = off; i < off + len; i++) {
+      int c = read();
+      if (c == -1) break;
+      cbuf[i] = (char) c;
+      numRead++;
+    }
+    return numRead == 0 ? -1 : numRead;
+  }
+
+  @Override
+  public int correctOffset(int currentOff) {
+    SortedMap<Integer,Integer> subMap = corrections.subMap(0, currentOff+1);
+    int ret = subMap.isEmpty() ? currentOff : currentOff + subMap.get(subMap.lastKey());
+    assert ret >= 0 : "currentOff=" + currentOff + ",diff=" + (ret-currentOff);
+    return ret;
+  }
+
+  protected void addOffCorrectMap(int off, int cumulativeDiff) {
+    corrections.put(off, cumulativeDiff);
+  }
+
+  TreeMap<Integer,Integer> corrections = new TreeMap<Integer,Integer>();
+}

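Note (not part of the commit): a minimal driver showing how the correction map above is meant to be read. The demo class is hypothetical and is placed in the same package because MockCharFilter is package-private.

package org.apache.lucene.analysis;

import java.io.StringReader;

public class MockCharFilterDemo {
  public static void main(String[] args) throws Exception {
    // remainder = 7: any char whose code point satisfies ch % 10 == 7 is doubled, e.g. 'a' (97).
    MockCharFilter filter = new MockCharFilter(new StringReader("ab"), 7);
    char[] buf = new char[16];
    int n = filter.read(buf, 0, buf.length);
    System.out.println(new String(buf, 0, n));   // "aab" - the 'a' was doubled
    // Offset 3 is the end of "aab" in the filtered text; correctOffset maps it back to
    // offset 2, the end of the original "ab". A tokenizer that skips correctOffset would
    // report 3, which is past the end of the original input.
    System.out.println(filter.correctOffset(3)); // 2
    filter.close();
  }
}
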
@@ -0,0 +1,58 @@
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+import java.io.Reader;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class TestMockCharFilter extends BaseTokenStreamTestCase {
+
+  public void test() throws IOException {
+    Analyzer analyzer = new Analyzer() {
+
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, tokenizer);
+      }
+
+      @Override
+      protected Reader initReader(Reader reader) {
+        return new MockCharFilter(CharReader.get(reader), 7);
+      }
+    };
+
+    assertAnalyzesTo(analyzer, "ab",
+      new String[] { "aab" },
+      new int[] { 0 },
+      new int[] { 2 }
+    );
+
+    assertAnalyzesTo(analyzer, "aba",
+      new String[] { "aabaa" },
+      new int[] { 0 },
+      new int[] { 3 }
+    );
+
+    assertAnalyzesTo(analyzer, "abcdefga",
+      new String[] { "aabcdefgaa" },
+      new int[] { 0 },
+      new int[] { 8 }
+    );
+  }
+}

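Note (not part of the commit): the expected tokens above follow directly from the remainder-7 rule: only 'a' (code point 97) in those inputs satisfies ch % 10 == 7 and gets doubled, while the corrected offsets still span the original input. A hypothetical extra case in the same style, where every character is doubled:

    assertAnalyzesTo(analyzer, "kuku",
      new String[] { "kkuukkuu" },   // 'k' (107) and 'u' (117) both end in 7 mod 10
      new int[] { 0 },
      new int[] { 4 }                // end offset is corrected back to the original length
    );
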
@@ -68,6 +68,7 @@ public final class ThaiWordFilter extends TokenFilter {
   private CharTermAttribute clonedTermAtt = null;
   private OffsetAttribute clonedOffsetAtt = null;
   private boolean hasMoreTokensInClone = false;
+  private boolean hasIllegalOffsets = false; // only if the length changed before this filter

   /** Creates a new ThaiWordFilter with the specified match version. */
   public ThaiWordFilter(Version matchVersion, TokenStream input) {

@@ -86,7 +87,11 @@ public final class ThaiWordFilter extends TokenFilter {
       if (end != BreakIterator.DONE) {
         clonedToken.copyTo(this);
         termAtt.copyBuffer(clonedTermAtt.buffer(), start, end - start);
-        offsetAtt.setOffset(clonedOffsetAtt.startOffset() + start, clonedOffsetAtt.startOffset() + end);
+        if (hasIllegalOffsets) {
+          offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.endOffset());
+        } else {
+          offsetAtt.setOffset(clonedOffsetAtt.startOffset() + start, clonedOffsetAtt.startOffset() + end);
+        }
         if (handlePosIncr) posAtt.setPositionIncrement(1);
         return true;
       }

@@ -102,6 +107,10 @@ public final class ThaiWordFilter extends TokenFilter {
     }

     hasMoreTokensInClone = true;
+
+    // if length by start + end offsets doesn't match the term text then assume
+    // this is a synonym and don't adjust the offsets.
+    hasIllegalOffsets = offsetAtt.endOffset() - offsetAtt.startOffset() != termAtt.length();

     // we lazy init the cloned token, as in ctor not all attributes may be added
     if (clonedToken == null) {

@@ -118,7 +127,11 @@ public final class ThaiWordFilter extends TokenFilter {
     int end = breaker.next();
     if (end != BreakIterator.DONE) {
       termAtt.setLength(end);
-      offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.startOffset() + end);
+      if (hasIllegalOffsets) {
+        offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.endOffset());
+      } else {
+        offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.startOffset() + end);
+      }
       // position increment keeps as it is for first token
       return true;
     }

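Note (not part of the commit): the hasIllegalOffsets guard above exists because an upstream filter may have changed the term text without changing the offsets (for example by injecting a synonym), in which case adding break-iterator positions to the start offset can point past the token's real span in the original input. A small standalone illustration with made-up values:

// Term text no longer matches the span of the original input it came from.
int startOffset = 10, endOffset = 15;        // token covers 5 chars of the original text
String term = "longersynonym";               // 13 chars, so the lengths disagree
boolean hasIllegalOffsets = (endOffset - startOffset) != term.length();   // true
// When true, ThaiWordFilter keeps the original [10, 15) span for every sub-word
// instead of computing startOffset + breakPosition, which could exceed 15.
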
@@ -503,7 +503,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {

       @Override
       protected Reader initReader(Reader reader) {
-        return new HTMLStripCharFilter(CharReader.get(new BufferedReader(reader)));
+        return new HTMLStripCharFilter(CharReader.get(reader));
       }
     };

@@ -160,7 +160,7 @@ public class TestSegmentingTokenizerBase extends BaseTokenStreamTestCase {
         hasSentence = false;
         clearAttributes();
         termAtt.copyBuffer(buffer, sentenceStart, sentenceEnd-sentenceStart);
-        offsetAtt.setOffset(offset+sentenceStart, offset+sentenceEnd);
+        offsetAtt.setOffset(correctOffset(offset+sentenceStart), correctOffset(offset+sentenceEnd));
         return true;
       } else {
         return false;

@@ -215,7 +215,7 @@ public class TestSegmentingTokenizerBase extends BaseTokenStreamTestCase {

       clearAttributes();
       termAtt.copyBuffer(buffer, wordStart, wordEnd-wordStart);
-      offsetAtt.setOffset(offset+wordStart, offset+wordEnd);
+      offsetAtt.setOffset(correctOffset(offset+wordStart), correctOffset(offset+wordEnd));
       posIncAtt.setPositionIncrement(posIncAtt.getPositionIncrement() + posBoost);
       posBoost = 0;
       return true;

@@ -111,7 +111,7 @@ public final class ICUTokenizer extends Tokenizer {
   @Override
   public void end() throws IOException {
     final int finalOffset = (length < 0) ? offset : offset + length;
-    offsetAtt.setOffset(finalOffset, finalOffset);
+    offsetAtt.setOffset(correctOffset(finalOffset), correctOffset(finalOffset));
   }

   /*
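Note (not part of the commit): the last three hunks apply the same rule - a Tokenizer that may sit on top of a CharFilter must route every offset it reports, including the final offset from end(), through correctOffset(). A generic sketch of that pattern, where charsConsumed is a hypothetical field standing in for however many characters the tokenizer read:

@Override
public void end() throws IOException {
  // map the offset in the filtered character stream back into the original input
  final int finalOffset = correctOffset(charsConsumed);
  offsetAtt.setOffset(finalOffset, finalOffset);
}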