LUCENE-3717: add better offsets testing to BaseTokenStreamTestCase, fix offsets bugs in ThaiWordFilter and ICUTokenizer

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1234652 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2012-01-23 00:08:52 +00:00
parent f7a474d603
commit c754c1c9c8
8 changed files with 205 additions and 19 deletions
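For orientation before the diff: the idea is that checkRandomData now randomly wraps the analyzer's input in a MockCharFilter, so any component that does its own offset arithmetic instead of calling correctOffset() produces offsets that no longer fit the original text and fails the test. A minimal sketch of a caller (hypothetical test class; assumes the LuceneTestCase random field and MockTokenizer from the test framework of this era):

package org.apache.lucene.analysis;

import java.io.Reader;

public class TestMyAnalyzerOffsets extends BaseTokenStreamTestCase {
  public void testRandomStrings() throws Exception {
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(tokenizer, tokenizer);
      }
    };
    // 1000 random strings of up to 20 chars; roughly half the iterations
    // analyze text seen through MockCharFilter, which shifts offsets
    checkRandomData(random, a, 1000, 20);
  }
}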

CHANGES.txt

@@ -814,10 +814,10 @@ Bug fixes
 * LUCENE-3641: Fixed MultiReader to correctly propagate readerFinishedListeners
   to clones/reopened readers.  (Uwe Schindler)
 
-* LUCENE-3642, SOLR-2891: Fixed bugs in CharTokenizer, n-gram filters,
-  compound token filters, and smart chinese where they would create invalid
-  offsets in some situations, leading to problems in highlighting.
-  (Max Beutel, Edwin Steiner via Robert Muir)
+* LUCENE-3642, SOLR-2891, LUCENE-3717: Fixed bugs in CharTokenizer, n-gram filters,
+  compound token filters, thai word filter, icutokenizer, and smart chinese
+  where they would create invalid offsets in some situations, leading to problems
+  in highlighting.  (Max Beutel, Edwin Steiner via Robert Muir)
 
 * LUCENE-3639: TopDocs.merge was incorrectly setting TopDocs.maxScore to
   Float.MIN_VALUE when it should be Float.NaN, when there were 0

BaseTokenStreamTestCase.java

@@ -17,6 +17,7 @@ package org.apache.lucene.analysis;
  * limitations under the License.
  */
 
+import java.io.Reader;
 import java.io.StringReader;
 import java.io.IOException;
 import java.util.ArrayList;
@@ -291,6 +292,10 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
   };
 
   public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength) throws IOException {
+    checkRandomData(random, a, iterations, maxWordLength, random.nextBoolean());
+  }
+
+  public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter) throws IOException {
     for (int i = 0; i < iterations; i++) {
       String text;
       switch(_TestUtil.nextInt(random, 0, 4)) {
@@ -311,7 +316,9 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
         System.out.println("NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
       }
 
-      TokenStream ts = a.tokenStream("dummy", new StringReader(text));
+      int remainder = random.nextInt(10);
+      Reader reader = new StringReader(text);
+      TokenStream ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
       assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
       CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
       OffsetAttribute offsetAtt = ts.hasAttribute(OffsetAttribute.class) ? ts.getAttribute(OffsetAttribute.class) : null;
@@ -339,30 +346,38 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
       if (VERBOSE) {
        System.out.println("NOTE: BaseTokenStreamTestCase: re-run analysis");
       }
+      reader = new StringReader(text);
+      ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
       if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
         // offset + pos + type
-        assertAnalyzesToReuse(a, text,
+        assertTokenStreamContents(ts,
           tokens.toArray(new String[tokens.size()]),
           toIntArray(startOffsets),
           toIntArray(endOffsets),
           types.toArray(new String[types.size()]),
-          toIntArray(positions));
+          toIntArray(positions),
+          text.length());
       } else if (posIncAtt != null && offsetAtt != null) {
         // offset + pos
-        assertAnalyzesToReuse(a, text,
+        assertTokenStreamContents(ts,
           tokens.toArray(new String[tokens.size()]),
           toIntArray(startOffsets),
           toIntArray(endOffsets),
-          toIntArray(positions));
+          null,
+          toIntArray(positions),
+          text.length());
       } else if (offsetAtt != null) {
         // offset
-        assertAnalyzesToReuse(a, text,
+        assertTokenStreamContents(ts,
           tokens.toArray(new String[tokens.size()]),
           toIntArray(startOffsets),
-          toIntArray(endOffsets));
+          toIntArray(endOffsets),
+          null,
+          null,
+          text.length());
       } else {
         // terms only
-        assertAnalyzesToReuse(a, text,
+        assertTokenStreamContents(ts,
           tokens.toArray(new String[tokens.size()]));
       }
     }
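Two properties get exercised by this re-run: the analyzer must produce identical tokens when its streams are reused, and, via the new final text.length() argument, assertTokenStreamContents also checks that end() leaves the offset attribute exactly at the input length — the invariant MockCharFilter breaks for components that skip correctOffset().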

MockCharFilter.java (new file)

@@ -0,0 +1,100 @@
package org.apache.lucene.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.util.SortedMap;
import java.util.TreeMap;
// the purpose of this charfilter is to send offsets out of bounds
// if the analyzer doesn't use correctOffset or does incorrect offset math.
class MockCharFilter extends CharStream {
final Reader in;
final int remainder;
// for testing only
public MockCharFilter(Reader in, int remainder) {
this.in = in;
this.remainder = remainder;
assert remainder >= 0 && remainder < 10 : "invalid parameter";
}
@Override
public void close() throws IOException {
in.close();
}
int currentOffset = -1;
int delta = 0;
int bufferedCh = -1;
@Override
public int read() throws IOException {
// we have a buffered character, add an offset correction and return it
if (bufferedCh >= 0) {
int ch = bufferedCh;
bufferedCh = -1;
currentOffset++;
addOffCorrectMap(currentOffset+delta, delta-1);
delta--;
return ch;
}
// otherwise actually read one
int ch = in.read();
if (ch < 0)
return ch;
currentOffset++;
if ((ch % 10) != remainder || Character.isHighSurrogate((char)ch) || Character.isLowSurrogate((char)ch)) {
return ch;
}
// we will double this character, so buffer it.
bufferedCh = ch;
return ch;
}
@Override
public int read(char[] cbuf, int off, int len) throws IOException {
int numRead = 0;
for (int i = off; i < off + len; i++) {
int c = read();
if (c == -1) break;
cbuf[i] = (char) c;
numRead++;
}
return numRead == 0 ? -1 : numRead;
}
@Override
public int correctOffset(int currentOff) {
SortedMap<Integer,Integer> subMap = corrections.subMap(0, currentOff+1);
int ret = subMap.isEmpty() ? currentOff : currentOff + subMap.get(subMap.lastKey());
assert ret >= 0 : "currentOff=" + currentOff + ",diff=" + (ret-currentOff);
return ret;
}
protected void addOffCorrectMap(int off, int cumulativeDiff) {
corrections.put(off, cumulativeDiff);
}
TreeMap<Integer,Integer> corrections = new TreeMap<Integer,Integer>();
}
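A worked example of the offset bookkeeping above (a sketch, not part of the commit; MockCharFilter is package-private, so the hypothetical demo class below assumes code in the same package). With remainder 7, 'a' (code point 97, and 97 % 10 == 7) gets doubled:

package org.apache.lucene.analysis;

class DemoMockCharFilterOffsets {
  public static void main(String[] args) throws java.io.IOException {
    MockCharFilter in = new MockCharFilter(new java.io.StringReader("ab"), 7);
    // read everything: the first 'a' is buffered and emitted twice, and
    // addOffCorrectMap(1, -1) records that output offsets >= 1 sit one
    // character past their position in the original input
    StringBuilder out = new StringBuilder();
    int c;
    while ((c = in.read()) != -1) out.append((char) c);
    System.out.println(out);                 // prints "aab"
    System.out.println(in.correctOffset(0)); // 0  (no correction entry below offset 1)
    System.out.println(in.correctOffset(3)); // 2  (3 - 1, the true end of "ab")
  }
}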

TestMockCharFilter.java (new file)

@@ -0,0 +1,58 @@
package org.apache.lucene.analysis;
import java.io.IOException;
import java.io.Reader;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
public class TestMockCharFilter extends BaseTokenStreamTestCase {
public void test() throws IOException {
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, tokenizer);
}
@Override
protected Reader initReader(Reader reader) {
return new MockCharFilter(CharReader.get(reader), 7);
}
};
assertAnalyzesTo(analyzer, "ab",
new String[] { "aab" },
new int[] { 0 },
new int[] { 2 }
);
assertAnalyzesTo(analyzer, "aba",
new String[] { "aabaa" },
new int[] { 0 },
new int[] { 3 }
);
assertAnalyzesTo(analyzer, "abcdefga",
new String[] { "aabcdefgaa" },
new int[] { 0 },
new int[] { 8 }
);
}
}
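Note how the assertions line up with the doubling rule: every 'a' (code point 97, 97 % 10 == 7) is doubled, so "abcdefga" is tokenized from the filtered text "aabcdefgaa", yet the corrected offsets 0 and 8 still span the original eight-character input.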

ThaiWordFilter.java

@@ -68,6 +68,7 @@ public final class ThaiWordFilter extends TokenFilter {
   private CharTermAttribute clonedTermAtt = null;
   private OffsetAttribute clonedOffsetAtt = null;
   private boolean hasMoreTokensInClone = false;
+  private boolean hasIllegalOffsets = false; // only if the length changed before this filter
 
   /** Creates a new ThaiWordFilter with the specified match version. */
   public ThaiWordFilter(Version matchVersion, TokenStream input) {
@@ -86,7 +87,11 @@ public final class ThaiWordFilter extends TokenFilter {
       if (end != BreakIterator.DONE) {
         clonedToken.copyTo(this);
         termAtt.copyBuffer(clonedTermAtt.buffer(), start, end - start);
-        offsetAtt.setOffset(clonedOffsetAtt.startOffset() + start, clonedOffsetAtt.startOffset() + end);
+        if (hasIllegalOffsets) {
+          offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.endOffset());
+        } else {
+          offsetAtt.setOffset(clonedOffsetAtt.startOffset() + start, clonedOffsetAtt.startOffset() + end);
+        }
         if (handlePosIncr) posAtt.setPositionIncrement(1);
         return true;
       }
@@ -103,6 +108,10 @@ public final class ThaiWordFilter extends TokenFilter {
       hasMoreTokensInClone = true;
 
+      // if length by start + end offsets doesn't match the term text then assume
+      // this is a synonym and don't adjust the offsets.
+      hasIllegalOffsets = offsetAtt.endOffset() - offsetAtt.startOffset() != termAtt.length();
+
       // we lazy init the cloned token, as in ctor not all attributes may be added
       if (clonedToken == null) {
         clonedToken = cloneAttributes();
@@ -118,7 +127,11 @@ public final class ThaiWordFilter extends TokenFilter {
     int end = breaker.next();
     if (end != BreakIterator.DONE) {
       termAtt.setLength(end);
-      offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.startOffset() + end);
+      if (hasIllegalOffsets) {
+        offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.endOffset());
+      } else {
+        offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.startOffset() + end);
+      }
       // position increment keeps as it is for first token
       return true;
     }
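The guard added here is a length heuristic: if endOffset() - startOffset() disagrees with termAtt.length(), some earlier filter (a synonym injector, for instance) replaced the term text without matching offsets, so adding break positions to the clone's startOffset() would yield offsets pointing outside the original text; in that case every sub-word now keeps the whole token's start/end unchanged.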

HTMLStripCharFilterTest.java

@@ -503,7 +503,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
       @Override
       protected Reader initReader(Reader reader) {
-        return new HTMLStripCharFilter(CharReader.get(new BufferedReader(reader)));
+        return new HTMLStripCharFilter(CharReader.get(reader));
       }
     };

TestSegmentingTokenizerBase.java

@@ -160,7 +160,7 @@ public class TestSegmentingTokenizerBase extends BaseTokenStreamTestCase {
       hasSentence = false;
       clearAttributes();
       termAtt.copyBuffer(buffer, sentenceStart, sentenceEnd-sentenceStart);
-      offsetAtt.setOffset(offset+sentenceStart, offset+sentenceEnd);
+      offsetAtt.setOffset(correctOffset(offset+sentenceStart), correctOffset(offset+sentenceEnd));
       return true;
     } else {
       return false;
@@ -215,7 +215,7 @@ public class TestSegmentingTokenizerBase extends BaseTokenStreamTestCase {
       clearAttributes();
       termAtt.copyBuffer(buffer, wordStart, wordEnd-wordStart);
-      offsetAtt.setOffset(offset+wordStart, offset+wordEnd);
+      offsetAtt.setOffset(correctOffset(offset+wordStart), correctOffset(offset+wordEnd));
       posIncAtt.setPositionIncrement(posIncAtt.getPositionIncrement() + posBoost);
       posBoost = 0;
       return true;

ICUTokenizer.java

@@ -111,7 +111,7 @@ public final class ICUTokenizer extends Tokenizer {
   @Override
   public void end() throws IOException {
     final int finalOffset = (length < 0) ? offset : offset + length;
-    offsetAtt.setOffset(finalOffset, finalOffset);
+    offsetAtt.setOffset(correctOffset(finalOffset), correctOffset(finalOffset));
   }
 
   /*
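The end() fix mirrors the others: finalOffset is measured in the post-CharFilter text, so when an upstream CharFilter changed the text's length (HTMLStripCharFilter removing markup, or MockCharFilter doubling characters), reporting it uncorrected could place the final offset beyond the original input — exactly what the new text.length() assertion in checkRandomData now catches.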