mirror of https://github.com/apache/lucene.git
LUCENE-3894: some tokenizers weren't reading all input chars
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1303193 13f79535-47bb-0310-9956-ffa450edef68
parent d5683bea96
commit c20242721f
@@ -271,6 +271,10 @@ Bug Fixes
 * LUCENE-3831: avoid NPE if the SpanQuery has a null field (eg a
   SpanOrQuery with no clauses added).  (Alan Woodward via Mike
   McCandless).
 
+* LUCENE-3894: ICUTokenizer, NGramTokenizer and EdgeNGramTokenizer
+  could stop early if the Reader only partially fills the provided
+  buffer
+
 Documentation
 
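The underlying contract here: java.io.Reader.read(char[], int, int) may return fewer characters than requested even when more input remains, and it signals end-of-stream with -1, so a single read() cannot be assumed to fill the buffer. Below is a minimal sketch of the read-fully loop this commit applies in the affected tokenizers; the class and method names are illustrative, not part of the commit:

    import java.io.IOException;
    import java.io.Reader;

    final class ReadFullySketch {
      /** Reads until buf is full or EOF is hit; returns the number of chars actually read. */
      static int readFully(Reader in, char[] buf) throws IOException {
        int count = 0;
        while (count < buf.length) {
          int inc = in.read(buf, count, buf.length - count);
          if (inc == -1) {
            break; // EOF before the buffer was filled
          }
          count += inc;
        }
        return count;
      }
    }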
@@ -177,8 +177,9 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
     }
     assertFalse("TokenStream has more tokens than expected", ts.incrementToken());
     ts.end();
-    if (finalOffset != null)
+    if (finalOffset != null) {
       assertEquals("finalOffset ", finalOffset.intValue(), offsetAtt.endOffset());
+    }
     if (offsetAtt != null) {
       assertTrue("finalOffset must be >= 0", offsetAtt.endOffset() >= 0);
     }
@@ -391,6 +392,8 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
     List<Integer> startOffsets = new ArrayList<Integer>();
     List<Integer> endOffsets = new ArrayList<Integer>();
     ts.reset();
+
+    // First pass: save away "correct" tokens
     while (ts.incrementToken()) {
       tokens.add(termAtt.toString());
       if (typeAtt != null) types.add(typeAtt.type());
@@ -403,12 +406,98 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
     }
     ts.end();
     ts.close();
 
     // verify reusing is "reproducable" and also get the normal tokenstream sanity checks
     if (!tokens.isEmpty()) {
+
+      // KWTokenizer (for example) can produce a token
+      // even when input is length 0:
+      if (text.length() != 0) {
+
+        // (Optional) second pass: do something evil:
+        final int evilness = random.nextInt(50);
+        if (evilness == 17) {
+          if (VERBOSE) {
+            System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis w/ exception");
+          }
+          // Throw an errant exception from the Reader:
+
+          MockReaderWrapper evilReader = new MockReaderWrapper(random, new StringReader(text));
+          evilReader.throwExcAfterChar(random.nextInt(text.length()+1));
+          reader = evilReader;
+
+          try {
+            // NOTE: some Tokenizers go and read characters
+            // when you call .setReader(Reader), eg
+            // PatternTokenizer.  This is a bit
+            // iffy... (really, they should only
+            // pull from the Reader when you call
+            // .incremenToken(), I think?), but we
+            // currently allow it, so, we must call
+            // a.tokenStream inside the try since we may
+            // hit the exc on init:
+            ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(evilReader, remainder) : evilReader);
+            ts.reset();
+            while (ts.incrementToken());
+            fail("did not hit exception");
+          } catch (RuntimeException re) {
+            assertTrue(MockReaderWrapper.isMyEvilException(re));
+          }
+          try {
+            ts.end();
+          } catch (AssertionError ae) {
+            // Catch & ignore MockTokenizer's
+            // anger...
+            if ("end() called before incrementToken() returned false!".equals(ae.getMessage())) {
+              // OK
+            } else {
+              throw ae;
+            }
+          }
+          ts.close();
+        } else if (evilness == 7) {
+          // Only consume a subset of the tokens:
+          final int numTokensToRead = random.nextInt(tokens.size());
+          if (VERBOSE) {
+            System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis, only consuming " + numTokensToRead + " of " + tokens.size() + " tokens");
+          }
+
+          reader = new StringReader(text);
+          ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
+          ts.reset();
+          for(int tokenCount=0;tokenCount<numTokensToRead;tokenCount++) {
+            assertTrue(ts.incrementToken());
+          }
+          try {
+            ts.end();
+          } catch (AssertionError ae) {
+            // Catch & ignore MockTokenizer's
+            // anger...
+            if ("end() called before incrementToken() returned false!".equals(ae.getMessage())) {
+              // OK
+            } else {
+              throw ae;
+            }
+          }
+          ts.close();
+        }
+      }
+
+      // Final pass: verify clean tokenization matches
+      // results from first pass:
       if (VERBOSE) {
         System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis; " + tokens.size() + " tokens");
       }
       reader = new StringReader(text);
+
+      if (random.nextInt(30) == 7) {
+        if (VERBOSE) {
+          System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: using spoon-feed reader");
+        }
+
+        reader = new MockReaderWrapper(random, reader);
+      }
+
       ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
       if (typeAtt != null && posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
         // offset + pos + posLength + type
@@ -0,0 +1,98 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Random;
+
+import org.apache.lucene.util._TestUtil;
+
+/** Wraps a Reader, and can throw random or fixed
+ *  exceptions, and spoon feed read chars. */
+
+public class MockReaderWrapper extends Reader {
+
+  private final Reader in;
+  private final Random random;
+
+  private int excAtChar = -1;
+  private int readSoFar;
+  private boolean throwExcNext;
+
+  public MockReaderWrapper(Random random, Reader in) {
+    this.in = in;
+    this.random = random;
+  }
+
+  /** Throw an exception after reading this many chars. */
+  public void throwExcAfterChar(int charUpto) {
+    excAtChar = charUpto;
+    // You should only call this on init!:
+    assert readSoFar == 0;
+  }
+
+  public void throwExcNext() {
+    throwExcNext = true;
+  }
+
+  @Override
+  public void close() throws IOException {
+    in.close();
+  }
+
+  @Override
+  public int read(char[] cbuf, int off, int len) throws IOException {
+    if (throwExcNext || (excAtChar != -1 && readSoFar >= excAtChar)) {
+      throw new RuntimeException("fake exception now!");
+    }
+    final int read;
+    final int realLen;
+    if (len == 1) {
+      realLen = 1;
+    } else {
+      // Spoon-feed: intentionally maybe return less than
+      // the consumer asked for
+      realLen = _TestUtil.nextInt(random, 1, len);
+    }
+    if (excAtChar != -1) {
+      final int left = excAtChar - readSoFar;
+      assert left != 0;
+      read = in.read(cbuf, off, Math.min(realLen, left));
+      assert read != -1;
+      readSoFar += read;
+    } else {
+      read = in.read(cbuf, off, realLen);
+    }
+    return read;
+  }
+
+  @Override
+  public boolean markSupported() {
+    return false;
+  }
+
+  @Override
+  public boolean ready() {
+    return false;
+  }
+
+  public static boolean isMyEvilException(Throwable t) {
+    return (t instanceof RuntimeException) && "fake exception now!".equals(t.getMessage());
+  }
+}
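A minimal sketch of how a test might drive this wrapper, mirroring the two ways the BaseTokenStreamTestCase changes above use it (spoon-fed reads and an injected failure); the class and method below are illustrative scaffolding, not part of the commit:

    import java.io.IOException;
    import java.io.StringReader;
    import java.util.Random;

    class MockReaderWrapperUsageSketch {
      static void demo() throws IOException {
        Random random = new Random();
        char[] buf = new char[16];

        // Spoon-feeding: each read() returns between 1 and len chars, so a
        // consumer that assumes one read() fills its buffer is caught out.
        MockReaderWrapper spoonFed = new MockReaderWrapper(random, new StringReader("some text"));
        int n = spoonFed.read(buf, 0, buf.length); // may be as small as 1

        // Failure injection: once 3 chars have been handed out, the next
        // read() throws the wrapper's "fake exception now!" RuntimeException.
        MockReaderWrapper evil = new MockReaderWrapper(random, new StringReader("some text"));
        evil.throwExcAfterChar(3);
        try {
          while (evil.read(buf, 0, buf.length) != -1) { }
        } catch (RuntimeException e) {
          assert MockReaderWrapper.isMyEvilException(e);
        }
      }
    }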
@@ -199,8 +199,11 @@ public class MockTokenizer extends Tokenizer {
     offsetAtt.setOffset(finalOffset, finalOffset);
     // some tokenizers, such as limiting tokenizers, call end() before incrementToken() returns false.
     // these tests should disable this check (in general you should consume the entire stream)
-    assert !enableChecks || streamState == State.INCREMENT_FALSE : "end() called before incrementToken() returned false!";
-    streamState = State.END;
+    try {
+      assert !enableChecks || streamState == State.INCREMENT_FALSE : "end() called before incrementToken() returned false!";
+    } finally {
+      streamState = State.END;
+    }
   }
 
   /**
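The switch to try/finally matters for the new test paths above: they deliberately call end() before incrementToken() has returned false and then swallow MockTokenizer's AssertionError, so the stream state still has to advance to END even when the check fires, presumably so that the subsequent close() and reuse checks see a consistent state. A generic sketch of the pattern (the condition name is illustrative):

    // Validate the transition, but always advance the state machine,
    // even if the assertion throws.
    try {
      assert legalTransition : "end() called before incrementToken() returned false!";
    } finally {
      streamState = State.END;
    }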
|
@ -183,15 +183,22 @@ public final class EdgeNGramTokenizer extends Tokenizer {
|
||||||
// if we are just starting, read the whole input
|
// if we are just starting, read the whole input
|
||||||
if (!started) {
|
if (!started) {
|
||||||
started = true;
|
started = true;
|
||||||
|
gramSize = minGram;
|
||||||
char[] chars = new char[1024];
|
char[] chars = new char[1024];
|
||||||
charsRead = input.read(chars);
|
charsRead = 0;
|
||||||
if (charsRead < 0) {
|
// TODO: refactor to a shared readFully somewhere:
|
||||||
charsRead = inLen = 0;
|
while (charsRead < chars.length) {
|
||||||
|
int inc = input.read(chars, charsRead, chars.length-charsRead);
|
||||||
|
if (inc == -1) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
charsRead += inc;
|
||||||
|
}
|
||||||
|
inStr = new String(chars, 0, charsRead).trim(); // remove any trailing empty strings
|
||||||
|
inLen = inStr.length();
|
||||||
|
if (inLen == 0) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
inStr = new String(chars, 0, charsRead).trim(); // remove any leading or trailing spaces
|
|
||||||
inLen = inStr.length();
|
|
||||||
gramSize = minGram;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// if the remaining input is too short, we can't generate any n-grams
|
// if the remaining input is too short, we can't generate any n-grams
|
||||||
|
@@ -223,7 +230,6 @@ public final class EdgeNGramTokenizer extends Tokenizer {
   @Override
   public void reset(Reader input) throws IOException {
     super.reset(input);
-    reset();
   }
 
   @Override
@@ -105,13 +105,20 @@ public final class NGramTokenizer extends Tokenizer {
       started = true;
       gramSize = minGram;
       char[] chars = new char[1024];
-      charsRead = input.read(chars);
-      if (charsRead < 0) {
-        charsRead = inLen = 0;
+      charsRead = 0;
+      // TODO: refactor to a shared readFully somewhere:
+      while (charsRead < chars.length) {
+        int inc = input.read(chars, charsRead, chars.length-charsRead);
+        if (inc == -1) {
+          break;
+        }
+        charsRead += inc;
+      }
+      inStr = new String(chars, 0, charsRead).trim();  // remove any trailing empty strings
+      inLen = inStr.length();
+      if (inLen == 0) {
         return false;
       }
-      inStr = new String(chars).trim();  // remove any trailing empty strings
-      inLen = inStr.length();
     }
 
     if (pos+gramSize > inLen) {  // if we hit the end of the string
@@ -140,7 +147,6 @@ public final class NGramTokenizer extends Tokenizer {
   @Override
   public void reset(Reader input) throws IOException {
     super.reset(input);
-    reset();
   }
 
   @Override
@@ -151,8 +151,8 @@ public final class ICUTokenizer extends Tokenizer {
     int leftover = length - usableLength;
     System.arraycopy(buffer, usableLength, buffer, 0, leftover);
     int requested = buffer.length - leftover;
-    int returned = input.read(buffer, leftover, requested);
-    length = returned < 0 ? leftover : returned + leftover;
+    int returned = read(input, buffer, leftover, requested);
+    length = returned + leftover;
     if (returned < requested) /* reader has been emptied, process the rest */
       usableLength = length;
     else { /* still more data to be read, find a safe-stopping place */
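Note the simplified length computation: java.io.Reader.read returns -1 at end of stream, which the old code had to special-case, whereas the private read helper added in the next hunk returns the number of chars actually read (0 at EOF, never -1), so returned + leftover is already correct. An illustrative comparison, not code from the commit:

    // java.io.Reader contract: -1 signals end of stream, so callers must special-case it.
    int fromReader = input.read(buffer, leftover, requested);        // may be -1
    int lengthOld = fromReader < 0 ? leftover : fromReader + leftover;

    // readFully-style helper: result is always in [0, requested]; 0 only at EOF.
    int fromHelper = read(input, buffer, leftover, requested);
    int lengthNew = fromHelper + leftover;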
|
@ -167,6 +167,24 @@ public final class ICUTokenizer extends Tokenizer {
|
||||||
breaker.setText(buffer, 0, Math.max(0, usableLength));
|
breaker.setText(buffer, 0, Math.max(0, usableLength));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO: refactor to a shared readFully somewhere
|
||||||
|
// (NGramTokenizer does this too):
|
||||||
|
/** commons-io's readFully, but without bugs if offset != 0 */
|
||||||
|
private static int read(Reader input, char[] buffer, int offset, int length) throws IOException {
|
||||||
|
assert length >= 0 : "length must not be negative: " + length;
|
||||||
|
|
||||||
|
int remaining = length;
|
||||||
|
while ( remaining > 0 ) {
|
||||||
|
int location = length - remaining;
|
||||||
|
int count = input.read( buffer, offset + location, remaining );
|
||||||
|
if ( -1 == count ) { // EOF
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
remaining -= count;
|
||||||
|
}
|
||||||
|
return length - remaining;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* return true if there is a token from the buffer, or null if it is
|
* return true if there is a token from the buffer, or null if it is
|
||||||
* exhausted.
|
* exhausted.
|
||||||