From c20242721f9ee1c061c9323cee36c31e0605f61f Mon Sep 17 00:00:00 2001
From: Michael McCandless <mikemccand@apache.org>
Date: Tue, 20 Mar 2012 23:02:37 +0000
Subject: [PATCH] LUCENE-3894: some tokenizers weren't reading all input chars

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1303193 13f79535-47bb-0310-9956-ffa450edef68
---
 lucene/contrib/CHANGES.txt                    |  4 +
 .../analysis/BaseTokenStreamTestCase.java     | 91 ++++++++++++++++-
 .../lucene/analysis/MockReaderWrapper.java    | 98 +++++++++++++++++++
 .../apache/lucene/analysis/MockTokenizer.java |  7 +-
 .../analysis/ngram/EdgeNGramTokenizer.java    | 20 ++--
 .../lucene/analysis/ngram/NGramTokenizer.java | 18 ++--
 .../icu/segmentation/ICUTokenizer.java        | 22 ++++-
 7 files changed, 242 insertions(+), 18 deletions(-)
 create mode 100644 lucene/test-framework/src/java/org/apache/lucene/analysis/MockReaderWrapper.java

diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt
index 58bd49859ca..9f12ab46c36 100644
--- a/lucene/contrib/CHANGES.txt
+++ b/lucene/contrib/CHANGES.txt
@@ -271,6 +271,10 @@ Bug Fixes
 * LUCENE-3831: avoid NPE if the SpanQuery has a null field (eg a
   SpanOrQuery with no clauses added).  (Alan Woodward via Mike
   McCandless).
+
+* LUCENE-3894: ICUTokenizer, NGramTokenizer and EdgeNGramTokenizer
+  could stop early if the Reader only partially fills the provided
+  buffer.
 
 Documentation
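For context on the bug this patch fixes: java.io.Reader.read(char[], int, int) is allowed to return fewer characters than requested even when more input remains, so a tokenizer that trusts a single read() call can silently truncate its input. The following standalone sketch is illustrative only, not part of the patch; OneCharAtATimeReader and PartialReadDemo are made-up names:

    import java.io.IOException;
    import java.io.Reader;
    import java.io.StringReader;

    // Illustrative only: a Reader that legally returns at most one char per
    // read() call, much like the "spoon-feeding" MockReaderWrapper below.
    class OneCharAtATimeReader extends Reader {
      private final Reader in;
      OneCharAtATimeReader(Reader in) { this.in = in; }

      @Override
      public int read(char[] cbuf, int off, int len) throws IOException {
        // The Reader contract permits returning fewer chars than requested:
        return len == 0 ? 0 : in.read(cbuf, off, 1);
      }

      @Override
      public void close() throws IOException { in.close(); }
    }

    public class PartialReadDemo {
      public static void main(String[] args) throws IOException {
        char[] buffer = new char[1024];
        Reader reader = new OneCharAtATimeReader(new StringReader("hello world"));
        // The buggy pattern: trust a single read() to fill the buffer.
        int charsRead = reader.read(buffer);
        System.out.println(charsRead);  // prints 1, not 11 -- input truncated!
      }
    }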
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
index aaaffa7597a..6fc1d3d589a 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
@@ -177,8 +177,9 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
     }
     assertFalse("TokenStream has more tokens than expected", ts.incrementToken());
     ts.end();
-    if (finalOffset != null)
+    if (finalOffset != null) {
       assertEquals("finalOffset ", finalOffset.intValue(), offsetAtt.endOffset());
+    }
     if (offsetAtt != null) {
       assertTrue("finalOffset must be >= 0", offsetAtt.endOffset() >= 0);
     }
@@ -391,6 +392,8 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
     List<Integer> startOffsets = new ArrayList<Integer>();
     List<Integer> endOffsets = new ArrayList<Integer>();
     ts.reset();
+
+    // First pass: save away "correct" tokens
     while (ts.incrementToken()) {
       tokens.add(termAtt.toString());
       if (typeAtt != null) types.add(typeAtt.type());
@@ -403,12 +406,98 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
     }
     ts.end();
     ts.close();
+
     // verify reusing is "reproducible" and also get the normal tokenstream sanity checks
     if (!tokens.isEmpty()) {
+
+      // KWTokenizer (for example) can produce a token
+      // even when input is length 0:
+      if (text.length() != 0) {
+
+        // (Optional) second pass: do something evil:
+        final int evilness = random.nextInt(50);
+        if (evilness == 17) {
+          if (VERBOSE) {
+            System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis w/ exception");
+          }
+          // Throw an errant exception from the Reader:
+
+          MockReaderWrapper evilReader = new MockReaderWrapper(random, new StringReader(text));
+          evilReader.throwExcAfterChar(random.nextInt(text.length()+1));
+          reader = evilReader;
+
+          try {
+            // NOTE: some Tokenizers go and read characters
+            // when you call .setReader(Reader), eg
+            // PatternTokenizer.  This is a bit
+            // iffy... (really, they should only
+            // pull from the Reader when you call
+            // .incrementToken(), I think?), but we
+            // currently allow it, so, we must call
+            // a.tokenStream inside the try since we may
+            // hit the exc on init:
+            ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(evilReader, remainder) : evilReader);
+            ts.reset();
+            while (ts.incrementToken());
+            fail("did not hit exception");
+          } catch (RuntimeException re) {
+            assertTrue(MockReaderWrapper.isMyEvilException(re));
+          }
+          try {
+            ts.end();
+          } catch (AssertionError ae) {
+            // Catch & ignore MockTokenizer's
+            // anger...
+            if ("end() called before incrementToken() returned false!".equals(ae.getMessage())) {
+              // OK
+            } else {
+              throw ae;
+            }
+          }
+          ts.close();
+        } else if (evilness == 7) {
+          // Only consume a subset of the tokens:
+          final int numTokensToRead = random.nextInt(tokens.size());
+          if (VERBOSE) {
+            System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis, only consuming " + numTokensToRead + " of " + tokens.size() + " tokens");
+          }

+          reader = new StringReader(text);
+          ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
+          ts.reset();
+          for(int tokenCount=0;tokenCount<numTokensToRead;tokenCount++) {
+            assertTrue(ts.incrementToken());
+          }
+          try {
+            ts.end();
+          } catch (AssertionError ae) {
+            // Catch & ignore MockTokenizer's
+            // anger...
+            if ("end() called before incrementToken() returned false!".equals(ae.getMessage())) {
+              // OK
+            } else {
+              throw ae;
+            }
+          }
+          ts.close();
+        }
+      }
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockReaderWrapper.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockReaderWrapper.java
new file mode 100644
--- /dev/null
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockReaderWrapper.java
@@ -0,0 +1,98 @@
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Random;
+
+import org.apache.lucene.util._TestUtil;
+
+/** Wraps a Reader, and can throw random or fixed
+ *  exceptions, and spoon feeds lots of 1-char reads. */
+public class MockReaderWrapper extends Reader {
+
+  private final Reader in;
+  private final Random random;
+
+  private int excAtChar = -1;
+  private int readSoFar;
+  private boolean throwExcNext;
+
+  public MockReaderWrapper(Random random, Reader in) {
+    this.in = in;
+    this.random = random;
+  }
+
+  /** Throw an exception after reading this many chars. */
+  public void throwExcAfterChar(int charUpto) {
+    excAtChar = charUpto;
+    // You should only call this on init!:
+    assert readSoFar == 0;
+  }
+
+  public void throwExcNext() {
+    throwExcNext = true;
+  }
+
+  @Override
+  public void close() throws IOException {
+    in.close();
+  }
+
+  @Override
+  public int read(char[] cbuf, int off, int len) throws IOException {
+    if (throwExcNext || (excAtChar != -1 && readSoFar >= excAtChar)) {
+      throw new RuntimeException("fake exception now!");
+    }
+    final int read;
+    final int realLen;
+    if (len == 1) {
+      realLen = 1;
+    } else {
+      // Spoon-feed: intentionally maybe return less than
+      // the consumer asked for
+      realLen = _TestUtil.nextInt(random, 1, len);
+    }
+    if (excAtChar != -1) {
+      final int left = excAtChar - readSoFar;
+      assert left != 0;
+      read = in.read(cbuf, off, Math.min(realLen, left));
+      assert read != -1;
+      readSoFar += read;
+    } else {
+      read = in.read(cbuf, off, realLen);
+    }
+    return read;
+  }
+
+  @Override
+  public boolean markSupported() {
+    return false;
+  }
+
+  @Override
+  public boolean ready() {
+    return false;
+  }
+
+  public static boolean isMyEvilException(Throwable t) {
+    return (t instanceof RuntimeException) && "fake exception now!".equals(t.getMessage());
+  }
+}
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java
index 8b14cff4b8b..6a26b8cb248 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java
@@ -199,8 +199,11 @@ public class MockTokenizer extends Tokenizer {
     offsetAtt.setOffset(finalOffset, finalOffset);
     // some tokenizers, such as limiting tokenizers, call end() before incrementToken() returns false.
     // these tests should disable this check (in general you should consume the entire stream)
-    assert !enableChecks || streamState == State.INCREMENT_FALSE : "end() called before incrementToken() returned false!";
-    streamState = State.END;
+    try {
+      assert !enableChecks || streamState == State.INCREMENT_FALSE : "end() called before incrementToken() returned false!";
+    } finally {
+      streamState = State.END;
+    }
   }
 
   /**
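A sketch of how a test might drive the new MockReaderWrapper (hypothetical usage, not taken from the patch; it assumes the class as reconstructed above is on the classpath):

    import java.io.StringReader;
    import java.util.Random;

    import org.apache.lucene.analysis.MockReaderWrapper;

    // Hypothetical usage sketch for MockReaderWrapper (not part of the patch):
    public class MockReaderWrapperDemo {
      public static void main(String[] args) throws Exception {
        Random random = new Random();
        char[] buffer = new char[128];

        // Spoon-feeding: read() may return as little as 1 char per call,
        // which is exactly what exposed the under-reading tokenizers.
        MockReaderWrapper spoonFeeder = new MockReaderWrapper(random, new StringReader("some text"));
        int n = spoonFeeder.read(buffer, 0, buffer.length);
        System.out.println("got " + n + " of " + buffer.length + " requested chars");

        // Fixed exception: the read after the 5th char throws, letting tests
        // verify that tokenizers propagate Reader exceptions cleanly.
        MockReaderWrapper evil = new MockReaderWrapper(random, new StringReader("some text"));
        evil.throwExcAfterChar(5);
        try {
          while (evil.read(buffer, 0, buffer.length) != -1) {
            // consume until the fake exception fires
          }
        } catch (RuntimeException re) {
          System.out.println(MockReaderWrapper.isMyEvilException(re)); // true
        }
      }
    }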
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
index 029f4055b11..cfbf4b58295 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
@@ -183,15 +183,22 @@ public final class EdgeNGramTokenizer extends Tokenizer {
     // if we are just starting, read the whole input
     if (!started) {
       started = true;
+      gramSize = minGram;
       char[] chars = new char[1024];
-      charsRead = input.read(chars);
-      if (charsRead < 0) {
-        charsRead = inLen = 0;
+      charsRead = 0;
+      // TODO: refactor to a shared readFully somewhere:
+      while (charsRead < chars.length) {
+        int inc = input.read(chars, charsRead, chars.length-charsRead);
+        if (inc == -1) {
+          break;
+        }
+        charsRead += inc;
+      }
+
+      inStr = new String(chars, 0, charsRead).trim();  // remove any leading or trailing whitespace
+      inLen = inStr.length();
+      if (inLen == 0) {
         return false;
       }
-      inStr = new String(chars, 0, charsRead).trim();  // remove any leading or trailing spaces
-      inLen = inStr.length();
-      gramSize = minGram;
     }
 
     // if the remaining input is too short, we can't generate any n-grams
@@ -223,7 +230,6 @@ public final class EdgeNGramTokenizer extends Tokenizer {
   @Override
   public void reset(Reader input) throws IOException {
     super.reset(input);
-    reset();
   }
 
   @Override
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
index c4109933cb0..d8595cca501 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
@@ -105,13 +105,20 @@ public final class NGramTokenizer extends Tokenizer {
       started = true;
       gramSize = minGram;
       char[] chars = new char[1024];
-      charsRead = input.read(chars);
-      if (charsRead < 0) {
-        charsRead = inLen = 0;
+      charsRead = 0;
+      // TODO: refactor to a shared readFully somewhere:
+      while (charsRead < chars.length) {
+        int inc = input.read(chars, charsRead, chars.length-charsRead);
+        if (inc == -1) {
+          break;
+        }
+        charsRead += inc;
+      }
+
+      inStr = new String(chars, 0, charsRead).trim();  // remove any leading or trailing whitespace
+      inLen = inStr.length();
+      if (inLen == 0) {
         return false;
       }
-      inStr = new String(chars).trim();  // remove any trailing empty strings
-      inLen = inStr.length();
     }
 
     if (pos+gramSize > inLen) {            // if we hit the end of the string
@@ -140,7 +147,6 @@ public final class NGramTokenizer extends Tokenizer {
   @Override
   public void reset(Reader input) throws IOException {
     super.reset(input);
-    reset();
   }
 
   @Override
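Both n-gram tokenizers now carry the same read-until-full loop, which is what the TODO comments point at. A shared helper could look like this sketch (ReadFullyDemo and readFully are assumed names, not APIs introduced by this patch):

    import java.io.IOException;
    import java.io.Reader;
    import java.io.StringReader;

    // Hypothetical shared helper sketched from the TODO above; the patch
    // itself duplicates this loop in EdgeNGramTokenizer and NGramTokenizer.
    public class ReadFullyDemo {

      /** Reads until the buffer is full or EOF; returns the number of chars read. */
      static int readFully(Reader input, char[] buffer) throws IOException {
        int charsRead = 0;
        while (charsRead < buffer.length) {
          int inc = input.read(buffer, charsRead, buffer.length - charsRead);
          if (inc == -1) {
            break;  // EOF: the input held fewer chars than the buffer
          }
          charsRead += inc;
        }
        return charsRead;
      }

      public static void main(String[] args) throws IOException {
        char[] buffer = new char[1024];
        int n = readFully(new StringReader("spoon-fed input"), buffer);
        System.out.println(n + " chars: " + new String(buffer, 0, n));
      }
    }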
diff --git a/modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java b/modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java
index c823bcda992..e6d6437620b 100644
--- a/modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java
+++ b/modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java
@@ -151,8 +151,8 @@ public final class ICUTokenizer extends Tokenizer {
     int leftover = length - usableLength;
     System.arraycopy(buffer, usableLength, buffer, 0, leftover);
     int requested = buffer.length - leftover;
-    int returned = input.read(buffer, leftover, requested);
-    length = returned < 0 ? leftover : returned + leftover;
+    int returned = read(input, buffer, leftover, requested);
+    length = returned + leftover;
     if (returned < requested) /* reader has been emptied, process the rest */
       usableLength = length;
     else { /* still more data to be read, find a safe-stopping place */
@@ -167,6 +167,24 @@ public final class ICUTokenizer extends Tokenizer {
     breaker.setText(buffer, 0, Math.max(0, usableLength));
   }
 
+  // TODO: refactor to a shared readFully somewhere
+  // (NGramTokenizer does this too):
+  /** commons-io's readFully, but without bugs if offset != 0 */
+  private static int read(Reader input, char[] buffer, int offset, int length) throws IOException {
+    assert length >= 0 : "length must not be negative: " + length;
+
+    int remaining = length;
+    while ( remaining > 0 ) {
+      int location = length - remaining;
+      int count = input.read( buffer, offset + location, remaining );
+      if ( -1 == count ) { // EOF
+        break;
+      }
+      remaining -= count;
+    }
+    return length - remaining;
+  }
+
   /*
    * return true if there is a token from the buffer, or false if it is
    * exhausted.
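The javadoc above alludes to an offset bug in commons-io's readFully; the `offset + location` arithmetic is what keeps the leftover chars ICUTokenizer copies to the front of its buffer intact. A small standalone sketch (OffsetReadDemo is illustrative only, mirroring the patched helper):

    import java.io.IOException;
    import java.io.Reader;
    import java.io.StringReader;

    // Sketch of the offset handling the patched read() gets right. A naive
    // loop that passed `location` instead of `offset + location` would write
    // at buffer[0..] and clobber the leftover prefix.
    public class OffsetReadDemo {

      static int read(Reader input, char[] buffer, int offset, int length) throws IOException {
        int remaining = length;
        while (remaining > 0) {
          int location = length - remaining;
          int count = input.read(buffer, offset + location, remaining); // note: offset + location
          if (count == -1) { // EOF
            break;
          }
          remaining -= count;
        }
        return length - remaining;
      }

      public static void main(String[] args) throws IOException {
        char[] buffer = new char[8];
        buffer[0] = 'X';  // pretend this is a leftover char from the previous chunk
        int n = read(new StringReader("abcdefg"), buffer, 1, buffer.length - 1);
        // The leftover survives and the new chars land after it:
        System.out.println(buffer[0] + " / " + new String(buffer, 1, n)); // X / abcdefg
      }
    }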