From c20242721f9ee1c061c9323cee36c31e0605f61f Mon Sep 17 00:00:00 2001
From: Michael McCandless <mikemccand@apache.org>
Date: Tue, 20 Mar 2012 23:02:37 +0000
Subject: [PATCH] LUCENE-3894: some tokenizers weren't reading all input chars

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1303193 13f79535-47bb-0310-9956-ffa450edef68
---
 lucene/contrib/CHANGES.txt                    |  4 +
 .../analysis/BaseTokenStreamTestCase.java     | 91 ++++++++++++++++-
 .../lucene/analysis/MockReaderWrapper.java    | 98 +++++++++++++++++++
 .../apache/lucene/analysis/MockTokenizer.java |  7 +-
 .../analysis/ngram/EdgeNGramTokenizer.java    | 20 ++--
 .../lucene/analysis/ngram/NGramTokenizer.java | 18 ++--
 .../icu/segmentation/ICUTokenizer.java        | 22 ++++-
 7 files changed, 242 insertions(+), 18 deletions(-)
 create mode 100644 lucene/test-framework/src/java/org/apache/lucene/analysis/MockReaderWrapper.java

diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt
index 58bd49859ca..9f12ab46c36 100644
--- a/lucene/contrib/CHANGES.txt
+++ b/lucene/contrib/CHANGES.txt
@@ -271,6 +271,10 @@ Bug Fixes
 * LUCENE-3831: avoid NPE if the SpanQuery has a null field (eg a
   SpanOrQuery with no clauses added).  (Alan Woodward via Mike
   McCandless).
+
+* LUCENE-3894: ICUTokenizer, NGramTokenizer and EdgeNGramTokenizer
+  could stop early if the Reader only partially fills the provided
+  buffer.
 
 Documentation
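For context on the bug this patch fixes: java.io.Reader.read(char[], int, int) is allowed to return fewer characters than requested even when more input remains, so a tokenizer that trusts a single read() call can silently truncate its input. The following standalone sketch is illustrative only, not part of the patch; OneCharAtATimeReader and PartialReadDemo are made-up names:

    import java.io.IOException;
    import java.io.Reader;
    import java.io.StringReader;

    // Illustrative only: a Reader that legally returns at most one char per
    // read() call, much like the "spoon-feeding" MockReaderWrapper below.
    class OneCharAtATimeReader extends Reader {
      private final Reader in;
      OneCharAtATimeReader(Reader in) { this.in = in; }

      @Override
      public int read(char[] cbuf, int off, int len) throws IOException {
        // The Reader contract permits returning fewer chars than requested:
        return len == 0 ? 0 : in.read(cbuf, off, 1);
      }

      @Override
      public void close() throws IOException { in.close(); }
    }

    public class PartialReadDemo {
      public static void main(String[] args) throws IOException {
        char[] buffer = new char[1024];
        Reader reader = new OneCharAtATimeReader(new StringReader("hello world"));
        // The buggy pattern: trust a single read() to fill the buffer.
        int charsRead = reader.read(buffer);
        System.out.println(charsRead);  // prints 1, not 11 -- input truncated!
      }
    }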
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
index aaaffa7597a..6fc1d3d589a 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
@@ -177,8 +177,9 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
     }
     assertFalse("TokenStream has more tokens than expected", ts.incrementToken());
     ts.end();
-    if (finalOffset != null)
+    if (finalOffset != null) {
       assertEquals("finalOffset ", finalOffset.intValue(), offsetAtt.endOffset());
+    }
     if (offsetAtt != null) {
       assertTrue("finalOffset must be >= 0", offsetAtt.endOffset() >= 0);
     }
@@ -391,6 +392,8 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
     List<Integer> startOffsets = new ArrayList<Integer>();
     List<Integer> endOffsets = new ArrayList<Integer>();
     ts.reset();
+
+    // First pass: save away "correct" tokens
     while (ts.incrementToken()) {
       tokens.add(termAtt.toString());
       if (typeAtt != null) types.add(typeAtt.type());
@@ -403,12 +406,98 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
     }
     ts.end();
     ts.close();
+
     // verify reusing is "reproducible" and also get the normal tokenstream sanity checks
     if (!tokens.isEmpty()) {
+
+      // KWTokenizer (for example) can produce a token
+      // even when input is length 0:
+      if (text.length() != 0) {
+
+        // (Optional) second pass: do something evil:
+        final int evilness = random.nextInt(50);
+        if (evilness == 17) {
+          if (VERBOSE) {
+            System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis w/ exception");
+          }
+          // Throw an errant exception from the Reader:
+
+          MockReaderWrapper evilReader = new MockReaderWrapper(random, new StringReader(text));
+          evilReader.throwExcAfterChar(random.nextInt(text.length()+1));
+          reader = evilReader;
+
+          try {
+            // NOTE: some Tokenizers go and read characters
+            // when you call .setReader(Reader), eg
+            // PatternTokenizer.  This is a bit
+            // iffy... (really, they should only
+            // pull from the Reader when you call
+            // .incrementToken(), I think?), but we
+            // currently allow it, so, we must call
+            // a.tokenStream inside the try since we may
+            // hit the exc on init:
+            ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(evilReader, remainder) : evilReader);
+            ts.reset();
+            while (ts.incrementToken());
+            fail("did not hit exception");
+          } catch (RuntimeException re) {
+            assertTrue(MockReaderWrapper.isMyEvilException(re));
+          }
+          try {
+            ts.end();
+          } catch (AssertionError ae) {
+            // Catch & ignore MockTokenizer's
+            // anger...
+            if ("end() called before incrementToken() returned false!".equals(ae.getMessage())) {
+              // OK
+            } else {
+              throw ae;
+            }
+          }
+          ts.close();
+        } else if (evilness == 7) {
+          // Only consume a subset of the tokens:
+          final int numTokensToRead = random.nextInt(tokens.size());
+          if (VERBOSE) {
+            System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis, only consuming " + numTokensToRead + " of " + tokens.size() + " tokens");
+          }

+          reader = new StringReader(text);
+          ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
+          ts.reset();
+          for(int tokenCount=0;tokenCount<numTokensToRead;tokenCount++) {
+            assertTrue(ts.incrementToken());
+          }
+          try {
+            ts.end();
+          } catch (AssertionError ae) {
+            // Catch & ignore MockTokenizer's
+            // anger...
+            if ("end() called before incrementToken() returned false!".equals(ae.getMessage())) {
+              // OK
+            } else {
+              throw ae;
+            }
+          }
+          ts.close();
+        }
+      }
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockReaderWrapper.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockReaderWrapper.java
new file mode 100644
--- /dev/null
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockReaderWrapper.java
@@ -0,0 +1,98 @@
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Random;
+
+import org.apache.lucene.util._TestUtil;
+
+/** Wraps a Reader, and can throw random or fixed
+ *  exceptions, and spoon feeds lots of 1-char reads. */
+public class MockReaderWrapper extends Reader {
+
+  private final Reader in;
+  private final Random random;
+
+  private int excAtChar = -1;
+  private int readSoFar;
+  private boolean throwExcNext;
+
+  public MockReaderWrapper(Random random, Reader in) {
+    this.in = in;
+    this.random = random;
+  }
+
+  /** Throw an exception after reading this many chars. */
+  public void throwExcAfterChar(int charUpto) {
+    excAtChar = charUpto;
+    // You should only call this on init!:
+    assert readSoFar == 0;
+  }
+
+  public void throwExcNext() {
+    throwExcNext = true;
+  }
+
+  @Override
+  public void close() throws IOException {
+    in.close();
+  }
+
+  @Override
+  public int read(char[] cbuf, int off, int len) throws IOException {
+    if (throwExcNext || (excAtChar != -1 && readSoFar >= excAtChar)) {
+      throw new RuntimeException("fake exception now!");
+    }
+    final int read;
+    final int realLen;
+    if (len == 1) {
+      realLen = 1;
+    } else {
+      // Spoon-feed: intentionally maybe return less than
+      // the consumer asked for
+      realLen = _TestUtil.nextInt(random, 1, len);
+    }
+    if (excAtChar != -1) {
+      final int left = excAtChar - readSoFar;
+      assert left != 0;
+      read = in.read(cbuf, off, Math.min(realLen, left));
+      assert read != -1;
+      readSoFar += read;
+    } else {
+      read = in.read(cbuf, off, realLen);
+    }
+    return read;
+  }
+
+  @Override
+  public boolean markSupported() {
+    return false;
+  }
+
+  @Override
+  public boolean ready() {
+    return false;
+  }
+
+  public static boolean isMyEvilException(Throwable t) {
+    return (t instanceof RuntimeException) && "fake exception now!".equals(t.getMessage());
+  }
+}
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java
index 8b14cff4b8b..6a26b8cb248 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java
@@ -199,8 +199,11 @@ public class MockTokenizer extends Tokenizer {
     offsetAtt.setOffset(finalOffset, finalOffset);
     // some tokenizers, such as limiting tokenizers, call end() before incrementToken() returns false.
     // these tests should disable this check (in general you should consume the entire stream)
-    assert !enableChecks || streamState == State.INCREMENT_FALSE : "end() called before incrementToken() returned false!";
-    streamState = State.END;
+    try {
+      assert !enableChecks || streamState == State.INCREMENT_FALSE : "end() called before incrementToken() returned false!";
+    } finally {
+      streamState = State.END;
+    }
   }
 
   /**
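A sketch of how a test might drive the new MockReaderWrapper (hypothetical usage, not taken from the patch; it assumes the class as reconstructed above is on the classpath):

    import java.io.StringReader;
    import java.util.Random;

    import org.apache.lucene.analysis.MockReaderWrapper;

    // Hypothetical usage sketch for MockReaderWrapper (not part of the patch):
    public class MockReaderWrapperDemo {
      public static void main(String[] args) throws Exception {
        Random random = new Random();
        char[] buffer = new char[128];

        // Spoon-feeding: read() may return as little as 1 char per call,
        // which is exactly what exposed the under-reading tokenizers.
        MockReaderWrapper spoonFeeder = new MockReaderWrapper(random, new StringReader("some text"));
        int n = spoonFeeder.read(buffer, 0, buffer.length);
        System.out.println("got " + n + " of " + buffer.length + " requested chars");

        // Fixed exception: the read after the 5th char throws, letting tests
        // verify that tokenizers propagate Reader exceptions cleanly.
        MockReaderWrapper evil = new MockReaderWrapper(random, new StringReader("some text"));
        evil.throwExcAfterChar(5);
        try {
          while (evil.read(buffer, 0, buffer.length) != -1) {
            // consume until the fake exception fires
          }
        } catch (RuntimeException re) {
          System.out.println(MockReaderWrapper.isMyEvilException(re)); // true
        }
      }
    }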
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
index 029f4055b11..cfbf4b58295 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
@@ -183,15 +183,22 @@ public final class EdgeNGramTokenizer extends Tokenizer {
     // if we are just starting, read the whole input
     if (!started) {
       started = true;
+      gramSize = minGram;
       char[] chars = new char[1024];
-      charsRead = input.read(chars);
-      if (charsRead < 0) {
-        charsRead = inLen = 0;
+      charsRead = 0;
+      // TODO: refactor to a shared readFully somewhere:
+      while (charsRead < chars.length) {
+        int inc = input.read(chars, charsRead, chars.length-charsRead);
+        if (inc == -1) {
+          break;
+        }
+        charsRead += inc;
+      }
+
+      inStr = new String(chars, 0, charsRead).trim();  // remove any leading or trailing whitespace
+      inLen = inStr.length();
+      if (inLen == 0) {
         return false;
       }
-      inStr = new String(chars, 0, charsRead).trim();  // remove any leading or trailing spaces
-      inLen = inStr.length();
-      gramSize = minGram;
     }
 
     // if the remaining input is too short, we can't generate any n-grams
@@ -223,7 +230,6 @@ public final class EdgeNGramTokenizer extends Tokenizer {
   @Override
   public void reset(Reader input) throws IOException {
     super.reset(input);
-    reset();
   }
 
   @Override
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
index c4109933cb0..d8595cca501 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
@@ -105,13 +105,20 @@ public final class NGramTokenizer extends Tokenizer {
       started = true;
       gramSize = minGram;
       char[] chars = new char[1024];
-      charsRead = input.read(chars);
-      if (charsRead < 0) {
-        charsRead = inLen = 0;
+      charsRead = 0;
+      // TODO: refactor to a shared readFully somewhere:
+      while (charsRead < chars.length) {
+        int inc = input.read(chars, charsRead, chars.length-charsRead);
+        if (inc == -1) {
+          break;
+        }
+        charsRead += inc;
+      }
+
+      inStr = new String(chars, 0, charsRead).trim();  // remove any leading or trailing whitespace
+      inLen = inStr.length();
+      if (inLen == 0) {
         return false;
       }
-      inStr = new String(chars).trim();  // remove any trailing empty strings
-      inLen = inStr.length();
     }
 
     if (pos+gramSize > inLen) {            // if we hit the end of the string
@@ -140,7 +147,6 @@ public final class NGramTokenizer extends Tokenizer {
   @Override
   public void reset(Reader input) throws IOException {
     super.reset(input);
-    reset();
   }
 
   @Override
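Both n-gram tokenizers now carry the same read-until-full loop, which is what the TODO comments point at. A shared helper could look like this sketch (ReadFullyDemo and readFully are assumed names, not APIs introduced by this patch):

    import java.io.IOException;
    import java.io.Reader;
    import java.io.StringReader;

    // Hypothetical shared helper sketched from the TODO above; the patch
    // itself duplicates this loop in EdgeNGramTokenizer and NGramTokenizer.
    public class ReadFullyDemo {

      /** Reads until the buffer is full or EOF; returns the number of chars read. */
      static int readFully(Reader input, char[] buffer) throws IOException {
        int charsRead = 0;
        while (charsRead < buffer.length) {
          int inc = input.read(buffer, charsRead, buffer.length - charsRead);
          if (inc == -1) {
            break;  // EOF: the input held fewer chars than the buffer
          }
          charsRead += inc;
        }
        return charsRead;
      }

      public static void main(String[] args) throws IOException {
        char[] buffer = new char[1024];
        int n = readFully(new StringReader("spoon-fed input"), buffer);
        System.out.println(n + " chars: " + new String(buffer, 0, n));
      }
    }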
diff --git a/modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java b/modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java
index c823bcda992..e6d6437620b 100644
--- a/modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java
+++ b/modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java
@@ -151,8 +151,8 @@ public final class ICUTokenizer extends Tokenizer {
     int leftover = length - usableLength;
     System.arraycopy(buffer, usableLength, buffer, 0, leftover);
     int requested = buffer.length - leftover;
-    int returned = input.read(buffer, leftover, requested);
-    length = returned < 0 ? leftover : returned + leftover;
+    int returned = read(input, buffer, leftover, requested);
+    length = returned + leftover;
     if (returned < requested) /* reader has been emptied, process the rest */
       usableLength = length;
     else { /* still more data to be read, find a safe-stopping place */
@@ -167,6 +167,24 @@ public final class ICUTokenizer extends Tokenizer {
     breaker.setText(buffer, 0, Math.max(0, usableLength));
   }
 
+  // TODO: refactor to a shared readFully somewhere
+  // (NGramTokenizer does this too):
+  /** commons-io's readFully, but without bugs if offset != 0 */
+  private static int read(Reader input, char[] buffer, int offset, int length) throws IOException {
+    assert length >= 0 : "length must not be negative: " + length;
+
+    int remaining = length;
+    while ( remaining > 0 ) {
+      int location = length - remaining;
+      int count = input.read( buffer, offset + location, remaining );
+      if ( -1 == count ) { // EOF
+        break;
+      }
+      remaining -= count;
+    }
+    return length - remaining;
+  }
+
   /*
    * return true if there is a token from the buffer, or false if it is
    * exhausted.
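The javadoc above alludes to an offset bug in commons-io's readFully; the `offset + location` arithmetic is what keeps the leftover chars ICUTokenizer copies to the front of its buffer intact. A small standalone sketch (OffsetReadDemo is illustrative only, mirroring the patched helper):

    import java.io.IOException;
    import java.io.Reader;
    import java.io.StringReader;

    // Sketch of the offset handling the patched read() gets right. A naive
    // loop that passed `location` instead of `offset + location` would write
    // at buffer[0..] and clobber the leftover prefix.
    public class OffsetReadDemo {

      static int read(Reader input, char[] buffer, int offset, int length) throws IOException {
        int remaining = length;
        while (remaining > 0) {
          int location = length - remaining;
          int count = input.read(buffer, offset + location, remaining); // note: offset + location
          if (count == -1) { // EOF
            break;
          }
          remaining -= count;
        }
        return length - remaining;
      }

      public static void main(String[] args) throws IOException {
        char[] buffer = new char[8];
        buffer[0] = 'X';  // pretend this is a leftover char from the previous chunk
        int n = read(new StringReader("abcdefg"), buffer, 1, buffer.length - 1);
        // The leftover survives and the new chars land after it:
        System.out.println(buffer[0] + " / " + new String(buffer, 1, n)); // X / abcdefg
      }
    }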