mirror of https://github.com/apache/lucene.git
LUCENE-3894: some tokenizers weren't reading all input chars
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1303193 13f79535-47bb-0310-9956-ffa450edef68
parent d5683bea96
commit c20242721f
@@ -271,6 +271,10 @@ Bug Fixes
 * LUCENE-3831: avoid NPE if the SpanQuery has a null field (eg a
   SpanOrQuery with no clauses added).  (Alan Woodward via Mike
   McCandless).
 
+* LUCENE-3894: ICUTokenizer, NGramTokenizer and EdgeNGramTokenizer
+  could stop early if the Reader only partially fills the provided
+  buffer
+
 Documentation
 
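The underlying contract here: java.io.Reader.read(char[], int, int) may return fewer characters than requested even when more input remains, and it signals end-of-stream with -1, so a single read() cannot be assumed to fill the buffer. Below is a minimal sketch of the read-fully loop this commit applies in the affected tokenizers; the class and method names are illustrative, not part of the commit:

    import java.io.IOException;
    import java.io.Reader;

    final class ReadFullySketch {
      /** Reads until buf is full or EOF is hit; returns the number of chars actually read. */
      static int readFully(Reader in, char[] buf) throws IOException {
        int count = 0;
        while (count < buf.length) {
          int inc = in.read(buf, count, buf.length - count);
          if (inc == -1) {
            break; // EOF before the buffer was filled
          }
          count += inc;
        }
        return count;
      }
    }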
@@ -177,8 +177,9 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
     }
     assertFalse("TokenStream has more tokens than expected", ts.incrementToken());
     ts.end();
-    if (finalOffset != null)
+    if (finalOffset != null) {
       assertEquals("finalOffset ", finalOffset.intValue(), offsetAtt.endOffset());
+    }
     if (offsetAtt != null) {
       assertTrue("finalOffset must be >= 0", offsetAtt.endOffset() >= 0);
     }
@@ -391,6 +392,8 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
     List<Integer> startOffsets = new ArrayList<Integer>();
     List<Integer> endOffsets = new ArrayList<Integer>();
     ts.reset();
+
+    // First pass: save away "correct" tokens
     while (ts.incrementToken()) {
       tokens.add(termAtt.toString());
       if (typeAtt != null) types.add(typeAtt.type());
@@ -403,12 +406,98 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
     }
     ts.end();
     ts.close();
 
     // verify reusing is "reproducable" and also get the normal tokenstream sanity checks
     if (!tokens.isEmpty()) {
+
+      // KWTokenizer (for example) can produce a token
+      // even when input is length 0:
+      if (text.length() != 0) {
+
+        // (Optional) second pass: do something evil:
+        final int evilness = random.nextInt(50);
+        if (evilness == 17) {
+          if (VERBOSE) {
+            System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis w/ exception");
+          }
+          // Throw an errant exception from the Reader:
+
+          MockReaderWrapper evilReader = new MockReaderWrapper(random, new StringReader(text));
+          evilReader.throwExcAfterChar(random.nextInt(text.length()+1));
+          reader = evilReader;
+
+          try {
+            // NOTE: some Tokenizers go and read characters
+            // when you call .setReader(Reader), eg
+            // PatternTokenizer.  This is a bit
+            // iffy... (really, they should only
+            // pull from the Reader when you call
+            // .incremenToken(), I think?), but we
+            // currently allow it, so, we must call
+            // a.tokenStream inside the try since we may
+            // hit the exc on init:
+            ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(evilReader, remainder) : evilReader);
+            ts.reset();
+            while (ts.incrementToken());
+            fail("did not hit exception");
+          } catch (RuntimeException re) {
+            assertTrue(MockReaderWrapper.isMyEvilException(re));
+          }
+          try {
+            ts.end();
+          } catch (AssertionError ae) {
+            // Catch & ignore MockTokenizer's
+            // anger...
+            if ("end() called before incrementToken() returned false!".equals(ae.getMessage())) {
+              // OK
+            } else {
+              throw ae;
+            }
+          }
+          ts.close();
+        } else if (evilness == 7) {
+          // Only consume a subset of the tokens:
+          final int numTokensToRead = random.nextInt(tokens.size());
+          if (VERBOSE) {
+            System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis, only consuming " + numTokensToRead + " of " + tokens.size() + " tokens");
+          }
+
+          reader = new StringReader(text);
+          ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
+          ts.reset();
+          for(int tokenCount=0;tokenCount<numTokensToRead;tokenCount++) {
+            assertTrue(ts.incrementToken());
+          }
+          try {
+            ts.end();
+          } catch (AssertionError ae) {
+            // Catch & ignore MockTokenizer's
+            // anger...
+            if ("end() called before incrementToken() returned false!".equals(ae.getMessage())) {
+              // OK
+            } else {
+              throw ae;
+            }
+          }
+          ts.close();
+        }
+      }
+
+      // Final pass: verify clean tokenization matches
+      // results from first pass:
       if (VERBOSE) {
         System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis; " + tokens.size() + " tokens");
       }
       reader = new StringReader(text);
+
+      if (random.nextInt(30) == 7) {
+        if (VERBOSE) {
+          System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: using spoon-feed reader");
+        }
+
+        reader = new MockReaderWrapper(random, reader);
+      }
+
       ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
       if (typeAtt != null && posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
         // offset + pos + posLength + type
@@ -0,0 +1,98 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Random;
+
+import org.apache.lucene.util._TestUtil;
+
+/** Wraps a Reader, and can throw random or fixed
+ *  exceptions, and spoon feed read chars. */
+
+public class MockReaderWrapper extends Reader {
+
+  private final Reader in;
+  private final Random random;
+
+  private int excAtChar = -1;
+  private int readSoFar;
+  private boolean throwExcNext;
+
+  public MockReaderWrapper(Random random, Reader in) {
+    this.in = in;
+    this.random = random;
+  }
+
+  /** Throw an exception after reading this many chars. */
+  public void throwExcAfterChar(int charUpto) {
+    excAtChar = charUpto;
+    // You should only call this on init!:
+    assert readSoFar == 0;
+  }
+
+  public void throwExcNext() {
+    throwExcNext = true;
+  }
+
+  @Override
+  public void close() throws IOException {
+    in.close();
+  }
+
+  @Override
+  public int read(char[] cbuf, int off, int len) throws IOException {
+    if (throwExcNext || (excAtChar != -1 && readSoFar >= excAtChar)) {
+      throw new RuntimeException("fake exception now!");
+    }
+    final int read;
+    final int realLen;
+    if (len == 1) {
+      realLen = 1;
+    } else {
+      // Spoon-feed: intentionally maybe return less than
+      // the consumer asked for
+      realLen = _TestUtil.nextInt(random, 1, len);
+    }
+    if (excAtChar != -1) {
+      final int left = excAtChar - readSoFar;
+      assert left != 0;
+      read = in.read(cbuf, off, Math.min(realLen, left));
+      assert read != -1;
+      readSoFar += read;
+    } else {
+      read = in.read(cbuf, off, realLen);
+    }
+    return read;
+  }
+
+  @Override
+  public boolean markSupported() {
+    return false;
+  }
+
+  @Override
+  public boolean ready() {
+    return false;
+  }
+
+  public static boolean isMyEvilException(Throwable t) {
+    return (t instanceof RuntimeException) && "fake exception now!".equals(t.getMessage());
+  }
+}
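A minimal sketch of how a test might drive this wrapper, mirroring the two ways the BaseTokenStreamTestCase changes above use it (spoon-fed reads and an injected failure); the class and method below are illustrative scaffolding, not part of the commit:

    import java.io.IOException;
    import java.io.StringReader;
    import java.util.Random;

    class MockReaderWrapperUsageSketch {
      static void demo() throws IOException {
        Random random = new Random();
        char[] buf = new char[16];

        // Spoon-feeding: each read() returns between 1 and len chars, so a
        // consumer that assumes one read() fills its buffer is caught out.
        MockReaderWrapper spoonFed = new MockReaderWrapper(random, new StringReader("some text"));
        int n = spoonFed.read(buf, 0, buf.length); // may be as small as 1

        // Failure injection: once 3 chars have been handed out, the next
        // read() throws the wrapper's "fake exception now!" RuntimeException.
        MockReaderWrapper evil = new MockReaderWrapper(random, new StringReader("some text"));
        evil.throwExcAfterChar(3);
        try {
          while (evil.read(buf, 0, buf.length) != -1) { }
        } catch (RuntimeException e) {
          assert MockReaderWrapper.isMyEvilException(e);
        }
      }
    }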
@@ -199,8 +199,11 @@ public class MockTokenizer extends Tokenizer {
     offsetAtt.setOffset(finalOffset, finalOffset);
     // some tokenizers, such as limiting tokenizers, call end() before incrementToken() returns false.
     // these tests should disable this check (in general you should consume the entire stream)
-    assert !enableChecks || streamState == State.INCREMENT_FALSE : "end() called before incrementToken() returned false!";
-    streamState = State.END;
+    try {
+      assert !enableChecks || streamState == State.INCREMENT_FALSE : "end() called before incrementToken() returned false!";
+    } finally {
+      streamState = State.END;
+    }
   }
 
   /**
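The switch to try/finally matters for the new test paths above: they deliberately call end() before incrementToken() has returned false and then swallow MockTokenizer's AssertionError, so the stream state still has to advance to END even when the check fires, presumably so that the subsequent close() and reuse checks see a consistent state. A generic sketch of the pattern (the condition name is illustrative):

    // Validate the transition, but always advance the state machine,
    // even if the assertion throws.
    try {
      assert legalTransition : "end() called before incrementToken() returned false!";
    } finally {
      streamState = State.END;
    }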
|
@ -183,15 +183,22 @@ public final class EdgeNGramTokenizer extends Tokenizer {
|
||||||
// if we are just starting, read the whole input
|
// if we are just starting, read the whole input
|
||||||
if (!started) {
|
if (!started) {
|
||||||
started = true;
|
started = true;
|
||||||
|
gramSize = minGram;
|
||||||
char[] chars = new char[1024];
|
char[] chars = new char[1024];
|
||||||
charsRead = input.read(chars);
|
charsRead = 0;
|
||||||
if (charsRead < 0) {
|
// TODO: refactor to a shared readFully somewhere:
|
||||||
charsRead = inLen = 0;
|
while (charsRead < chars.length) {
|
||||||
|
int inc = input.read(chars, charsRead, chars.length-charsRead);
|
||||||
|
if (inc == -1) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
charsRead += inc;
|
||||||
|
}
|
||||||
|
inStr = new String(chars, 0, charsRead).trim(); // remove any trailing empty strings
|
||||||
|
inLen = inStr.length();
|
||||||
|
if (inLen == 0) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
inStr = new String(chars, 0, charsRead).trim(); // remove any leading or trailing spaces
|
|
||||||
inLen = inStr.length();
|
|
||||||
gramSize = minGram;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// if the remaining input is too short, we can't generate any n-grams
|
// if the remaining input is too short, we can't generate any n-grams
|
||||||
|
@@ -223,7 +230,6 @@ public final class EdgeNGramTokenizer extends Tokenizer {
   @Override
   public void reset(Reader input) throws IOException {
     super.reset(input);
-    reset();
   }
 
   @Override
@@ -105,13 +105,20 @@ public final class NGramTokenizer extends Tokenizer {
       started = true;
       gramSize = minGram;
       char[] chars = new char[1024];
-      charsRead = input.read(chars);
-      if (charsRead < 0) {
-        charsRead = inLen = 0;
+      charsRead = 0;
+      // TODO: refactor to a shared readFully somewhere:
+      while (charsRead < chars.length) {
+        int inc = input.read(chars, charsRead, chars.length-charsRead);
+        if (inc == -1) {
+          break;
+        }
+        charsRead += inc;
+      }
+      inStr = new String(chars, 0, charsRead).trim();  // remove any trailing empty strings
+      inLen = inStr.length();
+      if (inLen == 0) {
         return false;
       }
-      inStr = new String(chars).trim();  // remove any trailing empty strings
-      inLen = inStr.length();
     }
 
     if (pos+gramSize > inLen) {  // if we hit the end of the string
@@ -140,7 +147,6 @@ public final class NGramTokenizer extends Tokenizer {
   @Override
   public void reset(Reader input) throws IOException {
     super.reset(input);
-    reset();
   }
 
   @Override
@@ -151,8 +151,8 @@ public final class ICUTokenizer extends Tokenizer {
     int leftover = length - usableLength;
     System.arraycopy(buffer, usableLength, buffer, 0, leftover);
     int requested = buffer.length - leftover;
-    int returned = input.read(buffer, leftover, requested);
-    length = returned < 0 ? leftover : returned + leftover;
+    int returned = read(input, buffer, leftover, requested);
+    length = returned + leftover;
     if (returned < requested) /* reader has been emptied, process the rest */
       usableLength = length;
     else { /* still more data to be read, find a safe-stopping place */
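Note the simplified length computation: java.io.Reader.read returns -1 at end of stream, which the old code had to special-case, whereas the private read helper added in the next hunk returns the number of chars actually read (0 at EOF, never -1), so returned + leftover is already correct. An illustrative comparison, not code from the commit:

    // java.io.Reader contract: -1 signals end of stream, so callers must special-case it.
    int fromReader = input.read(buffer, leftover, requested);        // may be -1
    int lengthOld = fromReader < 0 ? leftover : fromReader + leftover;

    // readFully-style helper: result is always in [0, requested]; 0 only at EOF.
    int fromHelper = read(input, buffer, leftover, requested);
    int lengthNew = fromHelper + leftover;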
|
@ -167,6 +167,24 @@ public final class ICUTokenizer extends Tokenizer {
|
||||||
breaker.setText(buffer, 0, Math.max(0, usableLength));
|
breaker.setText(buffer, 0, Math.max(0, usableLength));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO: refactor to a shared readFully somewhere
|
||||||
|
// (NGramTokenizer does this too):
|
||||||
|
/** commons-io's readFully, but without bugs if offset != 0 */
|
||||||
|
private static int read(Reader input, char[] buffer, int offset, int length) throws IOException {
|
||||||
|
assert length >= 0 : "length must not be negative: " + length;
|
||||||
|
|
||||||
|
int remaining = length;
|
||||||
|
while ( remaining > 0 ) {
|
||||||
|
int location = length - remaining;
|
||||||
|
int count = input.read( buffer, offset + location, remaining );
|
||||||
|
if ( -1 == count ) { // EOF
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
remaining -= count;
|
||||||
|
}
|
||||||
|
return length - remaining;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* return true if there is a token from the buffer, or null if it is
|
* return true if there is a token from the buffer, or null if it is
|
||||||
* exhausted.
|
* exhausted.
|
||||||