LUCENE-7465: fix corner case in SimplePattern/SplitTokenizer when lookahead hits end of input

This commit is contained in:
Mike McCandless 2017-02-21 10:51:38 -05:00
parent ac38872a79
commit 2d03aa21a2
4 changed files with 25 additions and 6 deletions

View File

@ -135,13 +135,12 @@ public final class SimplePatternSplitTokenizer extends Tokenizer {
} while (state != -1);
if (lastAcceptLength != -1) {
// strip the trailing separater we just matched from the token:
tokenUpto -= lastAcceptLength;
// we found a token separator
// we found a token separator; strip the trailing separator we just matched from the token:
int extra = sepUpto - lastAcceptLength;
if (extra != 0) {
pushBack(extra);
}
tokenUpto -= lastAcceptLength;
if (tokenUpto > 0) {
fillToken(offsetStart);
return true;
@ -187,14 +186,14 @@ public final class SimplePatternSplitTokenizer extends Tokenizer {
tokenUpto -= count;
assert tokenUpto >= 0;
if (pendingLimit == 0) {
if (bufferNextRead >= count) {
if (bufferLimit != -1 && bufferNextRead >= count) {
// optimize common case when the chars we are pushing back are still in the buffer
bufferNextRead -= count;
} else {
if (count > pendingChars.length) {
pendingChars = ArrayUtil.grow(pendingChars, count);
}
System.arraycopy(termAtt.buffer(), tokenUpto - count, pendingChars, 0, count);
System.arraycopy(termAtt.buffer(), tokenUpto, pendingChars, 0, count);
pendingLimit = count;
}
} else {

View File

@ -172,7 +172,7 @@ public final class SimplePatternTokenizer extends Tokenizer {
private void pushBack(int count) {
if (pendingLimit == 0) {
if (bufferNextRead >= count) {
if (bufferLimit != -1 && bufferNextRead >= count) {
// optimize common case when the chars we are pushing back are still in the buffer
bufferNextRead -= count;
} else {

View File

@ -270,4 +270,14 @@ public class TestSimplePatternSplitTokenizer extends BaseTokenStreamTestCase {
checkRandomData(random(), b, 1000*RANDOM_MULTIPLIER);
b.close();
}
public void testEndLookahead() throws Exception {
  // Regression test for LUCENE-7465: pattern lookahead that runs into end-of-input.
  // With pattern "(ab)+" over "aba", the leading "ab" matches as a separator; the
  // automaton then looks ahead past the trailing "a" and hits end of input.  That
  // trailing "a" must still come back as a token at offsets [2,3), final offset 3.
  Tokenizer tokenizer = new SimplePatternSplitTokenizer("(ab)+");
  tokenizer.setReader(new StringReader("aba"));
  assertTokenStreamContents(tokenizer,
      new String[] {"a"},
      new int[] {2},   // start offsets
      new int[] {3},   // end offsets
      3);              // final offset
}
}

View File

@ -215,4 +215,14 @@ public class TestSimplePatternTokenizer extends BaseTokenStreamTestCase {
checkRandomData(random(), b, 1000*RANDOM_MULTIPLIER);
b.close();
}
public void testEndLookahead() throws Exception {
  // Regression test for LUCENE-7465: pattern lookahead that runs into end-of-input.
  // With pattern "(ab)+" over "aba", the tokenizer greedily tries to extend the
  // match past "ab" and hits end of input mid-pattern.  It must fall back to the
  // last accepted match, emitting the single token "ab" at [0,2), final offset 3.
  Tokenizer tokenizer = new SimplePatternTokenizer("(ab)+");
  tokenizer.setReader(new StringReader("aba"));
  assertTokenStreamContents(tokenizer,
      new String[] {"ab"},
      new int[] {0},   // start offsets
      new int[] {2},   // end offsets
      3);              // final offset
}
}