mirror of https://github.com/apache/lucene.git
LUCENE-7465: fix corner case in SimplePattern/SplitTokenizer when lookahead hits end of input
This commit is contained in:
parent
ac38872a79
commit
2d03aa21a2
|
@ -135,13 +135,12 @@ public final class SimplePatternSplitTokenizer extends Tokenizer {
|
|||
} while (state != -1);
|
||||
|
||||
if (lastAcceptLength != -1) {
|
||||
// strip the trailing separater we just matched from the token:
|
||||
tokenUpto -= lastAcceptLength;
|
||||
// we found a token separator
|
||||
// we found a token separator; strip the trailing separator we just matched from the token:
|
||||
int extra = sepUpto - lastAcceptLength;
|
||||
if (extra != 0) {
|
||||
pushBack(extra);
|
||||
}
|
||||
tokenUpto -= lastAcceptLength;
|
||||
if (tokenUpto > 0) {
|
||||
fillToken(offsetStart);
|
||||
return true;
|
||||
|
@ -187,14 +186,14 @@ public final class SimplePatternSplitTokenizer extends Tokenizer {
|
|||
tokenUpto -= count;
|
||||
assert tokenUpto >= 0;
|
||||
if (pendingLimit == 0) {
|
||||
if (bufferNextRead >= count) {
|
||||
if (bufferLimit != -1 && bufferNextRead >= count) {
|
||||
// optimize common case when the chars we are pushing back are still in the buffer
|
||||
bufferNextRead -= count;
|
||||
} else {
|
||||
if (count > pendingChars.length) {
|
||||
pendingChars = ArrayUtil.grow(pendingChars, count);
|
||||
}
|
||||
System.arraycopy(termAtt.buffer(), tokenUpto - count, pendingChars, 0, count);
|
||||
System.arraycopy(termAtt.buffer(), tokenUpto, pendingChars, 0, count);
|
||||
pendingLimit = count;
|
||||
}
|
||||
} else {
|
||||
|
|
|
@ -172,7 +172,7 @@ public final class SimplePatternTokenizer extends Tokenizer {
|
|||
private void pushBack(int count) {
|
||||
|
||||
if (pendingLimit == 0) {
|
||||
if (bufferNextRead >= count) {
|
||||
if (bufferLimit != -1 && bufferNextRead >= count) {
|
||||
// optimize common case when the chars we are pushing back are still in the buffer
|
||||
bufferNextRead -= count;
|
||||
} else {
|
||||
|
|
|
@ -270,4 +270,14 @@ public class TestSimplePatternSplitTokenizer extends BaseTokenStreamTestCase {
|
|||
checkRandomData(random(), b, 1000*RANDOM_MULTIPLIER);
|
||||
b.close();
|
||||
}
|
||||
|
||||
/**
 * Regression test for LUCENE-7465: the corner case where the tokenizer's
 * lookahead hits the end of the input.
 *
 * The split pattern "(ab)+" is run over the input "aba". The leading "ab" is
 * matched as a separator; the trailing "a" could begin another "ab", but the
 * input ends before that match can complete. The test expects that trailing
 * "a" to be emitted as the one and only token.
 */
public void testEndLookahead() throws Exception {
|
||||
// the pattern describes the separators to split on, not the tokens themselves
Tokenizer t = new SimplePatternSplitTokenizer("(ab)+");
|
||||
t.setReader(new StringReader("aba"));
|
||||
assertTokenStreamContents(t,
|
||||
// expected token text
new String[] { "a" },
|
||||
// NOTE(review): presumably the expected start offsets (consistent with "a" sitting at index 2 of "aba") - confirm against BaseTokenStreamTestCase
new int[] { 2 },
|
||||
// NOTE(review): presumably the expected end offsets - confirm against BaseTokenStreamTestCase
new int[] { 3 },
|
||||
// NOTE(review): presumably the expected final offset, which equals the input length here - confirm
3);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -215,4 +215,14 @@ public class TestSimplePatternTokenizer extends BaseTokenStreamTestCase {
|
|||
checkRandomData(random(), b, 1000*RANDOM_MULTIPLIER);
|
||||
b.close();
|
||||
}
|
||||
|
||||
/**
 * Regression test for LUCENE-7465: the corner case where the tokenizer's
 * lookahead hits the end of the input.
 *
 * The pattern "(ab)+" is run over the input "aba". After the leading "ab"
 * is matched, the trailing "a" could begin another repetition, but the input
 * ends before that repetition can complete. The test expects exactly one
 * token, "ab", with the dangling "a" not appearing in any token.
 */
public void testEndLookahead() throws Exception {
|
||||
Tokenizer t = new SimplePatternTokenizer("(ab)+");
|
||||
t.setReader(new StringReader("aba"));
|
||||
assertTokenStreamContents(t,
|
||||
// expected token text
new String[] { "ab" },
|
||||
// NOTE(review): presumably the expected start offsets (consistent with "ab" starting at index 0 of "aba") - confirm against BaseTokenStreamTestCase
new int[] { 0 },
|
||||
// NOTE(review): presumably the expected end offsets - confirm against BaseTokenStreamTestCase
new int[] { 2 },
|
||||
// NOTE(review): presumably the expected final offset, which equals the input length here - confirm
3);
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue