LUCENE-7465: fix corner case in SimplePattern/SplitTokenizer when lookahead hits end of input

2017-02-21 10:51:38 -05:00 · 2017-02-21 10:51:38 -05:00 · 2d03aa21a2
parent ac38872a79
commit 2d03aa21a2
4 changed files with 25 additions and 6 deletions
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizer.java
@ -135,13 +135,12 @@ public final class SimplePatternSplitTokenizer extends Tokenizer {
        } while (state != -1);
        
        if (lastAcceptLength != -1) {
-          // strip the trailing separater we just matched from the token:
-          tokenUpto -= lastAcceptLength;
-          // we found a token separator
+          // we found a token separator; strip the trailing separator we just matched from the token:
          int extra = sepUpto - lastAcceptLength;
          if (extra != 0) {
            pushBack(extra);
          }
+          tokenUpto -= lastAcceptLength;
          if (tokenUpto > 0) {
            fillToken(offsetStart);
            return true;
@ -187,14 +186,14 @@ public final class SimplePatternSplitTokenizer extends Tokenizer {
    tokenUpto -= count;
    assert tokenUpto >= 0;
    if (pendingLimit == 0) {
-      if (bufferNextRead >= count) {
+      if (bufferLimit != -1 && bufferNextRead >= count) {
        // optimize common case when the chars we are pushing back are still in the buffer
        bufferNextRead -= count;
      } else {
        if (count > pendingChars.length) {
          pendingChars = ArrayUtil.grow(pendingChars, count);
        }
-        System.arraycopy(termAtt.buffer(), tokenUpto - count, pendingChars, 0, count);
+        System.arraycopy(termAtt.buffer(), tokenUpto, pendingChars, 0, count);
        pendingLimit = count;
      }
    } else {
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternTokenizer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternTokenizer.java
@ -172,7 +172,7 @@ public final class SimplePatternTokenizer extends Tokenizer {
  private void pushBack(int count) {
    
    if (pendingLimit == 0) {
-      if (bufferNextRead >= count) {
+      if (bufferLimit != -1 && bufferNextRead >= count) {
        // optimize common case when the chars we are pushing back are still in the buffer
        bufferNextRead -= count;
      } else {
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternSplitTokenizer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternSplitTokenizer.java
@ -270,4 +270,14 @@ public class TestSimplePatternSplitTokenizer extends BaseTokenStreamTestCase {
    checkRandomData(random(), b, 1000*RANDOM_MULTIPLIER);
    b.close();
  }
+
+  public void testEndLookahead() throws Exception { // regression test for LUCENE-7465: lookahead hits end of input
+    Tokenizer t = new SimplePatternSplitTokenizer("(ab)+"); // "(ab)+" is the pattern handed to the split tokenizer
+    t.setReader(new StringReader("aba")); // 3-char input: "ab" followed by a lone "a" right before end of input
+    assertTokenStreamContents(t,
+        new String[] { "a" }, // the only expected token text is the trailing "a"
+        new int[] { 2 }, // presumably the expected start offsets -- confirm against BaseTokenStreamTestCase
+        new int[] { 3 }, // presumably the expected end offsets
+        3); // presumably the expected final offset; equals the input length here
+  }
 }
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternTokenizer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternTokenizer.java
@ -215,4 +215,14 @@ public class TestSimplePatternTokenizer extends BaseTokenStreamTestCase {
    checkRandomData(random(), b, 1000*RANDOM_MULTIPLIER);
    b.close();
  }
+
+  public void testEndLookahead() throws Exception { // regression test for LUCENE-7465: lookahead hits end of input
+    Tokenizer t = new SimplePatternTokenizer("(ab)+"); // "(ab)+" is the pattern handed to the tokenizer
+    t.setReader(new StringReader("aba")); // 3-char input: "ab" followed by a lone "a" right before end of input
+    assertTokenStreamContents(t,
+        new String[] { "ab" }, // the only expected token text is the leading "ab"; the lone "a" yields no token
+        new int[] { 0 }, // presumably the expected start offsets -- confirm against BaseTokenStreamTestCase
+        new int[] { 2 }, // presumably the expected end offsets
+        3); // presumably the expected final offset; equals the input length here
+  }
 }