mirror of https://github.com/apache/lucene.git
Fix repeating token sentence boundary bug (#11734)
Signed-off-by: lkotzaniewsk <lkotzaniewsk@bloomberg.net> Co-authored-by: Dawid Weiss <dawid.weiss@gmail.com>
This commit is contained in:
parent
5b24a233bd
commit
3a04aa44c2
|
@ -110,6 +110,12 @@ Bug Fixes
|
|||
trying to apply a dictionary whose size is greater than the maximum supported
|
||||
window size for LZ4. (Adrien Grand)
|
||||
|
||||
* GITHUB#11735: KeywordRepeatFilter + OpenNLPLemmatizer always drops last token of a stream.
|
||||
(Luke Kot-Zaniewski)
|
||||
|
||||
* GITHUB#11771: KeywordRepeatFilter + OpenNLPLemmatizer sometimes arbitrarily exits token stream.
|
||||
(Luke Kot-Zaniewski)
|
||||
|
||||
Other
|
||||
---------------------
|
||||
* LUCENE-10423: Remove usages of System.currentTimeMillis() from tests. (Marios Trivyzas)
|
||||
|
|
|
@ -24,7 +24,7 @@ import org.apache.lucene.analysis.TokenFilter;
|
|||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.opennlp.tools.NLPChunkerOp;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.SentenceAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.IgnoreRandomChains;
|
||||
|
@ -36,76 +36,65 @@ import org.apache.lucene.util.IgnoreRandomChains;
|
|||
*/
|
||||
@IgnoreRandomChains(reason = "other filters must precede this one (see docs)")
|
||||
public final class OpenNLPChunkerFilter extends TokenFilter {
|
||||
|
||||
private List<AttributeSource> sentenceTokenAttrs = new ArrayList<>();
|
||||
private int tokenNum = 0;
|
||||
private boolean moreTokensAvailable = true;
|
||||
private String[] sentenceTerms = null;
|
||||
private String[] sentenceTermPOSTags = null;
|
||||
|
||||
private final NLPChunkerOp chunkerOp;
|
||||
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
|
||||
private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final SentenceAttributeExtractor sentenceAttributeExtractor;
|
||||
|
||||
public OpenNLPChunkerFilter(TokenStream input, NLPChunkerOp chunkerOp) {
|
||||
super(input);
|
||||
this.chunkerOp = chunkerOp;
|
||||
sentenceAttributeExtractor =
|
||||
new SentenceAttributeExtractor(input, addAttribute(SentenceAttribute.class));
|
||||
}
|
||||
|
||||
@Override
|
||||
public final boolean incrementToken() throws IOException {
|
||||
if (!moreTokensAvailable) {
|
||||
clear();
|
||||
return false;
|
||||
}
|
||||
if (tokenNum == sentenceTokenAttrs.size()) {
|
||||
nextSentence();
|
||||
if (sentenceTerms == null) {
|
||||
clear();
|
||||
public boolean incrementToken() throws IOException {
|
||||
List<AttributeSource> sentenceTokenAttrs = sentenceAttributeExtractor.getSentenceAttributes();
|
||||
boolean isEndOfCurrentSentence = tokenNum >= sentenceTokenAttrs.size();
|
||||
if (isEndOfCurrentSentence) {
|
||||
boolean noSentencesLeft =
|
||||
sentenceAttributeExtractor.allSentencesProcessed() || nextSentence().isEmpty();
|
||||
if (noSentencesLeft) {
|
||||
return false;
|
||||
}
|
||||
assignTokenTypes(chunkerOp.getChunks(sentenceTerms, sentenceTermPOSTags, null));
|
||||
tokenNum = 0;
|
||||
}
|
||||
clearAttributes();
|
||||
sentenceTokenAttrs.get(tokenNum++).copyTo(this);
|
||||
return true;
|
||||
}
|
||||
|
||||
private void nextSentence() throws IOException {
|
||||
private List<AttributeSource> nextSentence() throws IOException {
|
||||
tokenNum = 0;
|
||||
List<String> termList = new ArrayList<>();
|
||||
List<String> posTagList = new ArrayList<>();
|
||||
sentenceTokenAttrs.clear();
|
||||
boolean endOfSentence = false;
|
||||
while (!endOfSentence && (moreTokensAvailable = input.incrementToken())) {
|
||||
termList.add(termAtt.toString());
|
||||
posTagList.add(typeAtt.type());
|
||||
endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
|
||||
sentenceTokenAttrs.add(input.cloneAttributes());
|
||||
for (AttributeSource attributeSource : sentenceAttributeExtractor.extractSentenceAttributes()) {
|
||||
termList.add(attributeSource.getAttribute(CharTermAttribute.class).toString());
|
||||
posTagList.add(attributeSource.getAttribute(TypeAttribute.class).type());
|
||||
}
|
||||
sentenceTerms = termList.size() > 0 ? termList.toArray(new String[termList.size()]) : null;
|
||||
sentenceTermPOSTags =
|
||||
posTagList.size() > 0 ? posTagList.toArray(new String[posTagList.size()]) : null;
|
||||
String[] sentenceTerms = termList.toArray(new String[0]);
|
||||
String[] sentenceTermPOSTags = posTagList.toArray(new String[0]);
|
||||
assignTokenTypes(chunkerOp.getChunks(sentenceTerms, sentenceTermPOSTags, null));
|
||||
return sentenceAttributeExtractor.getSentenceAttributes();
|
||||
}
|
||||
|
||||
private void assignTokenTypes(String[] tags) {
|
||||
for (int i = 0; i < tags.length; ++i) {
|
||||
sentenceTokenAttrs.get(i).getAttribute(TypeAttribute.class).setType(tags[i]);
|
||||
sentenceAttributeExtractor
|
||||
.getSentenceAttributes()
|
||||
.get(i)
|
||||
.getAttribute(TypeAttribute.class)
|
||||
.setType(tags[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
moreTokensAvailable = true;
|
||||
sentenceAttributeExtractor.reset();
|
||||
clear();
|
||||
}
|
||||
|
||||
private void clear() {
|
||||
sentenceTokenAttrs.clear();
|
||||
sentenceTerms = null;
|
||||
sentenceTermPOSTags = null;
|
||||
tokenNum = 0;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -24,10 +24,7 @@ import java.util.List;
|
|||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.opennlp.tools.NLPLemmatizerOp;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.*;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.IgnoreRandomChains;
|
||||
|
||||
|
@ -46,37 +43,28 @@ import org.apache.lucene.util.IgnoreRandomChains;
|
|||
public class OpenNLPLemmatizerFilter extends TokenFilter {
|
||||
private final NLPLemmatizerOp lemmatizerOp;
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
|
||||
private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
|
||||
private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
|
||||
private List<AttributeSource> sentenceTokenAttrs = new ArrayList<>();
|
||||
private Iterator<AttributeSource> sentenceTokenAttrsIter = null;
|
||||
private boolean moreTokensAvailable = true;
|
||||
private String[] sentenceTokens = null; // non-keyword tokens
|
||||
private String[] sentenceTokenTypes = null; // types for non-keyword tokens
|
||||
private String[] lemmas = null; // lemmas for non-keyword tokens
|
||||
private final SentenceAttributeExtractor sentenceAttributeExtractor;
|
||||
private String[] lemmas = new String[0]; // lemmas for non-keyword tokens
|
||||
private int lemmaNum = 0; // lemma counter
|
||||
|
||||
public OpenNLPLemmatizerFilter(TokenStream input, NLPLemmatizerOp lemmatizerOp) {
|
||||
super(input);
|
||||
this.lemmatizerOp = lemmatizerOp;
|
||||
sentenceAttributeExtractor =
|
||||
new SentenceAttributeExtractor(input, addAttribute(SentenceAttribute.class));
|
||||
}
|
||||
|
||||
@Override
|
||||
public final boolean incrementToken() throws IOException {
|
||||
if (!moreTokensAvailable) {
|
||||
clear();
|
||||
return false;
|
||||
}
|
||||
if (sentenceTokenAttrsIter == null || !sentenceTokenAttrsIter.hasNext()) {
|
||||
nextSentence();
|
||||
if (sentenceTokens == null) { // zero non-keyword tokens
|
||||
clear();
|
||||
boolean isEndOfCurrentSentence = lemmaNum >= lemmas.length;
|
||||
if (isEndOfCurrentSentence) {
|
||||
boolean noSentencesLeft =
|
||||
sentenceAttributeExtractor.allSentencesProcessed() || nextSentence().isEmpty();
|
||||
if (noSentencesLeft) {
|
||||
return false;
|
||||
}
|
||||
lemmas = lemmatizerOp.lemmatize(sentenceTokens, sentenceTokenTypes);
|
||||
lemmaNum = 0;
|
||||
sentenceTokenAttrsIter = sentenceTokenAttrs.iterator();
|
||||
}
|
||||
clearAttributes();
|
||||
sentenceTokenAttrsIter.next().copyTo(this);
|
||||
|
@ -86,36 +74,35 @@ public class OpenNLPLemmatizerFilter extends TokenFilter {
|
|||
return true;
|
||||
}
|
||||
|
||||
private void nextSentence() throws IOException {
|
||||
private List<AttributeSource> nextSentence() throws IOException {
|
||||
lemmaNum = 0;
|
||||
List<String> tokenList = new ArrayList<>();
|
||||
List<String> typeList = new ArrayList<>();
|
||||
sentenceTokenAttrs.clear();
|
||||
boolean endOfSentence = false;
|
||||
while (!endOfSentence && (moreTokensAvailable = input.incrementToken())) {
|
||||
if (!keywordAtt.isKeyword()) {
|
||||
tokenList.add(termAtt.toString());
|
||||
typeList.add(typeAtt.type());
|
||||
List<AttributeSource> sentenceAttributes =
|
||||
sentenceAttributeExtractor.extractSentenceAttributes();
|
||||
for (AttributeSource attributeSource : sentenceAttributes) {
|
||||
if (!attributeSource.getAttribute(KeywordAttribute.class).isKeyword()) {
|
||||
tokenList.add(attributeSource.getAttribute(CharTermAttribute.class).toString());
|
||||
typeList.add(attributeSource.getAttribute(TypeAttribute.class).type());
|
||||
}
|
||||
endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
|
||||
sentenceTokenAttrs.add(input.cloneAttributes());
|
||||
}
|
||||
sentenceTokens = tokenList.size() > 0 ? tokenList.toArray(new String[tokenList.size()]) : null;
|
||||
sentenceTokenTypes = typeList.size() > 0 ? typeList.toArray(new String[typeList.size()]) : null;
|
||||
String[] sentenceTokens = tokenList.toArray(new String[0]);
|
||||
String[] sentenceTokenTypes = typeList.toArray(new String[0]);
|
||||
lemmas = lemmatizerOp.lemmatize(sentenceTokens, sentenceTokenTypes);
|
||||
sentenceTokenAttrsIter = sentenceAttributes.iterator();
|
||||
return sentenceAttributeExtractor.getSentenceAttributes();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
moreTokensAvailable = true;
|
||||
sentenceAttributeExtractor.reset();
|
||||
clear();
|
||||
}
|
||||
|
||||
private void clear() {
|
||||
sentenceTokenAttrs.clear();
|
||||
sentenceTokenAttrsIter = null;
|
||||
sentenceTokens = null;
|
||||
sentenceTokenTypes = null;
|
||||
lemmas = null;
|
||||
lemmas = new String[0];
|
||||
lemmaNum = 0;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -24,7 +24,7 @@ import org.apache.lucene.analysis.TokenFilter;
|
|||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.opennlp.tools.NLPPOSTaggerOp;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.SentenceAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.IgnoreRandomChains;
|
||||
|
@ -33,65 +33,62 @@ import org.apache.lucene.util.IgnoreRandomChains;
|
|||
@IgnoreRandomChains(reason = "LUCENE-10352: add argument providers for this one")
|
||||
public final class OpenNLPPOSFilter extends TokenFilter {
|
||||
|
||||
private List<AttributeSource> sentenceTokenAttrs = new ArrayList<>();
|
||||
String[] tags = null;
|
||||
private int tokenNum = 0;
|
||||
private boolean moreTokensAvailable = true;
|
||||
|
||||
private final NLPPOSTaggerOp posTaggerOp;
|
||||
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
|
||||
private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final SentenceAttributeExtractor sentenceAttributeExtractor;
|
||||
|
||||
public OpenNLPPOSFilter(TokenStream input, NLPPOSTaggerOp posTaggerOp) {
|
||||
super(input);
|
||||
this.posTaggerOp = posTaggerOp;
|
||||
sentenceAttributeExtractor =
|
||||
new SentenceAttributeExtractor(input, addAttribute(SentenceAttribute.class));
|
||||
}
|
||||
|
||||
@Override
|
||||
public final boolean incrementToken() throws IOException {
|
||||
if (!moreTokensAvailable) {
|
||||
clear();
|
||||
return false;
|
||||
}
|
||||
if (tokenNum
|
||||
== sentenceTokenAttrs.size()) { // beginning of stream, or previous sentence exhausted
|
||||
String[] sentenceTokens = nextSentence();
|
||||
if (sentenceTokens == null) {
|
||||
clear();
|
||||
public boolean incrementToken() throws IOException {
|
||||
List<AttributeSource> sentenceTokenAttrs = sentenceAttributeExtractor.getSentenceAttributes();
|
||||
boolean isEndOfCurrentSentence = tokenNum >= sentenceTokenAttrs.size();
|
||||
if (isEndOfCurrentSentence) {
|
||||
boolean noSentencesLeft =
|
||||
sentenceAttributeExtractor.allSentencesProcessed() || nextSentence().isEmpty();
|
||||
if (noSentencesLeft) {
|
||||
return false;
|
||||
}
|
||||
tags = posTaggerOp.getPOSTags(sentenceTokens);
|
||||
tokenNum = 0;
|
||||
}
|
||||
clearAttributes();
|
||||
sentenceTokenAttrs.get(tokenNum).copyTo(this);
|
||||
typeAtt.setType(tags[tokenNum++]);
|
||||
sentenceTokenAttrs.get(tokenNum++).copyTo(this);
|
||||
return true;
|
||||
}
|
||||
|
||||
private String[] nextSentence() throws IOException {
|
||||
private List<AttributeSource> nextSentence() throws IOException {
|
||||
tokenNum = 0;
|
||||
List<String> termList = new ArrayList<>();
|
||||
sentenceTokenAttrs.clear();
|
||||
boolean endOfSentence = false;
|
||||
while (!endOfSentence && (moreTokensAvailable = input.incrementToken())) {
|
||||
termList.add(termAtt.toString());
|
||||
endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
|
||||
sentenceTokenAttrs.add(input.cloneAttributes());
|
||||
for (AttributeSource attributeSource : sentenceAttributeExtractor.extractSentenceAttributes()) {
|
||||
termList.add(attributeSource.getAttribute(CharTermAttribute.class).toString());
|
||||
}
|
||||
String[] sentenceTerms = termList.toArray(new String[0]);
|
||||
assignTokenTypes(posTaggerOp.getPOSTags(sentenceTerms));
|
||||
return sentenceAttributeExtractor.getSentenceAttributes();
|
||||
}
|
||||
|
||||
private void assignTokenTypes(String[] tags) {
|
||||
for (int i = 0; i < tags.length; ++i) {
|
||||
sentenceAttributeExtractor
|
||||
.getSentenceAttributes()
|
||||
.get(i)
|
||||
.getAttribute(TypeAttribute.class)
|
||||
.setType(tags[i]);
|
||||
}
|
||||
return termList.size() > 0 ? termList.toArray(new String[termList.size()]) : null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
moreTokensAvailable = true;
|
||||
sentenceAttributeExtractor.reset();
|
||||
clear();
|
||||
}
|
||||
|
||||
private void clear() {
|
||||
sentenceTokenAttrs.clear();
|
||||
tags = null;
|
||||
tokenNum = 0;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -22,28 +22,27 @@ import opennlp.tools.util.Span;
|
|||
import org.apache.lucene.analysis.opennlp.tools.NLPSentenceDetectorOp;
|
||||
import org.apache.lucene.analysis.opennlp.tools.NLPTokenizerOp;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.SentenceAttribute;
|
||||
import org.apache.lucene.analysis.util.SegmentingTokenizerBase;
|
||||
import org.apache.lucene.util.AttributeFactory;
|
||||
import org.apache.lucene.util.IgnoreRandomChains;
|
||||
|
||||
/**
|
||||
* Run OpenNLP SentenceDetector and Tokenizer. The last token in each sentence is marked by setting
|
||||
* the {@link #EOS_FLAG_BIT} in the FlagsAttribute; following filters can use this information to
|
||||
* apply operations to tokens one sentence at a time.
|
||||
* Run OpenNLP SentenceDetector and Tokenizer. The index of each sentence is stored in
|
||||
* SentenceAttribute.
|
||||
*/
|
||||
@IgnoreRandomChains(reason = "LUCENE-10352: add argument providers for this one")
|
||||
public final class OpenNLPTokenizer extends SegmentingTokenizerBase {
|
||||
public static int EOS_FLAG_BIT = 1;
|
||||
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
|
||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
private final SentenceAttribute sentenceAtt = addAttribute(SentenceAttribute.class);
|
||||
|
||||
private Span[] termSpans = null;
|
||||
private int termNum = 0;
|
||||
private int sentenceStart = 0;
|
||||
private int sentenceIndex = -1;
|
||||
|
||||
private NLPTokenizerOp tokenizerOp = null;
|
||||
|
||||
|
@ -71,6 +70,7 @@ public final class OpenNLPTokenizer extends SegmentingTokenizerBase {
|
|||
String sentenceText = new String(buffer, sentenceStart, sentenceEnd - sentenceStart);
|
||||
termSpans = tokenizerOp.getTerms(sentenceText);
|
||||
termNum = 0;
|
||||
sentenceIndex++;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -84,11 +84,7 @@ public final class OpenNLPTokenizer extends SegmentingTokenizerBase {
|
|||
offsetAtt.setOffset(
|
||||
correctOffset(offset + sentenceStart + term.getStart()),
|
||||
correctOffset(offset + sentenceStart + term.getEnd()));
|
||||
if (termNum == termSpans.length - 1) {
|
||||
flagsAtt.setFlags(
|
||||
flagsAtt.getFlags()
|
||||
| EOS_FLAG_BIT); // mark the last token in the sentence with EOS_FLAG_BIT
|
||||
}
|
||||
sentenceAtt.setSentenceIndex(sentenceIndex);
|
||||
++termNum;
|
||||
return true;
|
||||
}
|
||||
|
@ -98,5 +94,6 @@ public final class OpenNLPTokenizer extends SegmentingTokenizerBase {
|
|||
super.reset();
|
||||
termSpans = null;
|
||||
termNum = sentenceStart = 0;
|
||||
sentenceIndex = -1;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,81 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.opennlp;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.SentenceAttribute;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
|
||||
/**
|
||||
* Iterate through sentence tokens and cache their attributes. Could consider moving this to a more
|
||||
* central location to be used by other sentence-aware components.
|
||||
*
|
||||
* <p>May want to consider making this its own Filter so that extracted sentence token attributes
|
||||
* can be shared by downstream sentence-aware filters.
|
||||
*/
|
||||
public class SentenceAttributeExtractor {
|
||||
|
||||
private final TokenStream input;
|
||||
private final SentenceAttribute sentenceAtt;
|
||||
private final List<AttributeSource> sentenceTokenAttrs = new ArrayList<>();
|
||||
private AttributeSource prevAttributeSource;
|
||||
private int currSentence = 0;
|
||||
private boolean hasNextToken = true;
|
||||
|
||||
public SentenceAttributeExtractor(TokenStream input, SentenceAttribute sentenceAtt) {
|
||||
this.input = input;
|
||||
this.sentenceAtt = sentenceAtt;
|
||||
}
|
||||
|
||||
// If this class were a stand-alone filter it could conceivably extract the attributes once
|
||||
// and cache a reference to those attributes in SentenceAttribute. That way downstream filters
|
||||
// could read the full sentence without having to independently extract it.
|
||||
public List<AttributeSource> extractSentenceAttributes() throws IOException {
|
||||
sentenceTokenAttrs.clear();
|
||||
boolean hasNext;
|
||||
do {
|
||||
hasNextToken = input.incrementToken();
|
||||
int currSentenceTmp = sentenceAtt.getSentenceIndex();
|
||||
hasNext = (currSentence == currSentenceTmp && hasNextToken);
|
||||
currSentence = currSentenceTmp;
|
||||
if (prevAttributeSource != null) {
|
||||
sentenceTokenAttrs.add(prevAttributeSource);
|
||||
}
|
||||
prevAttributeSource = input.cloneAttributes();
|
||||
} while (hasNext);
|
||||
return sentenceTokenAttrs;
|
||||
}
|
||||
|
||||
public List<AttributeSource> getSentenceAttributes() {
|
||||
return sentenceTokenAttrs;
|
||||
}
|
||||
|
||||
public boolean allSentencesProcessed() {
|
||||
return !hasNextToken;
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
hasNextToken = true;
|
||||
sentenceTokenAttrs.clear();
|
||||
currSentence = 0;
|
||||
prevAttributeSource = null;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,23 @@
|
|||
Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
|
||||
Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
|
||||
Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
|
||||
Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
|
||||
Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
|
||||
Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
|
||||
Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
|
||||
Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
|
||||
Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
|
||||
x
|
||||
Quick brown fox jumped over the lazy dog.
|
||||
x
|
||||
Quick brown fox jumped over the lazy dog.
|
||||
x
|
||||
Quick brown fox jumped over the lazy dog.
|
||||
x
|
||||
Quick brown fox jumped over the lazy dog.
|
||||
x
|
||||
Quick brown fox jumped over the lazy dog.
|
||||
x
|
||||
This should hopefully get analyzed.
|
||||
x
|
||||
And so should this.
|
|
@ -0,0 +1,32 @@
|
|||
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
|
||||
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
|
||||
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
|
||||
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
|
||||
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
|
||||
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
|
||||
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
|
||||
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
|
||||
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
|
||||
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
|
||||
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
|
||||
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
|
||||
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
|
||||
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
|
||||
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
|
||||
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
|
||||
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
|
||||
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
|
||||
x x
|
||||
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
|
||||
x x
|
||||
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
|
||||
x x
|
||||
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
|
||||
x x
|
||||
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
|
||||
x x
|
||||
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
|
||||
x x
|
||||
This This should should hopefully hopefully get get analyzed analyzed . .
|
||||
x x
|
||||
And And so so should should this this . .
|
|
@ -114,4 +114,16 @@ public class TestOpenNLPChunkerFilterFactory extends BaseTokenStreamTestCase {
|
|||
true,
|
||||
toPayloads(SENTENCES_chunks));
|
||||
}
|
||||
|
||||
public void testEmptyField() throws Exception {
|
||||
CustomAnalyzer analyzer =
|
||||
CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
|
||||
.withTokenizer(
|
||||
"opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
|
||||
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
|
||||
.addTokenFilter("opennlpChunker", "chunkerModel", chunkerModelFile)
|
||||
.addTokenFilter(TypeAsPayloadTokenFilterFactory.class)
|
||||
.build();
|
||||
assertAnalyzesTo(analyzer, "", new String[0], null, null, null, null, null, true);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,6 +17,11 @@
|
|||
|
||||
package org.apache.lucene.analysis.opennlp;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Arrays;
|
||||
import java.util.stream.Collectors;
|
||||
import org.apache.lucene.analysis.custom.CustomAnalyzer;
|
||||
import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory;
|
||||
import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory;
|
||||
|
@ -108,6 +113,10 @@ public class TestOpenNLPLemmatizerFilterFactory extends BaseTokenStreamTestCase
|
|||
"IN", "IN", "JJ", "JJ", "NN", "VBN", "VBN", ".", "NNP", "NNP", "VBN", "NN", ",", "NN", "."
|
||||
};
|
||||
|
||||
private static final String NO_BREAK_SINGLE_TOKEN_REPEAT_KEYWORD = "period";
|
||||
|
||||
private static final String[] NO_BREAK_SINGLE_TOKEN_REPEAT_KEYWORD_terms = {"period", "period"};
|
||||
|
||||
private static final String tokenizerModelFile = "en-test-tokenizer.bin";
|
||||
private static final String sentenceModelFile = "en-test-sent.bin";
|
||||
private static final String posTaggerModelFile = "en-test-pos-maxent.bin";
|
||||
|
@ -290,4 +299,77 @@ public class TestOpenNLPLemmatizerFilterFactory extends BaseTokenStreamTestCase
|
|||
null,
|
||||
true);
|
||||
}
|
||||
|
||||
public void testNoBreakWithRepeatKeywordFilter() throws Exception {
|
||||
CustomAnalyzer analyzer =
|
||||
CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
|
||||
.withTokenizer(
|
||||
"opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
|
||||
.addTokenFilter("opennlpPOS", "posTaggerModel", "en-test-pos-maxent.bin")
|
||||
.addTokenFilter(KeywordRepeatFilterFactory.class)
|
||||
.addTokenFilter("opennlplemmatizer", "dictionary", "en-test-lemmas.dict")
|
||||
.build();
|
||||
assertAnalyzesTo(
|
||||
analyzer,
|
||||
NO_BREAK_SINGLE_TOKEN_REPEAT_KEYWORD,
|
||||
NO_BREAK_SINGLE_TOKEN_REPEAT_KEYWORD_terms,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
true);
|
||||
}
|
||||
|
||||
// checks for bug described in https://github.com/apache/lucene/issues/11771
|
||||
public void testPreventEarlyExit() throws IOException {
|
||||
InputStream earlyExitInput = null;
|
||||
InputStream earlyExitOutput = null;
|
||||
try {
|
||||
ClasspathResourceLoader loader = new ClasspathResourceLoader(getClass());
|
||||
earlyExitInput = loader.openResource("data/early-exit-bug-input.txt");
|
||||
String earlyExitInputText = new String(earlyExitInput.readAllBytes(), StandardCharsets.UTF_8);
|
||||
earlyExitOutput = loader.openResource("data/early-exit-bug-output.txt");
|
||||
String earlyExitOutputText =
|
||||
new String(earlyExitOutput.readAllBytes(), StandardCharsets.UTF_8);
|
||||
String[] earlyExitOutputTexts =
|
||||
Arrays.stream(earlyExitOutputText.split("\\s"))
|
||||
.filter(text -> text != "")
|
||||
.collect(Collectors.joining(" "))
|
||||
.split(" ");
|
||||
CustomAnalyzer analyzer =
|
||||
CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
|
||||
.withTokenizer(
|
||||
"opennlp",
|
||||
"tokenizerModel",
|
||||
tokenizerModelFile,
|
||||
"sentenceModel",
|
||||
sentenceModelFile)
|
||||
.addTokenFilter("opennlpPOS", "posTaggerModel", "en-test-pos-maxent.bin")
|
||||
.addTokenFilter(KeywordRepeatFilterFactory.class)
|
||||
.addTokenFilter("opennlplemmatizer", "dictionary", "en-test-lemmas.dict")
|
||||
.build();
|
||||
assertAnalyzesTo(
|
||||
analyzer, earlyExitInputText, earlyExitOutputTexts, null, null, null, null, null, true);
|
||||
} finally {
|
||||
if (earlyExitInput != null) {
|
||||
earlyExitInput.close();
|
||||
}
|
||||
if (earlyExitOutput != null) {
|
||||
earlyExitOutput.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void testEmptyField() throws Exception {
|
||||
CustomAnalyzer analyzer =
|
||||
CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
|
||||
.withTokenizer(
|
||||
"opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
|
||||
.addTokenFilter("opennlpPOS", "posTaggerModel", "en-test-pos-maxent.bin")
|
||||
.addTokenFilter(KeywordRepeatFilterFactory.class)
|
||||
.addTokenFilter("opennlplemmatizer", "dictionary", "en-test-lemmas.dict")
|
||||
.build();
|
||||
assertAnalyzesTo(analyzer, "", new String[0], null, null, null, null, null, true);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,6 +21,7 @@ import java.io.IOException;
|
|||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Arrays;
|
||||
import org.apache.lucene.analysis.custom.CustomAnalyzer;
|
||||
import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory;
|
||||
import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilterFactory;
|
||||
import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.util.ClasspathResourceLoader;
|
||||
|
@ -66,6 +67,7 @@ public class TestOpenNLPPOSFilterFactory extends BaseTokenStreamTestCase {
|
|||
private static final String[] NO_BREAK_terms = {"No", "period"};
|
||||
private static final int[] NO_BREAK_startOffsets = {0, 3};
|
||||
private static final int[] NO_BREAK_endOffsets = {2, 9};
|
||||
private static final String[] NO_BREAK_KEYWORD_REPEAT_terms = {"No", "No", "period", "period"};
|
||||
|
||||
private static final String sentenceModelFile = "en-test-sent.bin";
|
||||
private static final String tokenizerModelFile = "en-test-tokenizer.bin";
|
||||
|
@ -144,4 +146,26 @@ public class TestOpenNLPPOSFilterFactory extends BaseTokenStreamTestCase {
|
|||
null,
|
||||
true);
|
||||
}
|
||||
|
||||
public void testNoBreakWithRepeatKeywordFilter() throws Exception {
|
||||
CustomAnalyzer analyzer =
|
||||
CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
|
||||
.withTokenizer(
|
||||
"opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
|
||||
.addTokenFilter(KeywordRepeatFilterFactory.class)
|
||||
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
|
||||
.build();
|
||||
assertAnalyzesTo(
|
||||
analyzer, NO_BREAK, NO_BREAK_KEYWORD_REPEAT_terms, null, null, null, null, null, true);
|
||||
}
|
||||
|
||||
public void testEmptyField() throws Exception {
|
||||
CustomAnalyzer analyzer =
|
||||
CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
|
||||
.withTokenizer(
|
||||
"opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
|
||||
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
|
||||
.build();
|
||||
assertAnalyzesTo(analyzer, "", new String[0], null, null, null, null, null, true);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,42 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.tokenattributes;
|
||||
|
||||
import org.apache.lucene.util.Attribute;
|
||||
|
||||
/**
 * This attribute tracks what sentence a given token belongs to as well as potentially other
 * sentence specific attributes.
 */
public interface SentenceAttribute extends Attribute {

  /**
   * Get the sentence index for the current token
   *
   * @return The index of the sentence
   * @see #setSentenceIndex(int)
   */
  int getSentenceIndex();

  /**
   * Set the sentence of the current token
   *
   * @param sentenceIndex the index of the sentence the current token belongs to
   * @see #getSentenceIndex()
   */
  void setSentenceIndex(int sentenceIndex);
}
|
|
@ -0,0 +1,80 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.tokenattributes;
|
||||
|
||||
import org.apache.lucene.util.AttributeImpl;
|
||||
import org.apache.lucene.util.AttributeReflector;
|
||||
|
||||
/**
|
||||
* Default implementation of {@link SentenceAttribute}.
|
||||
*
|
||||
* <p>The current implementation is coincidentally identical to {@link FlagsAttributeImpl} It was
|
||||
* decided to keep it separate because this attribute will NOT be an implied bitmap. Also, this
|
||||
* class may hold other sentence specific data in the future.
|
||||
*/
|
||||
public class SentenceAttributeImpl extends AttributeImpl implements SentenceAttribute {
|
||||
|
||||
private int index = 0;
|
||||
|
||||
/** Initialize this attribute to default */
|
||||
public SentenceAttributeImpl() {}
|
||||
|
||||
@Override
|
||||
public void clear() {
|
||||
index = 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object other) {
|
||||
if (this == other) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (other instanceof SentenceAttributeImpl) {
|
||||
return ((SentenceAttributeImpl) other).index == index;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return index;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void copyTo(AttributeImpl target) {
|
||||
SentenceAttribute t = (SentenceAttribute) target;
|
||||
t.setSentenceIndex(index);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reflectWith(AttributeReflector reflector) {
|
||||
reflector.reflect(SentenceAttribute.class, "sentences", index);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getSentenceIndex() {
|
||||
return index;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setSentenceIndex(int sentence) {
|
||||
this.index = sentence;
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue