LUCENE-9177: ICUNormalizer2CharFilter streaming no longer depends on presence of normalization-inert characters (#199)

Normalization-inert characters need not be required as boundaries for incremental processing. It is sufficient to check `hasBoundaryAfter` and `hasBoundaryBefore`, substantially improving worst-case performance.
2025-02-28 05:19:17 +00:00 · 2021-07-13 21:26:40 -04:00 · 2021-07-13 21:26:40 -04:00 · c3482c99ff
commit c3482c99ff
parent caa822ff38
3 changed files with 130 additions and 20 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -378,6 +378,10 @@ Improvements
 * LUCENE-9944: Allow DrillSideways users to provide their own CollectorManager without also requiring
  them to provide an ExecutorService. (Greg Miller)

+* LUCENE-9177: ICUNormalizer2CharFilter no longer requires normalization-inert
+  characters as boundaries for incremental processing, vastly improving worst-case
+  performance. (Michael Gibney)
+
 Optimizations
 ---------------------
 * LUCENE-9996: Improved memory efficiency of IndexWriter's RAM buffer, in
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java
+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java
@ -93,27 +93,14 @@ public final class ICUNormalizer2CharFilter extends BaseCharFilter {
  private final CharacterUtils.CharacterBuffer tmpBuffer;

  private void readInputToBuffer() throws IOException {
-    while (true) {
-      // CharacterUtils.fill is supplementary char aware
-      final boolean hasRemainingChars = CharacterUtils.fill(tmpBuffer, input);
-
-      assert tmpBuffer.getOffset() == 0;
-      inputBuffer.append(tmpBuffer.getBuffer(), 0, tmpBuffer.getLength());
-
-      if (hasRemainingChars == false) {
-        inputFinished = true;
-        break;
-      }
-
-      final int lastCodePoint =
-          Character.codePointBefore(tmpBuffer.getBuffer(), tmpBuffer.getLength(), 0);
-      if (normalizer.isInert(lastCodePoint)) {
-        // we require an inert char so that we can normalize content before and
-        // after this character independently
-        break;
-      }
+    // CharacterUtils.fill is supplementary char aware
+    if (!CharacterUtils.fill(tmpBuffer, input)) {
+      inputFinished = true;
    }

+    assert tmpBuffer.getOffset() == 0;
+    inputBuffer.append(tmpBuffer.getBuffer(), 0, tmpBuffer.getLength());
+
    // if checkedInputBoundary was at the end of a buffer, we need to check that char again
    checkedInputBoundary = Math.max(checkedInputBoundary - 1, 0);
  }
@ -125,7 +112,6 @@ public final class ICUNormalizer2CharFilter extends BaseCharFilter {
    }
    if (!afterQuickCheckYes) {
      int resLen = readFromInputWhileSpanQuickCheckYes();
-      afterQuickCheckYes = true;
      if (resLen > 0) return resLen;
    }
    int resLen = readFromIoNormalizeUptoBoundary();
@ -136,8 +122,30 @@ public final class ICUNormalizer2CharFilter extends BaseCharFilter {
  }

  private int readFromInputWhileSpanQuickCheckYes() {
+    afterQuickCheckYes = true;
    int end = normalizer.spanQuickCheckYes(inputBuffer);
    if (end > 0) {
+      int cp;
+      if (end == inputBuffer.length()
+          && !normalizer.hasBoundaryAfter(cp = inputBuffer.codePointBefore(end))) {
+        /*
+        our quickCheckYes result is valid thru the end of current buffer, but we need to back off
+        because we're not at a normalization boundary. At a minimum this is relevant wrt imposing
+        canonical ordering of combining characters across the buffer boundary.
+         */
+        afterQuickCheckYes = false;
+        end -= Character.charCount(cp);
+        // NOTE: for the loop, we pivot to using `hasBoundaryBefore()` because per the docs for
+        // `Normalizer2.hasBoundaryAfter()`:
+        // "Note that this operation may be significantly slower than hasBoundaryBefore()"
+        while (end > 0 && !normalizer.hasBoundaryBefore(cp)) {
+          cp = inputBuffer.codePointBefore(end);
+          end -= Character.charCount(cp);
+        }
+        if (end == 0) {
+          return 0;
+        }
+      }
      resultBuffer.append(inputBuffer.subSequence(0, end));
      inputBuffer.delete(0, end);
      checkedInputBoundary = Math.max(checkedInputBoundary - end, 0);
--- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java
+++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java
@ -94,6 +94,104 @@ public class TestICUNormalizer2CharFilter extends BaseTokenStreamTestCase {
        input.length());
  }

+  /**
+   * A wrapping reader that provides transparency wrt how much of the underlying stream has been
+   * consumed. This may be used to validate incremental behavior in upstream reads.
+   */
+  private static class TransparentReader extends Reader {
+
+    private final int expectSize;
+    private final Reader backing;
+    private int cursorPosition = -1;
+    private boolean finished = false;
+
+    private TransparentReader(Reader backing, int expectSize) {
+      this.expectSize = expectSize;
+      this.backing = backing;
+    }
+
+    @Override
+    public int read(char[] cbuf, int off, int len) throws IOException {
+      final int ret = backing.read(cbuf, off, len);
+      if (cursorPosition == -1) {
+        if (ret != -1) {
+          cursorPosition = ret;
+        } else {
+          cursorPosition = 0;
+        }
+      } else if (ret == -1) {
+        assert finished;
+      } else {
+        cursorPosition += ret;
+      }
+      if (!finished && cursorPosition >= expectSize) {
+        finished = true;
+      }
+      return ret;
+    }
+
+    @Override
+    public void close() throws IOException {
+      assert finished && cursorPosition != -1;
+      backing.close();
+    }
+  }
+
+  public void testIncrementalNoInertChars() throws Exception {
+    final String input = "℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa";
+    final int inputLength = input.length();
+    final String expectOutput = "°caa°caa°caa°caa°caa°caa°caa°caa°caa°caa°caa°caa°caa°caa";
+    Normalizer2 norm = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
+
+    // Sanity check/demonstrate that the input doesn't have any normalization-inert codepoints
+    for (int i = inputLength - 1; i >= 0; i--) {
+      assertFalse(norm.isInert(Character.codePointAt(input, i)));
+    }
+
+    final int outerBufferSize = random().nextInt(8) + 1; // buffer size between 1 <= n <= 8
+    char[] tempBuff = new char[outerBufferSize];
+
+    /*
+    Sanity check that with the default inner buffer size, the upstream input is read in its
+    entirety after the first external read.
+     */
+    TransparentReader tr = new TransparentReader(new StringReader(input), inputLength);
+    CharFilter reader = new ICUNormalizer2CharFilter(tr, norm);
+    assertTrue(reader.read(tempBuff) > 0);
+    assertEquals(inputLength, tr.cursorPosition);
+    assertTrue(tr.finished);
+
+    /*
+    By way of contrast with the "control" above, we now check the actual desired behavior, ensuring that
+    even for input with no inert chars, we're able to read input incrementally. To support incremental
+    upstream reads with reasonable-sized input, we artificially reduce the size of the internal buffer
+     */
+    final int innerBufferSize = random().nextInt(7) + 2; // buffer size between 2 <= n <= 8
+    tr = new TransparentReader(new StringReader(input), inputLength);
+    reader = new ICUNormalizer2CharFilter(tr, norm, innerBufferSize);
+    StringBuilder result = new StringBuilder();
+    int incrementalReads = 0;
+    int finishedReads = 0;
+    while (true) {
+      int length = reader.read(tempBuff);
+      if (length == -1) {
+        assertTrue(tr.finished);
+        break;
+      }
+      result.append(tempBuff, 0, length);
+      if (length > 0) {
+        if (!tr.finished) {
+          incrementalReads++;
+          assertTrue(tr.cursorPosition < inputLength);
+        } else {
+          finishedReads++;
+        }
+      }
+    }
+    assertTrue(incrementalReads > finishedReads);
+    assertEquals(expectOutput, result.toString());
+  }
+
  public void testMassiveLigature() throws IOException {
    String input = "\uFDFA";