LUCENE-9177: ICUNormalizer2CharFilter streaming no longer depends on presence of normalization-inert characters (#199)

Normalization-inert characters need not be required as boundaries for incremental processing. It is sufficient to check `hasBoundaryAfter` and `hasBoundaryBefore`, substantially improving worst-case performance.
2021-07-13 21:26:40 -04:00 · 2021-07-13 21:26:40 -04:00 · c3482c99ff
parent caa822ff38
commit c3482c99ff
3 changed files with 130 additions and 20 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -378,6 +378,10 @@ Improvements
 * LUCENE-9944: Allow DrillSideways users to provide their own CollectorManager without also requiring
  them to provide an ExecutorService. (Greg Miller)
 * LUCENE-9177: ICUNormalizer2CharFilter no longer requires normalization-inert
  characters as boundaries for incremental processing, vastly improving worst-case
  performance. (Michael Gibney)
 Optimizations
 ---------------------
 * LUCENE-9996: Improved memory efficiency of IndexWriter's RAM buffer, in
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java
+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java
@ -93,27 +93,14 @@ public final class ICUNormalizer2CharFilter extends BaseCharFilter {
  private final CharacterUtils.CharacterBuffer tmpBuffer;
  private void readInputToBuffer() throws IOException {
    while (true) {
    // CharacterUtils.fill is supplementary char aware
-      final boolean hasRemainingChars = CharacterUtils.fill(tmpBuffer, input);
+    if (!CharacterUtils.fill(tmpBuffer, input)) {
      inputFinished = true;
    }
    assert tmpBuffer.getOffset() == 0;
    inputBuffer.append(tmpBuffer.getBuffer(), 0, tmpBuffer.getLength());
      if (hasRemainingChars == false) {
        inputFinished = true;
        break;
      }
      final int lastCodePoint =
          Character.codePointBefore(tmpBuffer.getBuffer(), tmpBuffer.getLength(), 0);
      if (normalizer.isInert(lastCodePoint)) {
        // we require an inert char so that we can normalize content before and
        // after this character independently
        break;
      }
    }
    // if checkedInputBoundary was at the end of a buffer, we need to check that char again
    checkedInputBoundary = Math.max(checkedInputBoundary - 1, 0);
  }
@ -125,7 +112,6 @@ public final class ICUNormalizer2CharFilter extends BaseCharFilter {
    }
    if (!afterQuickCheckYes) {
      int resLen = readFromInputWhileSpanQuickCheckYes();
      afterQuickCheckYes = true;
      if (resLen > 0) return resLen;
    }
    int resLen = readFromIoNormalizeUptoBoundary();
@ -136,8 +122,30 @@ public final class ICUNormalizer2CharFilter extends BaseCharFilter {
  }
  private int readFromInputWhileSpanQuickCheckYes() {
    afterQuickCheckYes = true;
    int end = normalizer.spanQuickCheckYes(inputBuffer);
    if (end > 0) {
      int cp;
      if (end == inputBuffer.length()
          && !normalizer.hasBoundaryAfter(cp = inputBuffer.codePointBefore(end))) {
        /*
        our quickCheckYes result is valid thru the end of current buffer, but we need to back off
        because we're not at a normalization boundary. At a minimum this is relevant wrt imposing
        canonical ordering of combining characters across the buffer boundary.
         */
        afterQuickCheckYes = false;
        end -= Character.charCount(cp);
        // NOTE: for the loop, we pivot to using `hasBoundaryBefore()` because per the docs for
        // `Normalizer2.hasBoundaryAfter()`:
        // "Note that this operation may be significantly slower than hasBoundaryBefore()"
        while (end > 0 && !normalizer.hasBoundaryBefore(cp)) {
          cp = inputBuffer.codePointBefore(end);
          end -= Character.charCount(cp);
        }
        if (end == 0) {
          return 0;
        }
      }
      resultBuffer.append(inputBuffer.subSequence(0, end));
      inputBuffer.delete(0, end);
      checkedInputBoundary = Math.max(checkedInputBoundary - end, 0);
--- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java
+++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java
@ -94,6 +94,104 @@ public class TestICUNormalizer2CharFilter extends BaseTokenStreamTestCase {
        input.length());
  }
  /**
   * A wrapping reader that provides transparency wrt how much of the underlying stream has been
   * consumed. This may be used to validate incremental behavior in upstream reads.
   */
  private static class TransparentReader extends Reader {
    private final int expectSize;
    private final Reader backing;
    private int cursorPosition = -1;
    private boolean finished = false;
    private TransparentReader(Reader backing, int expectSize) {
      this.expectSize = expectSize;
      this.backing = backing;
    }
    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
      final int ret = backing.read(cbuf, off, len);
      if (cursorPosition == -1) {
        if (ret != -1) {
          cursorPosition = ret;
        } else {
          cursorPosition = 0;
        }
      } else if (ret == -1) {
        assert finished;
      } else {
        cursorPosition += ret;
      }
      if (!finished && cursorPosition >= expectSize) {
        finished = true;
      }
      return ret;
    }
    @Override
    public void close() throws IOException {
      assert finished && cursorPosition != -1;
      backing.close();
    }
  }
  public void testIncrementalNoInertChars() throws Exception {
    final String input = "℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa";
    final int inputLength = input.length();
    final String expectOutput = "°caa°caa°caa°caa°caa°caa°caa°caa°caa°caa°caa°caa°caa°caa";
    Normalizer2 norm = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
    // Sanity check/demonstrate that the input doesn't have any normalization-inert codepoints
    for (int i = inputLength - 1; i >= 0; i--) {
      assertFalse(norm.isInert(Character.codePointAt(input, i)));
    }
    final int outerBufferSize = random().nextInt(8) + 1; // buffer size between 1 <= n <= 8
    char[] tempBuff = new char[outerBufferSize];
    /*
    Sanity check that with the default inner buffer size, the upstream input is read in its
    entirety after the first external read.
     */
    TransparentReader tr = new TransparentReader(new StringReader(input), inputLength);
    CharFilter reader = new ICUNormalizer2CharFilter(tr, norm);
    assertTrue(reader.read(tempBuff) > 0);
    assertEquals(inputLength, tr.cursorPosition);
    assertTrue(tr.finished);
    /*
    By way of contrast with the "control" above, we now check the actual desired behavior, ensuring that
    even for input with no inert chars, we're able to read input incrementally. To support incremental
    upstream reads with reasonable-sized input, we artificially reduce the size of the internal buffer
     */
    final int innerBufferSize = random().nextInt(7) + 2; // buffer size between 2 <= n <= 8
    tr = new TransparentReader(new StringReader(input), inputLength);
    reader = new ICUNormalizer2CharFilter(tr, norm, innerBufferSize);
    StringBuilder result = new StringBuilder();
    int incrementalReads = 0;
    int finishedReads = 0;
    while (true) {
      int length = reader.read(tempBuff);
      if (length == -1) {
        assertTrue(tr.finished);
        break;
      }
      result.append(tempBuff, 0, length);
      if (length > 0) {
        if (!tr.finished) {
          incrementalReads++;
          assertTrue(tr.cursorPosition < inputLength);
        } else {
          finishedReads++;
        }
      }
    }
    assertTrue(incrementalReads > finishedReads);
    assertEquals(expectOutput, result.toString());
  }
  public void testMassiveLigature() throws IOException {
    String input = "\uFDFA";