diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 59f5a722a1d..41fe5f022c7 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -378,6 +378,10 @@ Improvements * LUCENE-9944: Allow DrillSideways users to provide their own CollectorManager without also requiring them to provide an ExecutorService. (Greg Miller) +* LUCENE-9177: ICUNormalizer2CharFilter no longer requires normalization-inert + characters as boundaries for incremental processing, vastly improving worst-case + performance. (Michael Gibney) + Optimizations --------------------- * LUCENE-9996: Improved memory efficiency of IndexWriter's RAM buffer, in diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java index 2992a99faca..dfb4989aa9d 100644 --- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java +++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java @@ -93,27 +93,14 @@ public final class ICUNormalizer2CharFilter extends BaseCharFilter { private final CharacterUtils.CharacterBuffer tmpBuffer; private void readInputToBuffer() throws IOException { - while (true) { - // CharacterUtils.fill is supplementary char aware - final boolean hasRemainingChars = CharacterUtils.fill(tmpBuffer, input); - - assert tmpBuffer.getOffset() == 0; - inputBuffer.append(tmpBuffer.getBuffer(), 0, tmpBuffer.getLength()); - - if (hasRemainingChars == false) { - inputFinished = true; - break; - } - - final int lastCodePoint = - Character.codePointBefore(tmpBuffer.getBuffer(), tmpBuffer.getLength(), 0); - if (normalizer.isInert(lastCodePoint)) { - // we require an inert char so that we can normalize content before and - // after this character independently - break; - } + // CharacterUtils.fill is supplementary char aware + if (!CharacterUtils.fill(tmpBuffer, input)) { + inputFinished = true; } + 
assert tmpBuffer.getOffset() == 0; + inputBuffer.append(tmpBuffer.getBuffer(), 0, tmpBuffer.getLength()); + // if checkedInputBoundary was at the end of a buffer, we need to check that char again checkedInputBoundary = Math.max(checkedInputBoundary - 1, 0); } @@ -125,7 +112,6 @@ public final class ICUNormalizer2CharFilter extends BaseCharFilter { } if (!afterQuickCheckYes) { int resLen = readFromInputWhileSpanQuickCheckYes(); - afterQuickCheckYes = true; if (resLen > 0) return resLen; } int resLen = readFromIoNormalizeUptoBoundary(); @@ -136,8 +122,30 @@ public final class ICUNormalizer2CharFilter extends BaseCharFilter { } private int readFromInputWhileSpanQuickCheckYes() { + afterQuickCheckYes = true; int end = normalizer.spanQuickCheckYes(inputBuffer); if (end > 0) { + int cp; + if (end == inputBuffer.length() + && !normalizer.hasBoundaryAfter(cp = inputBuffer.codePointBefore(end))) { + /* + our quickCheckYes result is valid through the end of the current buffer, but we need to back off + because we're not at a normalization boundary. At a minimum this is relevant with respect to imposing + canonical ordering of combining characters across the buffer boundary. 
+ */ + afterQuickCheckYes = false; + end -= Character.charCount(cp); + // NOTE: for the loop, we pivot to using `hasBoundaryBefore()` because per the docs for + // `Normalizer2.hasBoundaryAfter()`: + // "Note that this operation may be significantly slower than hasBoundaryBefore()" + while (end > 0 && !normalizer.hasBoundaryBefore(cp)) { + cp = inputBuffer.codePointBefore(end); + end -= Character.charCount(cp); + } + if (end == 0) { + return 0; + } + } resultBuffer.append(inputBuffer.subSequence(0, end)); inputBuffer.delete(0, end); checkedInputBoundary = Math.max(checkedInputBoundary - end, 0); diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java index bb1240cfd87..79b1dfe68f6 100644 --- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java +++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java @@ -94,6 +94,104 @@ public class TestICUNormalizer2CharFilter extends BaseTokenStreamTestCase { input.length()); } + /** + * A wrapping reader that provides transparency wrt how much of the underlying stream has been + * consumed. This may be used to validate incremental behavior in upstream reads. 
+ */ + private static class TransparentReader extends Reader { + + private final int expectSize; + private final Reader backing; + private int cursorPosition = -1; + private boolean finished = false; + + private TransparentReader(Reader backing, int expectSize) { + this.expectSize = expectSize; + this.backing = backing; + } + + @Override + public int read(char[] cbuf, int off, int len) throws IOException { + final int ret = backing.read(cbuf, off, len); + if (cursorPosition == -1) { + if (ret != -1) { + cursorPosition = ret; + } else { + cursorPosition = 0; + } + } else if (ret == -1) { + assert finished; + } else { + cursorPosition += ret; + } + if (!finished && cursorPosition >= expectSize) { + finished = true; + } + return ret; + } + + @Override + public void close() throws IOException { + assert finished && cursorPosition != -1; + backing.close(); + } + } + + public void testIncrementalNoInertChars() throws Exception { + final String input = "℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa"; + final int inputLength = input.length(); + final String expectOutput = "°caa°caa°caa°caa°caa°caa°caa°caa°caa°caa°caa°caa°caa°caa"; + Normalizer2 norm = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE); + + // Sanity check/demonstrate that the input doesn't have any normalization-inert codepoints + for (int i = inputLength - 1; i >= 0; i--) { + assertFalse(norm.isInert(Character.codePointAt(input, i))); + } + + final int outerBufferSize = random().nextInt(8) + 1; // buffer size between 1 <= n <= 8 + char[] tempBuff = new char[outerBufferSize]; + + /* + Sanity check that with the default inner buffer size, the upstream input is read in its + entirety after the first external read. 
+ */ + TransparentReader tr = new TransparentReader(new StringReader(input), inputLength); + CharFilter reader = new ICUNormalizer2CharFilter(tr, norm); + assertTrue(reader.read(tempBuff) > 0); + assertEquals(inputLength, tr.cursorPosition); + assertTrue(tr.finished); + + /* + By way of contrast with the "control" above, we now check the actual desired behavior, ensuring that + even for input with no inert chars, we're able to read input incrementally. To support incremental + upstream reads with reasonable-sized input, we artificially reduce the size of the internal buffer + */ + final int innerBufferSize = random().nextInt(7) + 2; // buffer size between 2 <= n <= 8 + tr = new TransparentReader(new StringReader(input), inputLength); + reader = new ICUNormalizer2CharFilter(tr, norm, innerBufferSize); + StringBuilder result = new StringBuilder(); + int incrementalReads = 0; + int finishedReads = 0; + while (true) { + int length = reader.read(tempBuff); + if (length == -1) { + assertTrue(tr.finished); + break; + } + result.append(tempBuff, 0, length); + if (length > 0) { + if (!tr.finished) { + incrementalReads++; + assertTrue(tr.cursorPosition < inputLength); + } else { + finishedReads++; + } + } + } + assertTrue(incrementalReads > finishedReads); + assertEquals(expectOutput, result.toString()); + } + public void testMassiveLigature() throws IOException { String input = "\uFDFA";