LUCENE-9177: ICUNormalizer2CharFilter streaming no longer depends on presence of normalization-inert characters (#199)

Normalization-inert characters need not be required as boundaries
for incremental processing. It is sufficient to check `hasBoundaryAfter`
and `hasBoundaryBefore`, substantially improving worst-case performance.
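
The distinction at work here can be sketched directly against ICU4J's Normalizer2 API (a minimal illustration, not part of this commit; it uses the same nfkc_cf instance as the new test below, and the expected outputs in the comments are our reading of the API, not something the patch asserts):

    import com.ibm.icu.text.Normalizer2;

    public class BoundaryCheckDemo {
      public static void main(String[] args) {
        Normalizer2 norm = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
        int cp = 'a';
        // 'a' is not normalization-inert under nfkc_cf: it may compose with a
        // following combining mark (e.g. a + U+0301 -> á), so the old inert-char
        // test would never find a safe split point in text like "aaa...".
        System.out.println(norm.isInert(cp)); // false
        // ...but there is a normalization boundary immediately *before* 'a',
        // so a buffer can still be split at this position.
        System.out.println(norm.hasBoundaryBefore(cp)); // true
        System.out.println(norm.hasBoundaryAfter(cp)); // false: no safe split after 'a'
      }
    }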
Michael Gibney 2021-07-13 21:26:40 -04:00 committed by GitHub
parent caa822ff38
commit c3482c99ff
3 changed files with 130 additions and 20 deletions

lucene/CHANGES.txt

@@ -378,6 +378,10 @@ Improvements
 * LUCENE-9944: Allow DrillSideways users to provide their own CollectorManager without also requiring
   them to provide an ExecutorService. (Greg Miller)
 
+* LUCENE-9177: ICUNormalizer2CharFilter no longer requires normalization-inert
+  characters as boundaries for incremental processing, vastly improving worst-case
+  performance. (Michael Gibney)
+
 Optimizations
 ---------------------
 * LUCENE-9996: Improved memory efficiency of IndexWriter's RAM buffer, in

lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java

@@ -93,27 +93,14 @@ public final class ICUNormalizer2CharFilter extends BaseCharFilter {
   private final CharacterUtils.CharacterBuffer tmpBuffer;
 
   private void readInputToBuffer() throws IOException {
-    while (true) {
-      // CharacterUtils.fill is supplementary char aware
-      final boolean hasRemainingChars = CharacterUtils.fill(tmpBuffer, input);
+    // CharacterUtils.fill is supplementary char aware
+    if (!CharacterUtils.fill(tmpBuffer, input)) {
+      inputFinished = true;
+    }
 
-      assert tmpBuffer.getOffset() == 0;
-      inputBuffer.append(tmpBuffer.getBuffer(), 0, tmpBuffer.getLength());
-
-      if (hasRemainingChars == false) {
-        inputFinished = true;
-        break;
-      }
-
-      final int lastCodePoint =
-          Character.codePointBefore(tmpBuffer.getBuffer(), tmpBuffer.getLength(), 0);
-      if (normalizer.isInert(lastCodePoint)) {
-        // we require an inert char so that we can normalize content before and
-        // after this character independently
-        break;
-      }
-    }
+    assert tmpBuffer.getOffset() == 0;
+    inputBuffer.append(tmpBuffer.getBuffer(), 0, tmpBuffer.getLength());
 
     // if checkedInputBoundary was at the end of a buffer, we need to check that char again
     checkedInputBoundary = Math.max(checkedInputBoundary - 1, 0);
   }
@@ -125,7 +112,6 @@ public final class ICUNormalizer2CharFilter extends BaseCharFilter {
     }
     if (!afterQuickCheckYes) {
       int resLen = readFromInputWhileSpanQuickCheckYes();
-      afterQuickCheckYes = true;
       if (resLen > 0) return resLen;
     }
     int resLen = readFromIoNormalizeUptoBoundary();
@@ -136,8 +122,30 @@ public final class ICUNormalizer2CharFilter extends BaseCharFilter {
   }
 
   private int readFromInputWhileSpanQuickCheckYes() {
+    afterQuickCheckYes = true;
     int end = normalizer.spanQuickCheckYes(inputBuffer);
     if (end > 0) {
+      int cp;
+      if (end == inputBuffer.length()
+          && !normalizer.hasBoundaryAfter(cp = inputBuffer.codePointBefore(end))) {
+        /*
+        our quickCheckYes result is valid thru the end of current buffer, but we need to back off
+        because we're not at a normalization boundary. At a minimum this is relevant wrt imposing
+        canonical ordering of combining characters across the buffer boundary.
+        */
+        afterQuickCheckYes = false;
+        end -= Character.charCount(cp);
+        // NOTE: for the loop, we pivot to using `hasBoundaryBefore()` because per the docs for
+        // `Normalizer2.hasBoundaryAfter()`:
+        // "Note that this operation may be significantly slower than hasBoundaryBefore()"
+        while (end > 0 && !normalizer.hasBoundaryBefore(cp)) {
+          cp = inputBuffer.codePointBefore(end);
+          end -= Character.charCount(cp);
+        }
+        if (end == 0) {
+          return 0;
+        }
+      }
       resultBuffer.append(inputBuffer.subSequence(0, end));
       inputBuffer.delete(0, end);
       checkedInputBoundary = Math.max(checkedInputBoundary - end, 0);
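
To make the canonical-ordering concern in the comment above concrete, here is a standalone sketch (not part of this commit; assumes ICU4J on the classpath) using the classic dot-above/dot-below example from UAX #15:

    import com.ibm.icu.text.Normalizer2;

    public class SplitOrderingDemo {
      public static void main(String[] args) {
        Normalizer2 nfc = Normalizer2.getNFCInstance();
        // COMBINING DOT ABOVE (U+0307, ccc=230) written before COMBINING DOT BELOW
        // (U+0323, ccc=220) must be reordered under canonical ordering.
        String whole = nfc.normalize("q\u0307\u0323"); // -> "q\u0323\u0307"
        // A naive buffer split between the two marks normalizes each half
        // independently; neither half changes on its own, so the reordering is lost:
        String split = nfc.normalize("q\u0307") + nfc.normalize("\u0323");
        System.out.println(whole.equals(split)); // false
      }
    }

This is exactly the failure mode the back-off guards against: spanQuickCheckYes can run to the end of the buffer even when the final code point is not followed by a normalization boundary.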

lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java

@@ -94,6 +94,104 @@ public class TestICUNormalizer2CharFilter extends BaseTokenStreamTestCase {
         input.length());
   }
 
+  /**
+   * A wrapping reader that provides transparency wrt how much of the underlying stream has been
+   * consumed. This may be used to validate incremental behavior in upstream reads.
+   */
+  private static class TransparentReader extends Reader {
+    private final int expectSize;
+    private final Reader backing;
+    private int cursorPosition = -1;
+    private boolean finished = false;
+
+    private TransparentReader(Reader backing, int expectSize) {
+      this.expectSize = expectSize;
+      this.backing = backing;
+    }
+
+    @Override
+    public int read(char[] cbuf, int off, int len) throws IOException {
+      final int ret = backing.read(cbuf, off, len);
+      if (cursorPosition == -1) {
+        if (ret != -1) {
+          cursorPosition = ret;
+        } else {
+          cursorPosition = 0;
+        }
+      } else if (ret == -1) {
+        assert finished;
+      } else {
+        cursorPosition += ret;
+      }
+
+      if (!finished && cursorPosition >= expectSize) {
+        finished = true;
+      }
+      return ret;
+    }
+
+    @Override
+    public void close() throws IOException {
+      assert finished && cursorPosition != -1;
+      backing.close();
+    }
+  }
+
+  public void testIncrementalNoInertChars() throws Exception {
+    final String input = "℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa";
+    final int inputLength = input.length();
+    final String expectOutput = "°caa°caa°caa°caa°caa°caa°caa°caa°caa°caa°caa°caa°caa°caa";
+    Normalizer2 norm = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
+
+    // Sanity check/demonstrate that the input doesn't have any normalization-inert codepoints
+    for (int i = inputLength - 1; i >= 0; i--) {
+      assertFalse(norm.isInert(Character.codePointAt(input, i)));
+    }
+
+    final int outerBufferSize = random().nextInt(8) + 1; // buffer size between 1 <= n <= 8
+    char[] tempBuff = new char[outerBufferSize];
+
+    /*
+    Sanity check that with the default inner buffer size, the upstream input is read in its
+    entirety after the first external read.
+    */
+    TransparentReader tr = new TransparentReader(new StringReader(input), inputLength);
+    CharFilter reader = new ICUNormalizer2CharFilter(tr, norm);
+    assertTrue(reader.read(tempBuff) > 0);
+    assertEquals(inputLength, tr.cursorPosition);
+    assertTrue(tr.finished);
+
+    /*
+    By way of contrast with the "control" above, we now check the actual desired behavior, ensuring that
+    even for input with no inert chars, we're able to read input incrementally. To support incremental
+    upstream reads with reasonable-sized input, we artificially reduce the size of the internal buffer.
+    */
+    final int innerBufferSize = random().nextInt(7) + 2; // buffer size between 2 <= n <= 8
+    tr = new TransparentReader(new StringReader(input), inputLength);
+    reader = new ICUNormalizer2CharFilter(tr, norm, innerBufferSize);
+    StringBuilder result = new StringBuilder();
+    int incrementalReads = 0;
+    int finishedReads = 0;
+    while (true) {
+      int length = reader.read(tempBuff);
+      if (length == -1) {
+        assertTrue(tr.finished);
+        break;
+      }
+      result.append(tempBuff, 0, length);
+      if (length > 0) {
+        if (!tr.finished) {
+          incrementalReads++;
+          assertTrue(tr.cursorPosition < inputLength);
+        } else {
+          finishedReads++;
+        }
+      }
+    }
+    assertTrue(incrementalReads > finishedReads);
+    assertEquals(expectOutput, result.toString());
+  }
+
   public void testMassiveLigature() throws IOException {
     String input = "\uFDFA";
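
For reference, the per-character expansion that expectOutput relies on can be checked in isolation (a minimal sketch, not part of this commit): under nfkc_cf, U+2103 DEGREE CELSIUS compatibility-decomposes to DEGREE SIGN + 'C', and case folding lowercases the 'C', so each one-char '℃' in the input yields the two chars '°c' in the output:

    import com.ibm.icu.text.Normalizer2;

    public class CelsiusDemo {
      public static void main(String[] args) {
        Normalizer2 norm = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
        System.out.println(norm.normalize("\u2103")); // °c (two chars from one)
      }
    }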