mirror of https://github.com/apache/lucene.git
LUCENE-9177: ICUNormalizer2CharFilter streaming no longer depends on presence of normalization-inert characters (#199)
Normalization-inert characters are not required as boundaries for incremental processing. It is sufficient to check `hasBoundaryAfter` and `hasBoundaryBefore`, substantially improving worst-case performance.
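By way of background, here is a minimal sketch (not part of this commit; the class name and expected-output comments are illustrative) of the ICU4J Normalizer2 distinction this change relies on: inertness is a much rarer property than having a normalization boundary. Under nfkc_cf even plain 'a' is not inert, yet it is still a safe restart point:

    import com.ibm.icu.text.Normalizer2;

    public class BoundaryProbe {
      public static void main(String[] args) {
        // Same normalizer the new test uses: NFKC with case folding
        Normalizer2 norm = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
        // 'a' interacts with text *after* it (a following combining acute composes
        // to 'á'), so it is not inert ...
        System.out.println(norm.isInert('a'));           // expected: false
        // ... but nothing *before* 'a' can interact with it, so incremental
        // normalization may safely restart at this position.
        System.out.println(norm.hasBoundaryBefore('a')); // expected: true
      }
    }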
parent caa822ff38
commit c3482c99ff
lucene/CHANGES.txt
@@ -378,6 +378,10 @@ Improvements
 * LUCENE-9944: Allow DrillSideways users to provide their own CollectorManager without also requiring
   them to provide an ExecutorService. (Greg Miller)
 
+* LUCENE-9177: ICUNormalizer2CharFilter no longer requires normalization-inert
+  characters as boundaries for incremental processing, vastly improving worst-case
+  performance. (Michael Gibney)
+
 Optimizations
 ---------------------
 
 * LUCENE-9996: Improved memory efficiency of IndexWriter's RAM buffer, in
ICUNormalizer2CharFilter.java
@@ -93,27 +93,14 @@ public final class ICUNormalizer2CharFilter extends BaseCharFilter {
   private final CharacterUtils.CharacterBuffer tmpBuffer;
 
   private void readInputToBuffer() throws IOException {
-    while (true) {
-      // CharacterUtils.fill is supplementary char aware
-      final boolean hasRemainingChars = CharacterUtils.fill(tmpBuffer, input);
-
-      assert tmpBuffer.getOffset() == 0;
-      inputBuffer.append(tmpBuffer.getBuffer(), 0, tmpBuffer.getLength());
-
-      if (hasRemainingChars == false) {
-        inputFinished = true;
-        break;
-      }
-
-      final int lastCodePoint =
-          Character.codePointBefore(tmpBuffer.getBuffer(), tmpBuffer.getLength(), 0);
-      if (normalizer.isInert(lastCodePoint)) {
-        // we require an inert char so that we can normalize content before and
-        // after this character independently
-        break;
-      }
-    }
+    // CharacterUtils.fill is supplementary char aware
+    if (!CharacterUtils.fill(tmpBuffer, input)) {
+      inputFinished = true;
+    }
+
+    assert tmpBuffer.getOffset() == 0;
+    inputBuffer.append(tmpBuffer.getBuffer(), 0, tmpBuffer.getLength());
+
+    // if checkedInputBoundary was at the end of a buffer, we need to check that char again
+    checkedInputBoundary = Math.max(checkedInputBoundary - 1, 0);
   }
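To see why the removed loop was the worst-case bottleneck: it kept reading until it hit an inert character, and for some normalizers and inputs there may be none at all, forcing the entire stream into memory before any output could be produced. A rough probe of the new test's input shape (hypothetical class name, not commit code) demonstrates this:

    import com.ibm.icu.text.Normalizer2;

    public class InertScan {
      public static void main(String[] args) {
        Normalizer2 norm = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
        // same shape as the input used by testIncrementalNoInertChars below
        String input = "℃aa℃aa℃aa";
        // every codepoint prints inert=false, so the old loop would never break
        // early; boundaries, by contrast, are plentiful
        input.codePoints().forEach(cp ->
            System.out.printf("U+%04X inert=%b boundaryBefore=%b%n",
                cp, norm.isInert(cp), norm.hasBoundaryBefore(cp)));
      }
    }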
@@ -125,7 +112,6 @@ public final class ICUNormalizer2CharFilter extends BaseCharFilter {
     }
     if (!afterQuickCheckYes) {
       int resLen = readFromInputWhileSpanQuickCheckYes();
-      afterQuickCheckYes = true;
       if (resLen > 0) return resLen;
     }
     int resLen = readFromIoNormalizeUptoBoundary();
@@ -136,8 +122,30 @@ public final class ICUNormalizer2CharFilter extends BaseCharFilter {
   }
 
   private int readFromInputWhileSpanQuickCheckYes() {
+    afterQuickCheckYes = true;
     int end = normalizer.spanQuickCheckYes(inputBuffer);
     if (end > 0) {
+      int cp;
+      if (end == inputBuffer.length()
+          && !normalizer.hasBoundaryAfter(cp = inputBuffer.codePointBefore(end))) {
+        /*
+        our quickCheckYes result is valid thru the end of current buffer, but we need to back off
+        because we're not at a normalization boundary. At a minimum this is relevant wrt imposing
+        canonical ordering of combining characters across the buffer boundary.
+        */
+        afterQuickCheckYes = false;
+        end -= Character.charCount(cp);
+        // NOTE: for the loop, we pivot to using `hasBoundaryBefore()` because per the docs for
+        // `Normalizer2.hasBoundaryAfter()`:
+        // "Note that this operation may be significantly slower than hasBoundaryBefore()"
+        while (end > 0 && !normalizer.hasBoundaryBefore(cp)) {
+          cp = inputBuffer.codePointBefore(end);
+          end -= Character.charCount(cp);
+        }
+        if (end == 0) {
+          return 0;
+        }
+      }
       resultBuffer.append(inputBuffer.subSequence(0, end));
       inputBuffer.delete(0, end);
       checkedInputBoundary = Math.max(checkedInputBoundary - end, 0);
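The back-off above can be read in isolation as: take the quick-check-clean prefix, and if it runs to the very end of the buffer, retreat codepoint by codepoint until the cut lands on a normalization boundary. A self-contained sketch of that logic (a hypothetical helper operating on a plain StringBuilder rather than the filter's internal buffer):

    import com.ibm.icu.text.Normalizer2;

    public class BoundaryBackoff {
      /** Largest prefix length of {@code buf} that is safe to emit, else 0. */
      static int safePrefix(Normalizer2 norm, StringBuilder buf) {
        int end = norm.spanQuickCheckYes(buf);
        if (end > 0 && end == buf.length()) {
          int cp = buf.codePointBefore(end);
          if (!norm.hasBoundaryAfter(cp)) {
            // back off codepoint by codepoint; per the ICU docs,
            // hasBoundaryBefore is cheaper than hasBoundaryAfter
            end -= Character.charCount(cp);
            while (end > 0 && !norm.hasBoundaryBefore(cp)) {
              cp = buf.codePointBefore(end);
              end -= Character.charCount(cp);
            }
          }
        }
        return end;
      }
    }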
TestICUNormalizer2CharFilter.java
@@ -94,6 +94,104 @@ public class TestICUNormalizer2CharFilter extends BaseTokenStreamTestCase {
         input.length());
   }
 
+  /**
+   * A wrapping reader that provides transparency wrt how much of the underlying stream has been
+   * consumed. This may be used to validate incremental behavior in upstream reads.
+   */
+  private static class TransparentReader extends Reader {
+
+    private final int expectSize;
+    private final Reader backing;
+    private int cursorPosition = -1;
+    private boolean finished = false;
+
+    private TransparentReader(Reader backing, int expectSize) {
+      this.expectSize = expectSize;
+      this.backing = backing;
+    }
+
+    @Override
+    public int read(char[] cbuf, int off, int len) throws IOException {
+      final int ret = backing.read(cbuf, off, len);
+      if (cursorPosition == -1) {
+        if (ret != -1) {
+          cursorPosition = ret;
+        } else {
+          cursorPosition = 0;
+        }
+      } else if (ret == -1) {
+        assert finished;
+      } else {
+        cursorPosition += ret;
+      }
+      if (!finished && cursorPosition >= expectSize) {
+        finished = true;
+      }
+      return ret;
+    }
+
+    @Override
+    public void close() throws IOException {
+      assert finished && cursorPosition != -1;
+      backing.close();
+    }
+  }
+
+  public void testIncrementalNoInertChars() throws Exception {
+    final String input = "℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa";
+    final int inputLength = input.length();
+    final String expectOutput = "°caa°caa°caa°caa°caa°caa°caa°caa°caa°caa°caa°caa°caa°caa";
+    Normalizer2 norm = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
+
+    // Sanity check/demonstrate that the input doesn't have any normalization-inert codepoints
+    for (int i = inputLength - 1; i >= 0; i--) {
+      assertFalse(norm.isInert(Character.codePointAt(input, i)));
+    }
+
+    final int outerBufferSize = random().nextInt(8) + 1; // buffer size between 1 <= n <= 8
+    char[] tempBuff = new char[outerBufferSize];
+
+    /*
+    Sanity check that with the default inner buffer size, the upstream input is read in its
+    entirety after the first external read.
+    */
+    TransparentReader tr = new TransparentReader(new StringReader(input), inputLength);
+    CharFilter reader = new ICUNormalizer2CharFilter(tr, norm);
+    assertTrue(reader.read(tempBuff) > 0);
+    assertEquals(inputLength, tr.cursorPosition);
+    assertTrue(tr.finished);
+
+    /*
+    By way of contrast with the "control" above, we now check the actual desired behavior,
+    ensuring that even for input with no inert chars, we're able to read input incrementally.
+    To support incremental upstream reads with reasonable-sized input, we artificially reduce
+    the size of the internal buffer.
+    */
+    final int innerBufferSize = random().nextInt(7) + 2; // buffer size between 2 <= n <= 8
+    tr = new TransparentReader(new StringReader(input), inputLength);
+    reader = new ICUNormalizer2CharFilter(tr, norm, innerBufferSize);
+    StringBuilder result = new StringBuilder();
+    int incrementalReads = 0;
+    int finishedReads = 0;
+    while (true) {
+      int length = reader.read(tempBuff);
+      if (length == -1) {
+        assertTrue(tr.finished);
+        break;
+      }
+      result.append(tempBuff, 0, length);
+      if (length > 0) {
+        if (!tr.finished) {
+          incrementalReads++;
+          assertTrue(tr.cursorPosition < inputLength);
+        } else {
+          finishedReads++;
+        }
+      }
+    }
+    assertTrue(incrementalReads > finishedReads);
+    assertEquals(expectOutput, result.toString());
+  }
+
   public void testMassiveLigature() throws IOException {
     String input = "\uFDFA";
 
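For reference, a minimal end-to-end usage sketch of the char filter itself (hypothetical class name; the two-arg constructor shown here is part of the public API, unlike the buffer-size constructor the test uses from within the package):

    import java.io.StringReader;
    import com.ibm.icu.text.Normalizer2;
    import org.apache.lucene.analysis.CharFilter;
    import org.apache.lucene.analysis.icu.ICUNormalizer2CharFilter;

    public class NormalizeDemo {
      public static void main(String[] args) throws Exception {
        Normalizer2 norm = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
        // wrap any Reader; normalized output is produced incrementally
        CharFilter reader = new ICUNormalizer2CharFilter(new StringReader("℃aa"), norm);
        char[] buf = new char[8];
        StringBuilder out = new StringBuilder();
        int n;
        while ((n = reader.read(buf)) != -1) {
          out.append(buf, 0, n);
        }
        System.out.println(out); // expected: °caa
      }
    }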