mirror of https://github.com/apache/lucene.git
LUCENE-9177: ICUNormalizer2CharFilter streaming no longer depends on presence of normalization-inert characters (#199)
Normalization-inert characters need not be required as boundaries for incremental processing. It is sufficient to check `hasBoundaryAfter` and `hasBoundaryBefore`, substantially improving worst-case performance.
This commit is contained in:
parent
caa822ff38
commit
c3482c99ff
|
@ -378,6 +378,10 @@ Improvements
|
||||||
* LUCENE-9944: Allow DrillSideways users to provide their own CollectorManager without also requiring
|
* LUCENE-9944: Allow DrillSideways users to provide their own CollectorManager without also requiring
|
||||||
them to provide an ExecutorService. (Greg Miller)
|
them to provide an ExecutorService. (Greg Miller)
|
||||||
|
|
||||||
|
* LUCENE-9177: ICUNormalizer2CharFilter no longer requires normalization-inert
|
||||||
|
characters as boundaries for incremental processing, vastly improving worst-case
|
||||||
|
performance. (Michael Gibney)
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
---------------------
|
---------------------
|
||||||
* LUCENE-9996: Improved memory efficiency of IndexWriter's RAM buffer, in
|
* LUCENE-9996: Improved memory efficiency of IndexWriter's RAM buffer, in
|
||||||
|
|
|
@ -93,27 +93,14 @@ public final class ICUNormalizer2CharFilter extends BaseCharFilter {
|
||||||
private final CharacterUtils.CharacterBuffer tmpBuffer;
|
private final CharacterUtils.CharacterBuffer tmpBuffer;
|
||||||
|
|
||||||
private void readInputToBuffer() throws IOException {
|
private void readInputToBuffer() throws IOException {
|
||||||
while (true) {
|
// CharacterUtils.fill is supplementary char aware
|
||||||
// CharacterUtils.fill is supplementary char aware
|
if (!CharacterUtils.fill(tmpBuffer, input)) {
|
||||||
final boolean hasRemainingChars = CharacterUtils.fill(tmpBuffer, input);
|
inputFinished = true;
|
||||||
|
|
||||||
assert tmpBuffer.getOffset() == 0;
|
|
||||||
inputBuffer.append(tmpBuffer.getBuffer(), 0, tmpBuffer.getLength());
|
|
||||||
|
|
||||||
if (hasRemainingChars == false) {
|
|
||||||
inputFinished = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
final int lastCodePoint =
|
|
||||||
Character.codePointBefore(tmpBuffer.getBuffer(), tmpBuffer.getLength(), 0);
|
|
||||||
if (normalizer.isInert(lastCodePoint)) {
|
|
||||||
// we require an inert char so that we can normalize content before and
|
|
||||||
// after this character independently
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
assert tmpBuffer.getOffset() == 0;
|
||||||
|
inputBuffer.append(tmpBuffer.getBuffer(), 0, tmpBuffer.getLength());
|
||||||
|
|
||||||
// if checkedInputBoundary was at the end of a buffer, we need to check that char again
|
// if checkedInputBoundary was at the end of a buffer, we need to check that char again
|
||||||
checkedInputBoundary = Math.max(checkedInputBoundary - 1, 0);
|
checkedInputBoundary = Math.max(checkedInputBoundary - 1, 0);
|
||||||
}
|
}
|
||||||
|
@ -125,7 +112,6 @@ public final class ICUNormalizer2CharFilter extends BaseCharFilter {
|
||||||
}
|
}
|
||||||
if (!afterQuickCheckYes) {
|
if (!afterQuickCheckYes) {
|
||||||
int resLen = readFromInputWhileSpanQuickCheckYes();
|
int resLen = readFromInputWhileSpanQuickCheckYes();
|
||||||
afterQuickCheckYes = true;
|
|
||||||
if (resLen > 0) return resLen;
|
if (resLen > 0) return resLen;
|
||||||
}
|
}
|
||||||
int resLen = readFromIoNormalizeUptoBoundary();
|
int resLen = readFromIoNormalizeUptoBoundary();
|
||||||
|
@ -136,8 +122,30 @@ public final class ICUNormalizer2CharFilter extends BaseCharFilter {
|
||||||
}
|
}
|
||||||
|
|
||||||
private int readFromInputWhileSpanQuickCheckYes() {
|
private int readFromInputWhileSpanQuickCheckYes() {
|
||||||
|
afterQuickCheckYes = true;
|
||||||
int end = normalizer.spanQuickCheckYes(inputBuffer);
|
int end = normalizer.spanQuickCheckYes(inputBuffer);
|
||||||
if (end > 0) {
|
if (end > 0) {
|
||||||
|
int cp;
|
||||||
|
if (end == inputBuffer.length()
|
||||||
|
&& !normalizer.hasBoundaryAfter(cp = inputBuffer.codePointBefore(end))) {
|
||||||
|
/*
|
||||||
|
our quickCheckYes result is valid thru the end of current buffer, but we need to back off
|
||||||
|
because we're not at a normalization boundary. At a minimum this is relevant wrt imposing
|
||||||
|
canonical ordering of combining characters across the buffer boundary.
|
||||||
|
*/
|
||||||
|
afterQuickCheckYes = false;
|
||||||
|
end -= Character.charCount(cp);
|
||||||
|
// NOTE: for the loop, we pivot to using `hasBoundaryBefore()` because per the docs for
|
||||||
|
// `Normalizer2.hasBoundaryAfter()`:
|
||||||
|
// "Note that this operation may be significantly slower than hasBoundaryBefore()"
|
||||||
|
while (end > 0 && !normalizer.hasBoundaryBefore(cp)) {
|
||||||
|
cp = inputBuffer.codePointBefore(end);
|
||||||
|
end -= Character.charCount(cp);
|
||||||
|
}
|
||||||
|
if (end == 0) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
resultBuffer.append(inputBuffer.subSequence(0, end));
|
resultBuffer.append(inputBuffer.subSequence(0, end));
|
||||||
inputBuffer.delete(0, end);
|
inputBuffer.delete(0, end);
|
||||||
checkedInputBoundary = Math.max(checkedInputBoundary - end, 0);
|
checkedInputBoundary = Math.max(checkedInputBoundary - end, 0);
|
||||||
|
|
|
@ -94,6 +94,104 @@ public class TestICUNormalizer2CharFilter extends BaseTokenStreamTestCase {
|
||||||
input.length());
|
input.length());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A wrapping reader that provides transparency wrt how much of the underlying stream has been
|
||||||
|
* consumed. This may be used to validate incremental behavior in upstream reads.
|
||||||
|
*/
|
||||||
|
private static class TransparentReader extends Reader {
|
||||||
|
|
||||||
|
private final int expectSize;
|
||||||
|
private final Reader backing;
|
||||||
|
private int cursorPosition = -1;
|
||||||
|
private boolean finished = false;
|
||||||
|
|
||||||
|
private TransparentReader(Reader backing, int expectSize) {
|
||||||
|
this.expectSize = expectSize;
|
||||||
|
this.backing = backing;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int read(char[] cbuf, int off, int len) throws IOException {
|
||||||
|
final int ret = backing.read(cbuf, off, len);
|
||||||
|
if (cursorPosition == -1) {
|
||||||
|
if (ret != -1) {
|
||||||
|
cursorPosition = ret;
|
||||||
|
} else {
|
||||||
|
cursorPosition = 0;
|
||||||
|
}
|
||||||
|
} else if (ret == -1) {
|
||||||
|
assert finished;
|
||||||
|
} else {
|
||||||
|
cursorPosition += ret;
|
||||||
|
}
|
||||||
|
if (!finished && cursorPosition >= expectSize) {
|
||||||
|
finished = true;
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void close() throws IOException {
|
||||||
|
assert finished && cursorPosition != -1;
|
||||||
|
backing.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testIncrementalNoInertChars() throws Exception {
|
||||||
|
final String input = "℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa℃aa";
|
||||||
|
final int inputLength = input.length();
|
||||||
|
final String expectOutput = "°caa°caa°caa°caa°caa°caa°caa°caa°caa°caa°caa°caa°caa°caa";
|
||||||
|
Normalizer2 norm = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
|
||||||
|
|
||||||
|
// Sanity check/demonstrate that the input doesn't have any normalization-inert codepoints
|
||||||
|
for (int i = inputLength - 1; i >= 0; i--) {
|
||||||
|
assertFalse(norm.isInert(Character.codePointAt(input, i)));
|
||||||
|
}
|
||||||
|
|
||||||
|
final int outerBufferSize = random().nextInt(8) + 1; // buffer size between 1 <= n <= 8
|
||||||
|
char[] tempBuff = new char[outerBufferSize];
|
||||||
|
|
||||||
|
/*
|
||||||
|
Sanity check that with the default inner buffer size, the upstream input is read in its
|
||||||
|
entirety after the first external read.
|
||||||
|
*/
|
||||||
|
TransparentReader tr = new TransparentReader(new StringReader(input), inputLength);
|
||||||
|
CharFilter reader = new ICUNormalizer2CharFilter(tr, norm);
|
||||||
|
assertTrue(reader.read(tempBuff) > 0);
|
||||||
|
assertEquals(inputLength, tr.cursorPosition);
|
||||||
|
assertTrue(tr.finished);
|
||||||
|
|
||||||
|
/*
|
||||||
|
By way of contrast with the "control" above, we now check the actual desired behavior, ensuring that
|
||||||
|
even for input with no inert chars, we're able to read input incrementally. To support incremental
|
||||||
|
upstream reads with reasonable-sized input, we artificially reduce the size of the internal buffer
|
||||||
|
*/
|
||||||
|
final int innerBufferSize = random().nextInt(7) + 2; // buffer size between 2 <= n <= 8
|
||||||
|
tr = new TransparentReader(new StringReader(input), inputLength);
|
||||||
|
reader = new ICUNormalizer2CharFilter(tr, norm, innerBufferSize);
|
||||||
|
StringBuilder result = new StringBuilder();
|
||||||
|
int incrementalReads = 0;
|
||||||
|
int finishedReads = 0;
|
||||||
|
while (true) {
|
||||||
|
int length = reader.read(tempBuff);
|
||||||
|
if (length == -1) {
|
||||||
|
assertTrue(tr.finished);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
result.append(tempBuff, 0, length);
|
||||||
|
if (length > 0) {
|
||||||
|
if (!tr.finished) {
|
||||||
|
incrementalReads++;
|
||||||
|
assertTrue(tr.cursorPosition < inputLength);
|
||||||
|
} else {
|
||||||
|
finishedReads++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assertTrue(incrementalReads > finishedReads);
|
||||||
|
assertEquals(expectOutput, result.toString());
|
||||||
|
}
|
||||||
|
|
||||||
public void testMassiveLigature() throws IOException {
|
public void testMassiveLigature() throws IOException {
|
||||||
String input = "\uFDFA";
|
String input = "\uFDFA";
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue