Clean up writing String to ByteBuffersDataOutput (#12455)

Resolves the TODO to use UnicodeUtil instead of keeping a copy of its code here.
This may be slightly slower because of the extra check for a high surrogate, but
that may be outweighed by the more compact code and by no longer needing the
capturing lambda, which might not get inlined.
This commit is contained in:
Armin Braun 2023-07-26 14:25:01 +02:00 committed by GitHub
parent 87944c2aa7
commit 538b7d0ffe
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 18 additions and 71 deletions

View File

@ -28,7 +28,6 @@ import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.function.Consumer;
import java.util.function.IntConsumer;
import java.util.function.IntFunction;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.BitUtil;
@ -410,25 +409,17 @@ public final class ByteBuffersDataOutput extends DataOutput implements Accountab
}
}
private static final int MAX_CHARS_PER_WINDOW = 1024;
@Override
public void writeString(String v) {
try {
final int MAX_CHARS_PER_WINDOW = 1024;
if (v.length() <= MAX_CHARS_PER_WINDOW) {
final BytesRef utf8 = new BytesRef(v);
writeVInt(utf8.length);
writeBytes(utf8.bytes, utf8.offset, utf8.length);
} else {
writeVInt(UnicodeUtil.calcUTF16toUTF8Length(v, 0, v.length()));
final byte[] buf = new byte[UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR * MAX_CHARS_PER_WINDOW];
UTF16toUTF8(
v,
0,
v.length(),
buf,
(len) -> {
writeBytes(buf, 0, len);
});
writeLongString(v);
}
} catch (IOException e) {
throw new UncheckedIOException(e);
@ -549,66 +540,22 @@ public final class ByteBuffersDataOutput extends DataOutput implements Accountab
return blockBits;
}
// TODO: move this block-based conversion to UnicodeUtil.
private static final long HALF_SHIFT = 10;
private static final int SURROGATE_OFFSET =
Character.MIN_SUPPLEMENTARY_CODE_POINT
- (UnicodeUtil.UNI_SUR_HIGH_START << HALF_SHIFT)
- UnicodeUtil.UNI_SUR_LOW_START;
/** A consumer-based UTF16-UTF8 encoder (writes the input string in smaller buffers.). */
private static int UTF16toUTF8(
final CharSequence s,
final int offset,
final int length,
byte[] buf,
IntConsumer bufferFlusher) {
int utf8Len = 0;
int j = 0;
for (int i = offset, end = offset + length; i < end; i++) {
final int chr = (int) s.charAt(i);
if (j + 4 >= buf.length) {
bufferFlusher.accept(j);
utf8Len += j;
j = 0;
}
if (chr < 0x80) buf[j++] = (byte) chr;
else if (chr < 0x800) {
buf[j++] = (byte) (0xC0 | (chr >> 6));
buf[j++] = (byte) (0x80 | (chr & 0x3F));
} else if (chr < 0xD800 || chr > 0xDFFF) {
buf[j++] = (byte) (0xE0 | (chr >> 12));
buf[j++] = (byte) (0x80 | ((chr >> 6) & 0x3F));
buf[j++] = (byte) (0x80 | (chr & 0x3F));
} else {
// A surrogate pair. Confirm valid high surrogate.
if (chr < 0xDC00 && (i < end - 1)) {
int utf32 = (int) s.charAt(i + 1);
// Confirm valid low surrogate and write pair.
if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
utf32 = (chr << 10) + utf32 + SURROGATE_OFFSET;
i++;
buf[j++] = (byte) (0xF0 | (utf32 >> 18));
buf[j++] = (byte) (0x80 | ((utf32 >> 12) & 0x3F));
buf[j++] = (byte) (0x80 | ((utf32 >> 6) & 0x3F));
buf[j++] = (byte) (0x80 | (utf32 & 0x3F));
continue;
}
}
// Replace unpaired surrogate or out-of-order low surrogate
// with substitution character.
buf[j++] = (byte) 0xEF;
buf[j++] = (byte) 0xBF;
buf[j++] = (byte) 0xBD;
/**
 * Writes a long string in chunks: first the UTF-8 byte length of the whole string as a vInt,
 * then the UTF-8 bytes, encoded piecewise through a scratch buffer of bounded size.
 *
 * <p>Chunk boundaries must never separate a valid surrogate pair. The length prefix is computed
 * over the entire string, so a pair split across two chunks would be encoded as two replacement
 * characters instead of one 4-byte sequence and the number of bytes written would no longer
 * match the prefix.
 *
 * @param s the string to write
 * @throws IOException declared for the underlying write calls
 */
private void writeLongString(final String s) throws IOException {
  final int byteLen = UnicodeUtil.calcUTF16toUTF8Length(s, 0, s.length());
  writeVInt(byteLen);
  final byte[] buf =
      new byte[Math.min(byteLen, UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR * MAX_CHARS_PER_WINDOW)];
  for (int i = 0, end = s.length(); i < end; ) {
    // Take one fewer char than MAX_CHARS_PER_WINDOW so the chunk may grow by one char (and
    // still fit the buffer) when its boundary would otherwise fall inside a surrogate pair.
    int step = Math.min(end - i, MAX_CHARS_PER_WINDOW - 1);
    // Grow only for a genuine pair, i.e. a high surrogate directly followed by a low surrogate.
    // Growing after an unpaired high surrogate could pull in a second high surrogate and split
    // it from its own low surrogate, producing more bytes than byteLen announced.
    if (i + step < end
        && Character.isHighSurrogate(s.charAt(i + step - 1))
        && Character.isLowSurrogate(s.charAt(i + step))) {
      step++;
    }
    int upTo = UnicodeUtil.UTF16toUTF8(s, i, step, buf);
    writeBytes(buf, 0, upTo);
    i += step;
  }
bufferFlusher.accept(j);
utf8Len += j;
return utf8Len;
}
}