mirror of https://github.com/apache/lucene.git
Clean up writing String to ByteBuffersDataOutput (#12455)
Resolves the TODO to use UnicodeUtil instead of keeping a copy of its code here. This may be slightly slower because of the extra check for a high surrogate, but that cost may be offset, or even outweighed, by the more compact code and by no longer needing the capturing lambda, which might not be inlined.
This commit is contained in:
parent
87944c2aa7
commit
538b7d0ffe
|
@ -28,7 +28,6 @@ import java.util.Map;
|
|||
import java.util.Objects;
|
||||
import java.util.Set;
|
||||
import java.util.function.Consumer;
|
||||
import java.util.function.IntConsumer;
|
||||
import java.util.function.IntFunction;
|
||||
import org.apache.lucene.util.Accountable;
|
||||
import org.apache.lucene.util.BitUtil;
|
||||
|
@ -410,25 +409,17 @@ public final class ByteBuffersDataOutput extends DataOutput implements Accountab
|
|||
}
|
||||
}
|
||||
|
||||
private static final int MAX_CHARS_PER_WINDOW = 1024;
|
||||
|
||||
@Override
|
||||
public void writeString(String v) {
|
||||
try {
|
||||
final int MAX_CHARS_PER_WINDOW = 1024;
|
||||
if (v.length() <= MAX_CHARS_PER_WINDOW) {
|
||||
final BytesRef utf8 = new BytesRef(v);
|
||||
writeVInt(utf8.length);
|
||||
writeBytes(utf8.bytes, utf8.offset, utf8.length);
|
||||
} else {
|
||||
writeVInt(UnicodeUtil.calcUTF16toUTF8Length(v, 0, v.length()));
|
||||
final byte[] buf = new byte[UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR * MAX_CHARS_PER_WINDOW];
|
||||
UTF16toUTF8(
|
||||
v,
|
||||
0,
|
||||
v.length(),
|
||||
buf,
|
||||
(len) -> {
|
||||
writeBytes(buf, 0, len);
|
||||
});
|
||||
writeLongString(v);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new UncheckedIOException(e);
|
||||
|
@ -549,66 +540,22 @@ public final class ByteBuffersDataOutput extends DataOutput implements Accountab
|
|||
return blockBits;
|
||||
}
|
||||
|
||||
// TODO: move this block-based conversion to UnicodeUtil.
|
||||
|
||||
private static final long HALF_SHIFT = 10;
|
||||
private static final int SURROGATE_OFFSET =
|
||||
Character.MIN_SUPPLEMENTARY_CODE_POINT
|
||||
- (UnicodeUtil.UNI_SUR_HIGH_START << HALF_SHIFT)
|
||||
- UnicodeUtil.UNI_SUR_LOW_START;
|
||||
|
||||
/** A consumer-based UTF16-UTF8 encoder (writes the input string in smaller buffers.). */
|
||||
private static int UTF16toUTF8(
|
||||
final CharSequence s,
|
||||
final int offset,
|
||||
final int length,
|
||||
byte[] buf,
|
||||
IntConsumer bufferFlusher) {
|
||||
int utf8Len = 0;
|
||||
int j = 0;
|
||||
for (int i = offset, end = offset + length; i < end; i++) {
|
||||
final int chr = (int) s.charAt(i);
|
||||
|
||||
if (j + 4 >= buf.length) {
|
||||
bufferFlusher.accept(j);
|
||||
utf8Len += j;
|
||||
j = 0;
|
||||
}
|
||||
|
||||
if (chr < 0x80) buf[j++] = (byte) chr;
|
||||
else if (chr < 0x800) {
|
||||
buf[j++] = (byte) (0xC0 | (chr >> 6));
|
||||
buf[j++] = (byte) (0x80 | (chr & 0x3F));
|
||||
} else if (chr < 0xD800 || chr > 0xDFFF) {
|
||||
buf[j++] = (byte) (0xE0 | (chr >> 12));
|
||||
buf[j++] = (byte) (0x80 | ((chr >> 6) & 0x3F));
|
||||
buf[j++] = (byte) (0x80 | (chr & 0x3F));
|
||||
} else {
|
||||
// A surrogate pair. Confirm valid high surrogate.
|
||||
if (chr < 0xDC00 && (i < end - 1)) {
|
||||
int utf32 = (int) s.charAt(i + 1);
|
||||
// Confirm valid low surrogate and write pair.
|
||||
if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
|
||||
utf32 = (chr << 10) + utf32 + SURROGATE_OFFSET;
|
||||
i++;
|
||||
buf[j++] = (byte) (0xF0 | (utf32 >> 18));
|
||||
buf[j++] = (byte) (0x80 | ((utf32 >> 12) & 0x3F));
|
||||
buf[j++] = (byte) (0x80 | ((utf32 >> 6) & 0x3F));
|
||||
buf[j++] = (byte) (0x80 | (utf32 & 0x3F));
|
||||
continue;
|
||||
}
|
||||
}
|
||||
// Replace unpaired surrogate or out-of-order low surrogate
|
||||
// with substitution character.
|
||||
buf[j++] = (byte) 0xEF;
|
||||
buf[j++] = (byte) 0xBF;
|
||||
buf[j++] = (byte) 0xBD;
|
||||
/** Writes a long string in chunks */
|
||||
private void writeLongString(final String s) throws IOException {
|
||||
final int byteLen = UnicodeUtil.calcUTF16toUTF8Length(s, 0, s.length());
|
||||
writeVInt(byteLen);
|
||||
final byte[] buf =
|
||||
new byte[Math.min(byteLen, UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR * MAX_CHARS_PER_WINDOW)];
|
||||
for (int i = 0, end = s.length(); i < end; ) {
|
||||
// do one fewer chars than MAX_CHARS_PER_WINDOW in case we run into an unpaired surrogate
|
||||
// below and need to increase the step to cover the lower surrogate as well
|
||||
int step = Math.min(end - i, MAX_CHARS_PER_WINDOW - 1);
|
||||
if (i + step < end && Character.isHighSurrogate(s.charAt(i + step - 1))) {
|
||||
step++;
|
||||
}
|
||||
int upTo = UnicodeUtil.UTF16toUTF8(s, i, step, buf);
|
||||
writeBytes(buf, 0, upTo);
|
||||
i += step;
|
||||
}
|
||||
|
||||
bufferFlusher.accept(j);
|
||||
utf8Len += j;
|
||||
|
||||
return utf8Len;
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue