Reduce allocations in ByteBuffersDataOutput.writeString (#13863)

There's no need to allocate a byte array when serializing to heap
buffers if the string fits the remaining capacity: it can then be encoded
directly into the buffer without further bounds checks.
If it doesn't fit, we could technically do better than the current
`writeLongString` and avoid one round of copying by chunking the string,
but that might not be worth the complexity.
In either case we can calculate the utf8 length up-front.
While this costs extra cycles (in the small case) for iterating the string twice, it saves
creating an oftentimes 3x oversized byte array, a `BytesRef`, field
reads from the `BytesRef`, copying from it to the buffer, and the GC work associated with cleaning it up.
Theory and some quick benchmarking suggest this version is likely faster for any string
length than the existing code.
This commit is contained in:
Armin Braun 2024-10-09 09:09:00 +02:00 committed by GitHub
parent e4092396d5
commit 25850923db
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
1 changed files with 11 additions and 9 deletions

View File

@ -31,7 +31,6 @@ import java.util.function.Consumer;
import java.util.function.IntFunction;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.BitUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.UnicodeUtil;
@ -415,12 +414,17 @@ public final class ByteBuffersDataOutput extends DataOutput implements Accountab
@Override
public void writeString(String v) {
try {
if (v.length() <= MAX_CHARS_PER_WINDOW) {
final BytesRef utf8 = new BytesRef(v);
writeVInt(utf8.length);
writeBytes(utf8.bytes, utf8.offset, utf8.length);
final int charCount = v.length();
final int byteLen = UnicodeUtil.calcUTF16toUTF8Length(v, 0, charCount);
writeVInt(byteLen);
ByteBuffer currentBlock = this.currentBlock;
if (currentBlock.hasArray() && currentBlock.remaining() >= byteLen) {
int startingPos = currentBlock.position();
UnicodeUtil.UTF16toUTF8(
v, 0, charCount, currentBlock.array(), currentBlock.arrayOffset() + startingPos);
currentBlock.position(startingPos + byteLen);
} else {
writeLongString(v);
writeLongString(byteLen, v);
}
} catch (IOException e) {
throw new UncheckedIOException(e);
@ -541,9 +545,7 @@ public final class ByteBuffersDataOutput extends DataOutput implements Accountab
}
/** Writes a long string in chunks */
private void writeLongString(final String s) throws IOException {
final int byteLen = UnicodeUtil.calcUTF16toUTF8Length(s, 0, s.length());
writeVInt(byteLen);
private void writeLongString(int byteLen, final String s) throws IOException {
final byte[] buf =
new byte[Math.min(byteLen, UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR * MAX_CHARS_PER_WINDOW)];
for (int i = 0, end = s.length(); i < end; ) {