Reduce allocations in ByteBuffersDataOutput.writeString (#13863)

There's no need to allocate a byte array when serializing to heap buffers if the string fits into the current block's remaining capacity: in that case we can encode it directly into the block without further bounds checks. If it doesn't fit, we could technically do better than the current `writeLongString` and avoid one round of copying by chunking the string, but that might not be worth the complexity. In either case we can calculate the UTF-8 length up front. While this costs extra cycles (in the small case) for iterating over the string twice, it saves creating an oftentimes 3x oversized byte array and a `BytesRef`, the field reads from the `BytesRef`, the copy from it to the buffer, and the GC work associated with cleaning it up. Theory and some quick benchmarking suggest this version is likely faster than the existing code for any string length.
2024-10-09 09:09:00 +02:00 · 2024-10-09 09:09:00 +02:00 · 25850923db
parent e4092396d5
commit 25850923db
1 changed files with 11 additions and 9 deletions
--- a/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataOutput.java
+++ b/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataOutput.java
@@ -31,7 +31,6 @@ import java.util.function.Consumer;
 import java.util.function.IntFunction;
 import org.apache.lucene.util.Accountable;
 import org.apache.lucene.util.BitUtil;
-import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.RamUsageEstimator;
 import org.apache.lucene.util.UnicodeUtil;

@@ -415,12 +414,17 @@ public final class ByteBuffersDataOutput extends DataOutput implements Accountab
  @Override
  public void writeString(String v) {
    try {
-      if (v.length() <= MAX_CHARS_PER_WINDOW) {
-        final BytesRef utf8 = new BytesRef(v);
-        writeVInt(utf8.length);
-        writeBytes(utf8.bytes, utf8.offset, utf8.length);
+      final int charCount = v.length();
+      final int byteLen = UnicodeUtil.calcUTF16toUTF8Length(v, 0, charCount);
+      writeVInt(byteLen);
+      ByteBuffer currentBlock = this.currentBlock;
+      if (currentBlock.hasArray() && currentBlock.remaining() >= byteLen) {
+        int startingPos = currentBlock.position();
+        UnicodeUtil.UTF16toUTF8(
+            v, 0, charCount, currentBlock.array(), currentBlock.arrayOffset() + startingPos);
+        currentBlock.position(startingPos + byteLen);
      } else {
-        writeLongString(v);
+        writeLongString(byteLen, v);
      }
    } catch (IOException e) {
      throw new UncheckedIOException(e);
@@ -541,9 +545,7 @@ public final class ByteBuffersDataOutput extends DataOutput implements Accountab
  }

  /** Writes a long string in chunks */
-  private void writeLongString(final String s) throws IOException {
-    final int byteLen = UnicodeUtil.calcUTF16toUTF8Length(s, 0, s.length());
-    writeVInt(byteLen);
+  private void writeLongString(int byteLen, final String s) throws IOException {
    final byte[] buf =
        new byte[Math.min(byteLen, UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR * MAX_CHARS_PER_WINDOW)];
    for (int i = 0, end = s.length(); i < end; ) {