Clean up writing String to ByteBuffersDataOutput (#12455)

Resolve the TODO by using UnicodeUtil instead of keeping a copy of its code here. This may be slightly slower because of the extra surrogate check at each window boundary, but that cost may be matched or outweighed by the more compact code and by no longer needing the capturing lambda, which might not inline.
2023-07-26 14:25:01 +02:00 · 2023-07-26 14:25:01 +02:00 · 538b7d0ffe
parent 87944c2aa7
commit 538b7d0ffe
1 changed files with 18 additions and 71 deletions
--- a/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataOutput.java
+++ b/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataOutput.java
@ -28,7 +28,6 @@ import java.util.Map;
 import java.util.Objects;
 import java.util.Set;
 import java.util.function.Consumer;
-import java.util.function.IntConsumer;
 import java.util.function.IntFunction;
 import org.apache.lucene.util.Accountable;
 import org.apache.lucene.util.BitUtil;
@ -410,25 +409,17 @@ public final class ByteBuffersDataOutput extends DataOutput implements Accountab
    }
  }

+  private static final int MAX_CHARS_PER_WINDOW = 1024;
+
  @Override
  public void writeString(String v) {
    try {
-      final int MAX_CHARS_PER_WINDOW = 1024;
      if (v.length() <= MAX_CHARS_PER_WINDOW) {
        final BytesRef utf8 = new BytesRef(v);
        writeVInt(utf8.length);
        writeBytes(utf8.bytes, utf8.offset, utf8.length);
      } else {
-        writeVInt(UnicodeUtil.calcUTF16toUTF8Length(v, 0, v.length()));
-        final byte[] buf = new byte[UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR * MAX_CHARS_PER_WINDOW];
-        UTF16toUTF8(
-            v,
-            0,
-            v.length(),
-            buf,
-            (len) -> {
-              writeBytes(buf, 0, len);
-            });
+        writeLongString(v);
      }
    } catch (IOException e) {
      throw new UncheckedIOException(e);
@ -549,66 +540,22 @@ public final class ByteBuffersDataOutput extends DataOutput implements Accountab
    return blockBits;
  }

-  // TODO: move this block-based conversion to UnicodeUtil.
-
-  private static final long HALF_SHIFT = 10;
-  private static final int SURROGATE_OFFSET =
-      Character.MIN_SUPPLEMENTARY_CODE_POINT
-          - (UnicodeUtil.UNI_SUR_HIGH_START << HALF_SHIFT)
-          - UnicodeUtil.UNI_SUR_LOW_START;
-
-  /** A consumer-based UTF16-UTF8 encoder (writes the input string in smaller buffers.). */
-  private static int UTF16toUTF8(
-      final CharSequence s,
-      final int offset,
-      final int length,
-      byte[] buf,
-      IntConsumer bufferFlusher) {
-    int utf8Len = 0;
-    int j = 0;
-    for (int i = offset, end = offset + length; i < end; i++) {
-      final int chr = (int) s.charAt(i);
-
-      if (j + 4 >= buf.length) {
-        bufferFlusher.accept(j);
-        utf8Len += j;
-        j = 0;
-      }
-
-      if (chr < 0x80) buf[j++] = (byte) chr;
-      else if (chr < 0x800) {
-        buf[j++] = (byte) (0xC0 | (chr >> 6));
-        buf[j++] = (byte) (0x80 | (chr & 0x3F));
-      } else if (chr < 0xD800 || chr > 0xDFFF) {
-        buf[j++] = (byte) (0xE0 | (chr >> 12));
-        buf[j++] = (byte) (0x80 | ((chr >> 6) & 0x3F));
-        buf[j++] = (byte) (0x80 | (chr & 0x3F));
-      } else {
-        // A surrogate pair. Confirm valid high surrogate.
-        if (chr < 0xDC00 && (i < end - 1)) {
-          int utf32 = (int) s.charAt(i + 1);
-          // Confirm valid low surrogate and write pair.
-          if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
-            utf32 = (chr << 10) + utf32 + SURROGATE_OFFSET;
-            i++;
-            buf[j++] = (byte) (0xF0 | (utf32 >> 18));
-            buf[j++] = (byte) (0x80 | ((utf32 >> 12) & 0x3F));
-            buf[j++] = (byte) (0x80 | ((utf32 >> 6) & 0x3F));
-            buf[j++] = (byte) (0x80 | (utf32 & 0x3F));
-            continue;
-          }
-        }
-        // Replace unpaired surrogate or out-of-order low surrogate
-        // with substitution character.
-        buf[j++] = (byte) 0xEF;
-        buf[j++] = (byte) 0xBF;
-        buf[j++] = (byte) 0xBD;
+  /** Writes the UTF-8 byte length of a long string, then its bytes encoded chunk by chunk. */
+  private void writeLongString(final String s) throws IOException {
+    final int byteLen = UnicodeUtil.calcUTF16toUTF8Length(s, 0, s.length());
+    writeVInt(byteLen);
+    final byte[] buf =
+        new byte[Math.min(byteLen, UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR * MAX_CHARS_PER_WINDOW)];
+    for (int i = 0, end = s.length(); i < end; ) {
+      // stay one char under MAX_CHARS_PER_WINDOW so the window can grow by one char when the next
+      // char is a low surrogate: a pair split across windows would no longer match byteLen
+      int step = Math.min(end - i, MAX_CHARS_PER_WINDOW - 1);
+      if (i + step < end && Character.isLowSurrogate(s.charAt(i + step))) {
+        step++;
      }
+      int upTo = UnicodeUtil.UTF16toUTF8(s, i, step, buf);
+      writeBytes(buf, 0, upTo);
+      i += step;
    }
-
-    bufferFlusher.accept(j);
-    utf8Len += j;
-
-    return utf8Len;
  }
 }