Improve (stream) serialization of UTF strings. Note: this requires a flush when upgrading. Closes #1595.

2025-03-27 18:38:41 +00:00 · 2012-01-08 15:26:33 +02:00 · 2012-01-08 15:26:33 +02:00 · 7966716673
commit 7966716673
parent ef9c96faa6
5 changed files with 29 additions and 254 deletions
--- a/src/main/java/org/elasticsearch/common/io/stream/BytesStreamInput.java
+++ b/src/main/java/org/elasticsearch/common/io/stream/BytesStreamInput.java
@ -23,7 +23,6 @@ import org.elasticsearch.common.BytesHolder;

 import java.io.EOFException;
 import java.io.IOException;
-import java.io.UTFDataFormatException;

 /**
 *
@ -135,82 +134,4 @@ public class BytesStreamInput extends StreamInput {
    public void close() throws IOException {
        // nothing to do here...
    }
-
-    public String readUTF() throws IOException {
-        int utflen = readInt();
-        if (utflen == 0) {
-            return "";
-        }
-        if (chararr.length < utflen) {
-            chararr = new char[utflen * 2];
-        }
-        char[] chararr = this.chararr;
-        byte[] bytearr = buf;
-        int endPos = pos + utflen;
-
-        int c, char2, char3;
-        int count = pos;
-        int chararr_count = 0;
-
-        while (count < endPos) {
-            c = (int) bytearr[count] & 0xff;
-            if (c > 127) break;
-            count++;
-            chararr[chararr_count++] = (char) c;
-        }
-
-        while (count < endPos) {
-            c = (int) bytearr[count] & 0xff;
-            switch (c >> 4) {
-                case 0:
-                case 1:
-                case 2:
-                case 3:
-                case 4:
-                case 5:
-                case 6:
-                case 7:
-                    /* 0xxxxxxx*/
-                    count++;
-                    chararr[chararr_count++] = (char) c;
-                    break;
-                case 12:
-                case 13:
-                    /* 110x xxxx   10xx xxxx*/
-                    count += 2;
-                    if (count > endPos)
-                        throw new UTFDataFormatException(
-                                "malformed input: partial character at end");
-                    char2 = (int) bytearr[count - 1];
-                    if ((char2 & 0xC0) != 0x80)
-                        throw new UTFDataFormatException(
-                                "malformed input around byte " + count);
-                    chararr[chararr_count++] = (char) (((c & 0x1F) << 6) |
-                            (char2 & 0x3F));
-                    break;
-                case 14:
-                    /* 1110 xxxx  10xx xxxx  10xx xxxx */
-                    count += 3;
-                    if (count > endPos)
-                        throw new UTFDataFormatException(
-                                "malformed input: partial character at end");
-                    char2 = (int) bytearr[count - 2];
-                    char3 = (int) bytearr[count - 1];
-                    if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80))
-                        throw new UTFDataFormatException(
-                                "malformed input around byte " + (count - 1));
-                    chararr[chararr_count++] = (char) (((c & 0x0F) << 12) |
-                            ((char2 & 0x3F) << 6) |
-                            ((char3 & 0x3F) << 0));
-                    break;
-                default:
-                    /* 10xx xxxx,  1111 xxxx */
-                    throw new UTFDataFormatException(
-                            "malformed input around byte " + count);
-            }
-        }
-        pos += utflen;
-        // The number of chars produced may be less than utflen
-        return new String(chararr, 0, chararr_count);
-    }
 }
--- a/src/main/java/org/elasticsearch/common/io/stream/BytesStreamOutput.java
+++ b/src/main/java/org/elasticsearch/common/io/stream/BytesStreamOutput.java
@ -118,62 +118,4 @@ public class BytesStreamOutput extends StreamOutput implements BytesStream {
    public int size() {
        return count;
    }
-
-
-    /**
-     * Writes a string.
-     */
-    // Override here since we can work on the byte array directly!
-    public void writeUTF(String str) throws IOException {
-        int strlen = str.length();
-        int utflen = 0;
-        int c = 0;
-
-        /* use charAt instead of copying String to char array */
-        for (int i = 0; i < strlen; i++) {
-            c = str.charAt(i);
-            if ((c >= 0x0001) && (c <= 0x007F)) {
-                utflen++;
-            } else if (c > 0x07FF) {
-                utflen += 3;
-            } else {
-                utflen += 2;
-            }
-        }
-
-        int newcount = count + utflen + 4;
-        if (newcount > buf.length) {
-            buf = Arrays.copyOf(buf, Math.max(buf.length << 1, newcount));
-        }
-
-        byte[] bytearr = this.buf;
-
-        // same as writeInt
-        bytearr[count++] = (byte) (utflen >> 24);
-        bytearr[count++] = (byte) (utflen >> 16);
-        bytearr[count++] = (byte) (utflen >> 8);
-        bytearr[count++] = (byte) (utflen);
-
-        int i = 0;
-        for (i = 0; i < strlen; i++) {
-            c = str.charAt(i);
-            if (!((c >= 0x0001) && (c <= 0x007F))) break;
-            bytearr[count++] = (byte) c;
-        }
-
-        for (; i < strlen; i++) {
-            c = str.charAt(i);
-            if ((c >= 0x0001) && (c <= 0x007F)) {
-                bytearr[count++] = (byte) c;
-
-            } else if (c > 0x07FF) {
-                bytearr[count++] = (byte) (0xE0 | ((c >> 12) & 0x0F));
-                bytearr[count++] = (byte) (0x80 | ((c >> 6) & 0x3F));
-                bytearr[count++] = (byte) (0x80 | ((c >> 0) & 0x3F));
-            } else {
-                bytearr[count++] = (byte) (0xC0 | ((c >> 6) & 0x1F));
-                bytearr[count++] = (byte) (0x80 | ((c >> 0) & 0x3F));
-            }
-        }
-    }
 }
--- a/src/main/java/org/elasticsearch/common/io/stream/CachedStreamInput.java
+++ b/src/main/java/org/elasticsearch/common/io/stream/CachedStreamInput.java
@ -28,6 +28,7 @@ import java.lang.ref.SoftReference;
 public class CachedStreamInput {

    static class Entry {
+        char[] chars = new char[80];
        final HandlesStreamInput handles;
        final LZFStreamInput lzf;

@ -73,4 +74,12 @@ public class CachedStreamInput {
        entry.handles.reset(entry.lzf);
        return entry.handles;
    }
+
+    public static char[] getCharArray(int size) {
+        Entry entry = instance();
+        if (entry.chars.length < size) {
+            entry.chars = new char[size];
+        }
+        return entry.chars;
+    }
 }
--- a/src/main/java/org/elasticsearch/common/io/stream/StreamInput.java
+++ b/src/main/java/org/elasticsearch/common/io/stream/StreamInput.java
@ -24,7 +24,6 @@ import org.elasticsearch.common.Nullable;

 import java.io.IOException;
 import java.io.InputStream;
-import java.io.UTFDataFormatException;
 import java.util.*;

 /**
@ -32,12 +31,6 @@ import java.util.*;
 */
 public abstract class StreamInput extends InputStream {

-    /**
-     * working arrays initialized on demand by readUTF
-     */
-    private byte bytearr[] = new byte[80];
-    protected char chararr[] = new char[80];
-
    /**
     * Reads and returns a single byte.
     */
@ -154,35 +147,12 @@ public abstract class StreamInput extends InputStream {
        return i | ((b & 0x7FL) << 56);
    }

-    // COPIED from DataInputStream
-
    public String readUTF() throws IOException {
-        int utflen = readInt();
-        if (utflen == 0) {
-            return "";
-        }
-        if (bytearr.length < utflen) {
-            bytearr = new byte[utflen * 2];
-            chararr = new char[utflen * 2];
-        }
-        char[] chararr = this.chararr;
-        byte[] bytearr = this.bytearr;
-
-        int c, char2, char3;
-        int count = 0;
-        int chararr_count = 0;
-
-        readBytes(bytearr, 0, utflen);
-
-        while (count < utflen) {
-            c = (int) bytearr[count] & 0xff;
-            if (c > 127) break;
-            count++;
-            chararr[chararr_count++] = (char) c;
-        }
-
-        while (count < utflen) {
-            c = (int) bytearr[count] & 0xff;
+        int charCount = readVInt();
+        char[] chars = CachedStreamInput.getCharArray(charCount);
+        int c, charIndex = 0;
+        while (charIndex < charCount) {
+            c = readByte() & 0xff;
            switch (c >> 4) {
                case 0:
                case 1:
@ -192,47 +162,18 @@ public abstract class StreamInput extends InputStream {
                case 5:
                case 6:
                case 7:
-                    /* 0xxxxxxx*/
-                    count++;
-                    chararr[chararr_count++] = (char) c;
+                    chars[charIndex++] = (char) c;
                    break;
                case 12:
                case 13:
-                    /* 110x xxxx   10xx xxxx*/
-                    count += 2;
-                    if (count > utflen)
-                        throw new UTFDataFormatException(
-                                "malformed input: partial character at end");
-                    char2 = (int) bytearr[count - 1];
-                    if ((char2 & 0xC0) != 0x80)
-                        throw new UTFDataFormatException(
-                                "malformed input around byte " + count);
-                    chararr[chararr_count++] = (char) (((c & 0x1F) << 6) |
-                            (char2 & 0x3F));
+                    chars[charIndex++] = (char) ((c & 0x1F) << 6 | readByte() & 0x3F);
                    break;
                case 14:
-                    /* 1110 xxxx  10xx xxxx  10xx xxxx */
-                    count += 3;
-                    if (count > utflen)
-                        throw new UTFDataFormatException(
-                                "malformed input: partial character at end");
-                    char2 = (int) bytearr[count - 2];
-                    char3 = (int) bytearr[count - 1];
-                    if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80))
-                        throw new UTFDataFormatException(
-                                "malformed input around byte " + (count - 1));
-                    chararr[chararr_count++] = (char) (((c & 0x0F) << 12) |
-                            ((char2 & 0x3F) << 6) |
-                            ((char3 & 0x3F) << 0));
+                    chars[charIndex++] = (char) ((c & 0x0F) << 12 | (readByte() & 0x3F) << 6 | (readByte() & 0x3F) << 0);
                    break;
-                default:
-                    /* 10xx xxxx,  1111 xxxx */
-                    throw new UTFDataFormatException(
-                            "malformed input around byte " + count);
            }
        }
-        // The number of chars produced may be less than utflen
-        return new String(chararr, 0, chararr_count);
+        return new String(chars, 0, charCount);
    }


--- a/src/main/java/org/elasticsearch/common/io/stream/StreamOutput.java
+++ b/src/main/java/org/elasticsearch/common/io/stream/StreamOutput.java
@ -33,11 +33,6 @@ import java.util.Map;
 */
 public abstract class StreamOutput extends OutputStream {

-    /**
-     * bytearr is initialized on demand by writeUTF
-     */
-    private byte[] bytearr = null;
-
    /**
     * Writes a single byte.
     */
@ -138,55 +133,22 @@ public abstract class StreamOutput extends OutputStream {
     * Writes a string.
     */
    public void writeUTF(String str) throws IOException {
-        int strlen = str.length();
-        int utflen = 0;
-        int c, count = 0;
-
-        /* use charAt instead of copying String to char array */
-        for (int i = 0; i < strlen; i++) {
+        int charCount = str.length();
+        writeVInt(charCount);
+        int c;
+        for (int i = 0; i < charCount; i++) {
            c = str.charAt(i);
-            if ((c >= 0x0001) && (c <= 0x007F)) {
-                utflen++;
+            if (c <= 0x007F) {
+                writeByte((byte) c);
            } else if (c > 0x07FF) {
-                utflen += 3;
+                writeByte((byte) (0xE0 | c >> 12 & 0x0F));
+                writeByte((byte) (0x80 | c >> 6 & 0x3F));
+                writeByte((byte) (0x80 | c >> 0 & 0x3F));
            } else {
-                utflen += 2;
+                writeByte((byte) (0xC0 | c >> 6 & 0x1F));
+                writeByte((byte) (0x80 | c >> 0 & 0x3F));
            }
        }
-
-        if (this.bytearr == null || (this.bytearr.length < (utflen + 4)))
-            this.bytearr = new byte[(utflen * 2) + 4];
-        byte[] bytearr = this.bytearr;
-
-        // same as writeInt
-        bytearr[count++] = (byte) (utflen >> 24);
-        bytearr[count++] = (byte) (utflen >> 16);
-        bytearr[count++] = (byte) (utflen >> 8);
-        bytearr[count++] = (byte) (utflen);
-
-        int i = 0;
-        for (i = 0; i < strlen; i++) {
-            c = str.charAt(i);
-            if (!((c >= 0x0001) && (c <= 0x007F))) break;
-            bytearr[count++] = (byte) c;
-        }
-
-        for (; i < strlen; i++) {
-            c = str.charAt(i);
-            if ((c >= 0x0001) && (c <= 0x007F)) {
-                bytearr[count++] = (byte) c;
-
-            } else if (c > 0x07FF) {
-                bytearr[count++] = (byte) (0xE0 | ((c >> 12) & 0x0F));
-                bytearr[count++] = (byte) (0x80 | ((c >> 6) & 0x3F));
-                bytearr[count++] = (byte) (0x80 | ((c >> 0) & 0x3F));
-            } else {
-                bytearr[count++] = (byte) (0xC0 | ((c >> 6) & 0x1F));
-                bytearr[count++] = (byte) (0x80 | ((c >> 0) & 0x3F));
-            }
-        }
-        writeBytes(bytearr, 0, utflen + 4);
-//        return utflen + 2;
    }

    public void writeFloat(float v) throws IOException {