Improve UTF-based stream input/output handling

2025-03-09 14:34:43 +00:00 · 2010-08-15 21:19:36 +03:00 · 2010-08-15 21:19:36 +03:00 · 57ee1bdc55
commit 57ee1bdc55
parent 14237317fc
5 changed files with 297 additions and 34 deletions
--- a/modules/elasticsearch/src/main/java/org/elasticsearch/common/io/stream/BytesStreamInput.java
+++ b/modules/elasticsearch/src/main/java/org/elasticsearch/common/io/stream/BytesStreamInput.java
@ -19,10 +19,9 @@

 package org.elasticsearch.common.io.stream;

-import org.elasticsearch.common.Unicode;
-
 import java.io.EOFException;
 import java.io.IOException;
+import java.io.UTFDataFormatException;

 /**
 * @author kimchy (shay.banon)
@ -81,6 +80,9 @@ public class BytesStreamInput extends StreamInput {
    }

    @Override public void readBytes(byte[] b, int offset, int len) throws IOException {
+        if (len == 0) {
+            return;
+        }
        if (pos >= count) {
            throw new EOFException();
        }
@ -94,16 +96,6 @@ public class BytesStreamInput extends StreamInput {
        pos += len;
    }

-    @Override public String readUTF() throws IOException {
-        int length = readVInt();
-        if (pos + length > count) {
-            throw new EOFException();
-        }
-        String str = Unicode.fromBytes(buf, pos, length);
-        pos += length;
-        return str;
-    }
-
    @Override public void reset() throws IOException {
        pos = 0;
    }
@ -111,4 +103,82 @@ public class BytesStreamInput extends StreamInput {
    @Override public void close() throws IOException {
        // nothing to do here...
    }
+
+    /**
+     * Reads a string stored as a two-byte unsigned length followed by that many
+     * encoded bytes, decoding straight out of the backing array rather than
+     * copying the bytes into a scratch buffer first.
+     * <p>
+     * Bytes are decoded as 1-, 2- or 3-byte sequences, each producing one char.
+     * On success the read position is advanced past the encoded bytes.
+     *
+     * @return the decoded string, or the empty string when the length prefix is 0
+     * @throws EOFException           if fewer bytes remain than the length prefix announces
+     * @throws UTFDataFormatException if the encoded bytes are malformed
+     */
+    @Override public String readUTF() throws IOException {
+        int utflen = readUnsignedShort();
+        if (utflen == 0) {
+            return "";
+        }
+        int endPos = pos + utflen;
+        // Fail up front instead of decoding whatever lies past the valid region of buf.
+        if (endPos > count) {
+            throw new EOFException();
+        }
+        if (chararr.length < utflen) {
+            chararr = new char[utflen * 2];
+        }
+        char[] chararr = this.chararr;
+        byte[] bytearr = buf;
+
+        int c, char2, char3;
+        // Absolute index into buf; deliberately not called "count" so the field stays visible.
+        int index = pos;
+        int chararr_count = 0;
+
+        // Fast path: leading run of single-byte chars.
+        while (index < endPos) {
+            c = (int) bytearr[index] & 0xff;
+            if (c > 127) break;
+            index++;
+            chararr[chararr_count++] = (char) c;
+        }
+
+        // General path: dispatch on the high nibble of the lead byte.
+        while (index < endPos) {
+            c = (int) bytearr[index] & 0xff;
+            switch (c >> 4) {
+                case 0:
+                case 1:
+                case 2:
+                case 3:
+                case 4:
+                case 5:
+                case 6:
+                case 7:
+                    /* 0xxxxxxx*/
+                    index++;
+                    chararr[chararr_count++] = (char) c;
+                    break;
+                case 12:
+                case 13:
+                    /* 110x xxxx   10xx xxxx*/
+                    index += 2;
+                    if (index > endPos)
+                        throw new UTFDataFormatException(
+                                "malformed input: partial character at end");
+                    char2 = (int) bytearr[index - 1];
+                    if ((char2 & 0xC0) != 0x80)
+                        throw new UTFDataFormatException(
+                                "malformed input around byte " + index);
+                    chararr[chararr_count++] = (char) (((c & 0x1F) << 6) |
+                            (char2 & 0x3F));
+                    break;
+                case 14:
+                    /* 1110 xxxx  10xx xxxx  10xx xxxx */
+                    index += 3;
+                    if (index > endPos)
+                        throw new UTFDataFormatException(
+                                "malformed input: partial character at end");
+                    char2 = (int) bytearr[index - 2];
+                    char3 = (int) bytearr[index - 1];
+                    if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80))
+                        throw new UTFDataFormatException(
+                                "malformed input around byte " + (index - 1));
+                    chararr[chararr_count++] = (char) (((c & 0x0F) << 12) |
+                            ((char2 & 0x3F) << 6) |
+                            ((char3 & 0x3F) << 0));
+                    break;
+                default:
+                    /* 10xx xxxx,  1111 xxxx */
+                    throw new UTFDataFormatException(
+                            "malformed input around byte " + index);
+            }
+        }
+        pos += utflen;
+        // The number of chars produced may be less than utflen
+        return new String(chararr, 0, chararr_count);
+    }
 }
--- a/modules/elasticsearch/src/main/java/org/elasticsearch/common/io/stream/BytesStreamOutput.java
+++ b/modules/elasticsearch/src/main/java/org/elasticsearch/common/io/stream/BytesStreamOutput.java
@ -20,6 +20,7 @@
 package org.elasticsearch.common.io.stream;

 import java.io.IOException;
+import java.io.UTFDataFormatException;
 import java.util.Arrays;

 /**
@ -108,4 +109,63 @@ public class BytesStreamOutput extends StreamOutput {
    public int size() {
        return count;
    }
+
+
+    /**
+     * Writes a string as a two-byte length (high byte first) followed by the
+     * encoded bytes, appending directly to the internal buffer.
+     * <p>
+     * The length is the number of encoded bytes, not the number of chars. Each
+     * char is encoded on its own: 0x0001-0x007F takes one byte, anything above
+     * 0x07FF takes three, and everything else (including 0x0000) takes two.
+     *
+     * @param str the string to write
+     * @throws UTFDataFormatException if the encoded form is longer than 65535 bytes
+     */
+    // Override here since we can work on the byte array directly!
+    public void writeUTF(String str) throws IOException {
+        int strlen = str.length();
+        int utflen = 0;
+        int c = 0;
+
+        /* use charAt instead of copying String to char array; this first pass only measures the encoded length */
+        for (int i = 0; i < strlen; i++) {
+            c = str.charAt(i);
+            if ((c >= 0x0001) && (c <= 0x007F)) {
+                utflen++;
+            } else if (c > 0x07FF) {
+                utflen += 3;
+            } else {
+                utflen += 2;
+            }
+        }
+
+        if (utflen > 65535) // the length prefix is only two bytes wide
+            throw new UTFDataFormatException(
+                    "encoded string too long: " + utflen + " bytes");
+
+        int newcount = count + utflen + 2; // +2 for the length prefix
+        if (newcount > buf.length) {
+            buf = Arrays.copyOf(buf, Math.max(buf.length << 1, newcount)); // grow to the larger of double the current size and what this write needs
+        }
+
+        byte[] bytearr = this.buf;
+
+        bytearr[count++] = (byte) ((utflen >>> 8) & 0xFF);
+        bytearr[count++] = (byte) ((utflen >>> 0) & 0xFF);
+
+        int i = 0;
+        for (i = 0; i < strlen; i++) { // fast path: leading run of single-byte chars
+            c = str.charAt(i);
+            if (!((c >= 0x0001) && (c <= 0x007F))) break;
+            bytearr[count++] = (byte) c;
+        }
+
+        for (; i < strlen; i++) { // general path: continues from where the fast path stopped
+            c = str.charAt(i);
+            if ((c >= 0x0001) && (c <= 0x007F)) {
+                bytearr[count++] = (byte) c;
+
+            } else if (c > 0x07FF) {
+                bytearr[count++] = (byte) (0xE0 | ((c >> 12) & 0x0F));
+                bytearr[count++] = (byte) (0x80 | ((c >> 6) & 0x3F));
+                bytearr[count++] = (byte) (0x80 | ((c >> 0) & 0x3F));
+            } else {
+                bytearr[count++] = (byte) (0xC0 | ((c >> 6) & 0x1F));
+                bytearr[count++] = (byte) (0x80 | ((c >> 0) & 0x3F));
+            }
+        }
+    }
 }
--- a/modules/elasticsearch/src/main/java/org/elasticsearch/common/io/stream/StreamInput.java
+++ b/modules/elasticsearch/src/main/java/org/elasticsearch/common/io/stream/StreamInput.java
@ -19,18 +19,22 @@

 package org.elasticsearch.common.io.stream;

-import org.elasticsearch.common.Bytes;
-import org.elasticsearch.common.Unicode;
-
 import java.io.EOFException;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.UTFDataFormatException;

 /**
 * @author kimchy (shay.banon)
 */
 public abstract class StreamInput extends InputStream {

+    /**
+     * working arrays for readUTF; allocated up front with 80 elements and replaced by larger ones on demand
+     */
+    private byte bytearr[] = new byte[80];
+    protected char chararr[] = new char[80];
+
    /**
     * Reads and returns a single byte.
     */
@ -98,20 +102,96 @@ public abstract class StreamInput extends InputStream {
        return i;
    }

-    /**
-     * Reads a string.
-     */
-    public String readUTF() throws IOException {
-        int length = readVInt();
-        byte[] bytes = Bytes.cachedBytes.get().get();
-        if (bytes == null || length > bytes.length) {
-            bytes = new byte[(int) (length * 1.25)];
-            Bytes.cachedBytes.get().set(bytes);
-        }
-        readBytes(bytes, 0, length);
-        return Unicode.fromBytes(bytes, 0, length);
+    /**
+     * Reads two bytes and combines them into a single int, with the first byte
+     * as the more significant one.
+     *
+     * @throws EOFException if either read returns a negative value
+     */
+    protected final int readUnsignedShort() throws IOException {
+        final int high = read();
+        final int low = read();
+        // A negative result from either read makes the OR of the two negative.
+        if ((high | low) < 0) {
+            throw new EOFException();
+        }
+        return (high << 8) + low;
    }

+    // COPIED from DataInputStream: reads a two-byte length, pulls that many bytes into a scratch buffer, then decodes 1-, 2- and 3-byte sequences into chars
+
+    public String readUTF() throws IOException {
+        int utflen = readUnsignedShort(); // number of encoded bytes that follow
+        if (utflen == 0) {
+            return "";
+        }
+        if (bytearr.length < utflen) { // both scratch buffers are replaced together, at twice the needed size
+            bytearr = new byte[utflen * 2];
+            chararr = new char[utflen * 2];
+        }
+        char[] chararr = this.chararr;
+        byte[] bytearr = this.bytearr;
+
+        int c, char2, char3;
+        int count = 0;
+        int chararr_count = 0;
+
+        readBytes(bytearr, 0, utflen); // fetch the whole encoded string before decoding
+
+        while (count < utflen) { // fast path: leading run of single-byte chars
+            c = (int) bytearr[count] & 0xff;
+            if (c > 127) break;
+            count++;
+            chararr[chararr_count++] = (char) c;
+        }
+
+        while (count < utflen) { // general path: dispatch on the high nibble of the lead byte
+            c = (int) bytearr[count] & 0xff;
+            switch (c >> 4) {
+                case 0:
+                case 1:
+                case 2:
+                case 3:
+                case 4:
+                case 5:
+                case 6:
+                case 7:
+                    /* 0xxxxxxx*/
+                    count++;
+                    chararr[chararr_count++] = (char) c;
+                    break;
+                case 12:
+                case 13:
+                    /* 110x xxxx   10xx xxxx*/
+                    count += 2;
+                    if (count > utflen)
+                        throw new UTFDataFormatException(
+                                "malformed input: partial character at end");
+                    char2 = (int) bytearr[count - 1];
+                    if ((char2 & 0xC0) != 0x80) // continuation byte must look like 10xx xxxx
+                        throw new UTFDataFormatException(
+                                "malformed input around byte " + count);
+                    chararr[chararr_count++] = (char) (((c & 0x1F) << 6) |
+                            (char2 & 0x3F));
+                    break;
+                case 14:
+                    /* 1110 xxxx  10xx xxxx  10xx xxxx */
+                    count += 3;
+                    if (count > utflen)
+                        throw new UTFDataFormatException(
+                                "malformed input: partial character at end");
+                    char2 = (int) bytearr[count - 2];
+                    char3 = (int) bytearr[count - 1];
+                    if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80)) // both continuation bytes must look like 10xx xxxx
+                        throw new UTFDataFormatException(
+                                "malformed input around byte " + (count - 1));
+                    chararr[chararr_count++] = (char) (((c & 0x0F) << 12) |
+                            ((char2 & 0x3F) << 6) |
+                            ((char3 & 0x3F) << 0));
+                    break;
+                default:
+                    /* 10xx xxxx,  1111 xxxx */
+                    throw new UTFDataFormatException(
+                            "malformed input around byte " + count);
+            }
+        }
+        // The number of chars produced may be less than utflen
+        return new String(chararr, 0, chararr_count);
+    }
+
+
    public final float readFloat() throws IOException {
        return Float.intBitsToFloat(readInt());
    }
--- a/modules/elasticsearch/src/main/java/org/elasticsearch/common/io/stream/StreamOutput.java
+++ b/modules/elasticsearch/src/main/java/org/elasticsearch/common/io/stream/StreamOutput.java
@ -19,17 +19,20 @@

 package org.elasticsearch.common.io.stream;

-import org.apache.lucene.util.UnicodeUtil;
-import org.elasticsearch.common.Unicode;
-
 import java.io.IOException;
 import java.io.OutputStream;
+import java.io.UTFDataFormatException;

 /**
 * @author kimchy (shay.banon)
 */
 public abstract class StreamOutput extends OutputStream {

+    /**
+     * bytearr is initialized on demand by writeUTF
+     */
+    private byte[] bytearr = null;
+
    /**
     * Writes a single byte.
     */
@ -115,10 +118,57 @@ public abstract class StreamOutput extends OutputStream {
    /**
     * Writes a string.
     */
-    public void writeUTF(String s) throws IOException {
-        UnicodeUtil.UTF8Result utf8Result = Unicode.unsafeFromStringAsUtf8(s);
-        writeVInt(utf8Result.length);
-        writeBytes(utf8Result.result, 0, utf8Result.length);
+    public void writeUTF(String str) throws IOException {
+        int strlen = str.length();
+        int utflen = 0;
+        int c, count = 0;
+
+        /* use charAt instead of copying String to char array; this first pass only measures the encoded length */
+        for (int i = 0; i < strlen; i++) {
+            c = str.charAt(i);
+            if ((c >= 0x0001) && (c <= 0x007F)) {
+                utflen++;
+            } else if (c > 0x07FF) {
+                utflen += 3;
+            } else {
+                utflen += 2;
+            }
+        }
+
+        if (utflen > 65535) // the length prefix is only two bytes wide
+            throw new UTFDataFormatException(
+                    "encoded string too long: " + utflen + " bytes");
+
+        if (this.bytearr == null || (this.bytearr.length < (utflen + 2))) // +2 for the length prefix
+            this.bytearr = new byte[(utflen * 2) + 2];
+        byte[] bytearr = this.bytearr;
+
+        bytearr[count++] = (byte) ((utflen >>> 8) & 0xFF); // length prefix, high byte first
+        bytearr[count++] = (byte) ((utflen >>> 0) & 0xFF);
+
+        int i = 0;
+        for (i = 0; i < strlen; i++) { // fast path: leading run of single-byte chars
+            c = str.charAt(i);
+            if (!((c >= 0x0001) && (c <= 0x007F))) break;
+            bytearr[count++] = (byte) c;
+        }
+
+        for (; i < strlen; i++) { // general path: continues from where the fast path stopped
+            c = str.charAt(i);
+            if ((c >= 0x0001) && (c <= 0x007F)) {
+                bytearr[count++] = (byte) c;
+
+            } else if (c > 0x07FF) {
+                bytearr[count++] = (byte) (0xE0 | ((c >> 12) & 0x0F));
+                bytearr[count++] = (byte) (0x80 | ((c >> 6) & 0x3F));
+                bytearr[count++] = (byte) (0x80 | ((c >> 0) & 0x3F));
+            } else {
+                bytearr[count++] = (byte) (0xC0 | ((c >> 6) & 0x1F));
+                bytearr[count++] = (byte) (0x80 | ((c >> 0) & 0x3F));
+            }
+        }
+        writeBytes(bytearr, 0, utflen + 2); // length prefix and payload go out in a single call
+//        return utflen + 2;
    }

    public void writeFloat(float v) throws IOException {
--- a/modules/elasticsearch/src/main/java/org/elasticsearch/transport/netty/ChannelBufferStreamInput.java
+++ b/modules/elasticsearch/src/main/java/org/elasticsearch/transport/netty/ChannelBufferStreamInput.java
@ -74,6 +74,9 @@ public class ChannelBufferStreamInput extends StreamInput {

    @Override
    public int read(byte[] b, int off, int len) throws IOException {
+        if (len == 0) {
+            return 0;
+        }
        int available = available();
        if (available == 0) {
            return -1;