SOLR-2034: switch JavaBin over to standard UTF-8

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@990180 13f79535-47bb-0310-9956-ffa450edef68
2010-08-27 15:40:17 +00:00 · 2010-08-27 15:40:17 +00:00 · bbfbc428ca
parent 949e1d7150
commit bbfbc428ca
4 changed files with 123 additions and 68 deletions
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@ -39,6 +39,9 @@ Upgrading from Solr 1.4
  before the master.  If the master were to be updated first, the older
  searchers would not be able to read the new index format.

+* The Solr JavaBin format has changed as of Solr 3.1. If you are using the 
+  JavaBin format, you will need to upgrade your SolrJ client. (SOLR-2034)
+
 * The experimental ALIAS command has been removed (SOLR-1637)

 * Using solr.xml is recommended for single cores also (SOLR-1621)
@ -532,6 +535,11 @@ Other Changes

 * SOLR-1804: Google collections updated to Google Guava (which is a superset of collections and contains bug fixes) (gsingers)  

+* SOLR-2034: Switch to JavaBin codec version 2. Strings are now serialized
+  as the number of UTF-8 bytes, followed by the bytes in UTF-8. Previously
+  Strings were serialized as the number of UTF-16 chars, followed by the
+  bytes in Modified UTF-8. (hossman, yonik, rmuir)
+
 Build
 ----------------------

--- a/solr/src/common/org/apache/solr/common/util/JavaBinCodec.java
+++ b/solr/src/common/org/apache/solr/common/util/JavaBinCodec.java
@ -69,7 +69,7 @@ public class JavaBinCodec {
          EXTERN_STRING = (byte) (7 << 5);


-  private static byte VERSION = 1;
+  private static byte VERSION = 2;
  private ObjectResolver resolver;
  protected FastOutputStream daos;

@ -416,31 +416,90 @@ public class JavaBinCodec {
  }

  /**
-   * write the string as tag+length, with length being the number of UTF-16 characters, followed by the string encoded
-   * in modified-UTF8
+   * write the string as tag+length, with length being the number of UTF-8 bytes
   */
  public void writeStr(String s) throws IOException {
    if (s == null) {
      writeTag(NULL);
      return;
    }
-    // Can't use string serialization or toUTF()... it's limited to 64K
-    // plus it's bigger than it needs to be for small strings anyway
-    int len = s.length();
-    writeTag(STR, len);
-    writeChars(daos, s, 0, len);
+    int end = s.length();
+    int maxSize = end * 4;
+    if (bytes == null || bytes.length < maxSize) bytes = new byte[maxSize];
+    int upto = 0;
+    for(int i=0;i<end;i++) {
+      final int code = (int) s.charAt(i);
+
+      if (code < 0x80)
+        bytes[upto++] = (byte) code;
+      else if (code < 0x800) {
+        bytes[upto++] = (byte) (0xC0 | (code >> 6));
+        bytes[upto++] = (byte)(0x80 | (code & 0x3F));
+      } else if (code < 0xD800 || code > 0xDFFF) {
+        bytes[upto++] = (byte)(0xE0 | (code >> 12));
+        bytes[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F));
+        bytes[upto++] = (byte)(0x80 | (code & 0x3F));
+      } else {
+        // surrogate pair
+        // confirm valid high surrogate
+        if (code < 0xDC00 && (i < end-1)) {
+          int utf32 = (int) s.charAt(i+1);
+          // confirm valid low surrogate and write pair
+          if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) { 
+            utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF);
+            i++;
+            bytes[upto++] = (byte)(0xF0 | (utf32 >> 18));
+            bytes[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F));
+            bytes[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F));
+            bytes[upto++] = (byte)(0x80 | (utf32 & 0x3F));
+            continue;
+          }
+        }
+        // replace unpaired surrogate or out-of-order low surrogate
+        // with substitution character
+        bytes[upto++] = (byte) 0xEF;
+        bytes[upto++] = (byte) 0xBF;
+        bytes[upto++] = (byte) 0xBD;
+      }
+    }
+    writeTag(STR, upto);
+    daos.write(bytes, 0, upto);
  }

-
-  char[] charArr;
+  byte[] bytes;
+  char[] chars;

  public String readStr(FastInputStream dis) throws IOException {
    int sz = readSize(dis);
-    if (charArr == null || charArr.length < sz) {
-      charArr = new char[sz];
+    if (chars == null || chars.length < sz) chars = new char[sz];
+    if (bytes == null || bytes.length < sz) bytes = new byte[sz];
+    dis.readFully(bytes, 0, sz);
+    int outUpto=0;
+    for (int i = 0; i < sz;) {
+      final int b = bytes[i++]&0xff;
+      final int ch;
+      if (b < 0xc0) {
+        assert b < 0x80;
+        ch = b;
+      } else if (b < 0xe0) {
+        ch = ((b&0x1f)<<6) + (bytes[i++]&0x3f);
+      } else if (b < 0xf0) {
+        ch = ((b&0xf)<<12) + ((bytes[i++]&0x3f)<<6) + (bytes[i++]&0x3f);
+      } else {
+        assert b < 0xf8;
+        ch = ((b&0x7)<<18) + ((bytes[i++]&0x3f)<<12) + ((bytes[i++]&0x3f)<<6) + (bytes[i++]&0x3f);
+      }
+      if (ch <= 0xFFFF) {
+        // target is a character <= 0xFFFF
+        chars[outUpto++] = (char) ch;
+      } else {
+        // target is a character in range 0xFFFF - 0x10FFFF
+        final int chHalf = ch - 0x10000;
+        chars[outUpto++] = (char) ((chHalf >> 0xA) + 0xD800);
+        chars[outUpto++] = (char) ((chHalf & 0x3FF) + 0xDC00);
+      }
    }
-    readChars(dis, charArr, 0, sz);
-    return new String(charArr, 0, sz);
+    return new String(chars, 0, outUpto);
  }

  public void writeInt(int val) throws IOException {
@ -622,59 +681,6 @@ public class JavaBinCodec {
    return i;
  }

-  /**
-   * Writes a sequence of UTF-8 encoded characters from a string.
-   *
-   * @param s      the source of the characters
-   * @param start  the first character in the sequence
-   * @param length the number of characters in the sequence
-   *
-   * @see org.apache.lucene.store.IndexInput#readChars(char[],int,int)
-   */
-  public static void writeChars(FastOutputStream os, String s, int start, int length)
-          throws IOException {
-    final int end = start + length;
-    for (int i = start; i < end; i++) {
-      final int code = (int) s.charAt(i);
-      if (code >= 0x01 && code <= 0x7F)
-        os.write(code);
-      else if (((code >= 0x80) && (code <= 0x7FF)) || code == 0) {
-        os.write(0xC0 | (code >> 6));
-        os.write(0x80 | (code & 0x3F));
-      } else {
-        os.write(0xE0 | (code >>> 12));
-        os.write(0x80 | ((code >> 6) & 0x3F));
-        os.write(0x80 | (code & 0x3F));
-      }
-    }
-  }
-
-  /**
-   * Reads UTF-8 encoded characters into an array.
-   *
-   * @param buffer the array to read characters into
-   * @param start  the offset in the array to start storing characters
-   * @param length the number of characters to read
-   *
-   * @see org.apache.lucene.store.IndexOutput#writeChars(String,int,int)
-   */
-  public static void readChars(FastInputStream in, char[] buffer, int start, int length)
-          throws IOException {
-    final int end = start + length;
-    for (int i = start; i < end; i++) {
-      int b = in.read();
-      if ((b & 0x80) == 0)
-        buffer[i] = (char) b;
-      else if ((b & 0xE0) != 0xE0) {
-        buffer[i] = (char) (((b & 0x1F) << 6)
-                | (in.read() & 0x3F));
-      } else
-        buffer[i] = (char) (((b & 0x0F) << 12)
-                | ((in.read() & 0x3F) << 6)
-                | (in.read() & 0x3F));
-    }
-  }
-
  private int stringsCount = 0;
  private Map<String, Integer> stringsMap;
  private List<String> stringsList;
--- a/solr/src/solrj/org/apache/solr/client/solrj/impl/BinaryResponseParser.java
+++ b/solr/src/solrj/org/apache/solr/client/solrj/impl/BinaryResponseParser.java
@ -45,7 +45,7 @@ public class BinaryResponseParser extends ResponseParser {


  public String getVersion() {
-    return "1";
+    return "2";
  }

  public NamedList<Object> processResponse(Reader reader) {
--- a/solr/src/test/org/apache/solr/common/util/TestJavaBinCodec.java
+++ b/solr/src/test/org/apache/solr/common/util/TestJavaBinCodec.java
@ -0,0 +1,41 @@
+package org.apache.solr.common.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.util.Random;
+
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;
+
+public class TestJavaBinCodec extends LuceneTestCase {
+  
+ public void testStrings() throws Exception {
+    Random r = newRandom();
+    JavaBinCodec javabin = new JavaBinCodec();
+    for (int i = 0; i < 10000*RANDOM_MULTIPLIER; i++) {
+      String s = _TestUtil.randomUnicodeString(r);
+      ByteArrayOutputStream os = new ByteArrayOutputStream();
+      javabin.marshal(s, os);
+      ByteArrayInputStream is = new ByteArrayInputStream(os.toByteArray());
+      Object o = javabin.unmarshal(is);
+      assertEquals(s, o);
+    }
+  }
+}