mirror of https://github.com/apache/lucene.git
SOLR-2034: switch JavaBin over to standard UTF-8
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@990180 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
949e1d7150
commit
bbfbc428ca
|
@ -39,6 +39,9 @@ Upgrading from Solr 1.4
|
|||
before the master. If the master were to be updated first, the older
|
||||
searchers would not be able to read the new index format.
|
||||
|
||||
* The Solr JavaBin format has changed as of Solr 3.1. If you are using the
|
||||
JavaBin format, you will need to upgrade your SolrJ client. (SOLR-2034)
|
||||
|
||||
* The experimental ALIAS command has been removed (SOLR-1637)
|
||||
|
||||
* Using solr.xml is recommended for single cores also (SOLR-1621)
|
||||
|
@ -532,6 +535,11 @@ Other Changes
|
|||
|
||||
* SOLR-1804: Google collections updated to Google Guava (which is a superset of collections and contains bug fixes) (gsingers)
|
||||
|
||||
* SOLR-2034: Switch to JavaBin codec version 2. Strings are now serialized
|
||||
as the number of UTF-8 bytes, followed by the bytes in UTF-8. Previously
|
||||
Strings were serialized as the number of UTF-16 chars, followed by the
|
||||
bytes in Modified UTF-8. (hossman, yonik, rmuir)
|
||||
|
||||
Build
|
||||
----------------------
|
||||
|
||||
|
|
|
@ -69,7 +69,7 @@ public class JavaBinCodec {
|
|||
EXTERN_STRING = (byte) (7 << 5);
|
||||
|
||||
|
||||
private static byte VERSION = 1;
|
||||
private static byte VERSION = 2;
|
||||
private ObjectResolver resolver;
|
||||
protected FastOutputStream daos;
|
||||
|
||||
|
@ -416,31 +416,90 @@ public class JavaBinCodec {
|
|||
}
|
||||
|
||||
/**
|
||||
* write the string as tag+length, with length being the number of UTF-16 characters, followed by the string encoded
|
||||
* in modified-UTF8
|
||||
* write the string as tag+length, with length being the number of UTF-8 bytes
|
||||
*/
|
||||
public void writeStr(String s) throws IOException {
|
||||
if (s == null) {
|
||||
writeTag(NULL);
|
||||
return;
|
||||
}
|
||||
// Can't use string serialization or toUTF()... it's limited to 64K
|
||||
// plus it's bigger than it needs to be for small strings anyway
|
||||
int len = s.length();
|
||||
writeTag(STR, len);
|
||||
writeChars(daos, s, 0, len);
|
||||
int end = s.length();
|
||||
int maxSize = end * 4;
|
||||
if (bytes == null || bytes.length < maxSize) bytes = new byte[maxSize];
|
||||
int upto = 0;
|
||||
for(int i=0;i<end;i++) {
|
||||
final int code = (int) s.charAt(i);
|
||||
|
||||
if (code < 0x80)
|
||||
bytes[upto++] = (byte) code;
|
||||
else if (code < 0x800) {
|
||||
bytes[upto++] = (byte) (0xC0 | (code >> 6));
|
||||
bytes[upto++] = (byte)(0x80 | (code & 0x3F));
|
||||
} else if (code < 0xD800 || code > 0xDFFF) {
|
||||
bytes[upto++] = (byte)(0xE0 | (code >> 12));
|
||||
bytes[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F));
|
||||
bytes[upto++] = (byte)(0x80 | (code & 0x3F));
|
||||
} else {
|
||||
// surrogate pair
|
||||
// confirm valid high surrogate
|
||||
if (code < 0xDC00 && (i < end-1)) {
|
||||
int utf32 = (int) s.charAt(i+1);
|
||||
// confirm valid low surrogate and write pair
|
||||
if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
|
||||
utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF);
|
||||
i++;
|
||||
bytes[upto++] = (byte)(0xF0 | (utf32 >> 18));
|
||||
bytes[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F));
|
||||
bytes[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F));
|
||||
bytes[upto++] = (byte)(0x80 | (utf32 & 0x3F));
|
||||
continue;
|
||||
}
|
||||
}
|
||||
// replace unpaired surrogate or out-of-order low surrogate
|
||||
// with substitution character
|
||||
bytes[upto++] = (byte) 0xEF;
|
||||
bytes[upto++] = (byte) 0xBF;
|
||||
bytes[upto++] = (byte) 0xBD;
|
||||
}
|
||||
}
|
||||
writeTag(STR, upto);
|
||||
daos.write(bytes, 0, upto);
|
||||
}
|
||||
|
||||
|
||||
char[] charArr;
|
||||
byte[] bytes;
|
||||
char[] chars;
|
||||
|
||||
public String readStr(FastInputStream dis) throws IOException {
|
||||
int sz = readSize(dis);
|
||||
if (charArr == null || charArr.length < sz) {
|
||||
charArr = new char[sz];
|
||||
if (chars == null || chars.length < sz) chars = new char[sz];
|
||||
if (bytes == null || bytes.length < sz) bytes = new byte[sz];
|
||||
dis.readFully(bytes, 0, sz);
|
||||
int outUpto=0;
|
||||
for (int i = 0; i < sz;) {
|
||||
final int b = bytes[i++]&0xff;
|
||||
final int ch;
|
||||
if (b < 0xc0) {
|
||||
assert b < 0x80;
|
||||
ch = b;
|
||||
} else if (b < 0xe0) {
|
||||
ch = ((b&0x1f)<<6) + (bytes[i++]&0x3f);
|
||||
} else if (b < 0xf0) {
|
||||
ch = ((b&0xf)<<12) + ((bytes[i++]&0x3f)<<6) + (bytes[i++]&0x3f);
|
||||
} else {
|
||||
assert b < 0xf8;
|
||||
ch = ((b&0x7)<<18) + ((bytes[i++]&0x3f)<<12) + ((bytes[i++]&0x3f)<<6) + (bytes[i++]&0x3f);
|
||||
}
|
||||
if (ch <= 0xFFFF) {
|
||||
// target is a character <= 0xFFFF
|
||||
chars[outUpto++] = (char) ch;
|
||||
} else {
|
||||
// target is a character in range 0xFFFF - 0x10FFFF
|
||||
final int chHalf = ch - 0x10000;
|
||||
chars[outUpto++] = (char) ((chHalf >> 0xA) + 0xD800);
|
||||
chars[outUpto++] = (char) ((chHalf & 0x3FF) + 0xDC00);
|
||||
}
|
||||
}
|
||||
readChars(dis, charArr, 0, sz);
|
||||
return new String(charArr, 0, sz);
|
||||
return new String(chars, 0, outUpto);
|
||||
}
|
||||
|
||||
public void writeInt(int val) throws IOException {
|
||||
|
@ -622,59 +681,6 @@ public class JavaBinCodec {
|
|||
return i;
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes a sequence of UTF-8 encoded characters from a string.
|
||||
*
|
||||
* @param s the source of the characters
|
||||
* @param start the first character in the sequence
|
||||
* @param length the number of characters in the sequence
|
||||
*
|
||||
* @see org.apache.lucene.store.IndexInput#readChars(char[],int,int)
|
||||
*/
|
||||
public static void writeChars(FastOutputStream os, String s, int start, int length)
|
||||
throws IOException {
|
||||
final int end = start + length;
|
||||
for (int i = start; i < end; i++) {
|
||||
final int code = (int) s.charAt(i);
|
||||
if (code >= 0x01 && code <= 0x7F)
|
||||
os.write(code);
|
||||
else if (((code >= 0x80) && (code <= 0x7FF)) || code == 0) {
|
||||
os.write(0xC0 | (code >> 6));
|
||||
os.write(0x80 | (code & 0x3F));
|
||||
} else {
|
||||
os.write(0xE0 | (code >>> 12));
|
||||
os.write(0x80 | ((code >> 6) & 0x3F));
|
||||
os.write(0x80 | (code & 0x3F));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads UTF-8 encoded characters into an array.
|
||||
*
|
||||
* @param buffer the array to read characters into
|
||||
* @param start the offset in the array to start storing characters
|
||||
* @param length the number of characters to read
|
||||
*
|
||||
* @see org.apache.lucene.store.IndexOutput#writeChars(String,int,int)
|
||||
*/
|
||||
public static void readChars(FastInputStream in, char[] buffer, int start, int length)
|
||||
throws IOException {
|
||||
final int end = start + length;
|
||||
for (int i = start; i < end; i++) {
|
||||
int b = in.read();
|
||||
if ((b & 0x80) == 0)
|
||||
buffer[i] = (char) b;
|
||||
else if ((b & 0xE0) != 0xE0) {
|
||||
buffer[i] = (char) (((b & 0x1F) << 6)
|
||||
| (in.read() & 0x3F));
|
||||
} else
|
||||
buffer[i] = (char) (((b & 0x0F) << 12)
|
||||
| ((in.read() & 0x3F) << 6)
|
||||
| (in.read() & 0x3F));
|
||||
}
|
||||
}
|
||||
|
||||
private int stringsCount = 0;
|
||||
private Map<String, Integer> stringsMap;
|
||||
private List<String> stringsList;
|
||||
|
|
|
@ -45,7 +45,7 @@ public class BinaryResponseParser extends ResponseParser {
|
|||
|
||||
|
||||
public String getVersion() {
|
||||
return "1";
|
||||
return "2";
|
||||
}
|
||||
|
||||
public NamedList<Object> processResponse(Reader reader) {
|
||||
|
|
|
@ -0,0 +1,41 @@
|
|||
package org.apache.solr.common.util;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.util.Random;
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util._TestUtil;
|
||||
|
||||
public class TestJavaBinCodec extends LuceneTestCase {
|
||||
|
||||
public void testStrings() throws Exception {
|
||||
Random r = newRandom();
|
||||
JavaBinCodec javabin = new JavaBinCodec();
|
||||
for (int i = 0; i < 10000*RANDOM_MULTIPLIER; i++) {
|
||||
String s = _TestUtil.randomUnicodeString(r);
|
||||
ByteArrayOutputStream os = new ByteArrayOutputStream();
|
||||
javabin.marshal(s, os);
|
||||
ByteArrayInputStream is = new ByteArrayInputStream(os.toByteArray());
|
||||
Object o = javabin.unmarshal(is);
|
||||
assertEquals(s, o);
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue