simplify encode for surrogates

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@996893 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2010-09-14 13:36:47 +00:00
parent 038bf7e72a
commit 350963de8d
1 changed files with 7 additions and 3 deletions

View File

@ -106,6 +106,10 @@ final public class UnicodeUtil {
private static final long HALF_SHIFT = 10;
private static final long HALF_MASK = 0x3FFL;
private static final int SURROGATE_OFFSET =
Character.MIN_SUPPLEMENTARY_CODE_POINT -
(UNI_SUR_HIGH_START << HALF_SHIFT) - UNI_SUR_LOW_START;
/**
* @lucene.internal
*/
@ -167,7 +171,7 @@ final public class UnicodeUtil {
int utf32 = (int) source[i];
// confirm valid low surrogate and write pair
if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF);
utf32 = (code << 10) + utf32 + SURROGATE_OFFSET;
i++;
hash = 31*hash + (out[upto++] = (byte)(0xF0 | (utf32 >> 18)));
hash = 31*hash + (out[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F)));
@ -223,7 +227,7 @@ final public class UnicodeUtil {
int utf32 = (int) source[i];
// confirm valid low surrogate and write pair
if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF);
utf32 = (code << 10) + utf32 + SURROGATE_OFFSET;
i++;
out[upto++] = (byte)(0xF0 | (utf32 >> 18));
out[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F));
@ -276,7 +280,7 @@ final public class UnicodeUtil {
int utf32 = (int) s.charAt(i+1);
// confirm valid low surrogate and write pair
if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF);
utf32 = (code << 10) + utf32 + SURROGATE_OFFSET;
i++;
out[upto++] = (byte)(0xF0 | (utf32 >> 18));
out[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F));