SOLR-7971: JavaBinCodec now uses a double pass approach to write strings larger than 64KB to avoid allocating buffer memory equal to string's UTF8 size

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1701115 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Shalin Shekhar Mangar 2015-09-03 19:42:38 +00:00
parent ae23ea91d9
commit 05a9c3bed8
3 changed files with 119 additions and 8 deletions

View File

@ -210,9 +210,10 @@ Optimizations
are more efficient especially when cluster has a mix of collections in stateFormat=1 are more efficient especially when cluster has a mix of collections in stateFormat=1
and stateFormat=2. (Scott Blum, shalin) and stateFormat=2. (Scott Blum, shalin)
* SOLR-7971: Reduce memory allocated by JavaBinCodec to encode large strings by an amount * SOLR-7971: Reduce memory allocated by JavaBinCodec to encode small strings by an amount
equal to the string.length(). equal to the string.length(). JavaBinCodec now uses a double pass approach to write strings
(yonik, Steve Rowe, shalin) larger than 64KB to avoid allocating buffer memory equal to string's UTF8 size.
(yonik, Steve Rowe, Mikhail Khludnev, Noble Paul, shalin)
* SOLR-7983: Utils.toUTF8 uses larger buffer than necessary for holding UTF8 data. (shalin) * SOLR-7983: Utils.toUTF8 uses larger buffer than necessary for holding UTF8 data. (shalin)

View File

@ -17,10 +17,16 @@
package org.apache.solr.common.util; package org.apache.solr.common.util;
import java.io.IOException;
import java.io.OutputStream;
import org.noggit.CharArr; import org.noggit.CharArr;
public class ByteUtils { public class ByteUtils {
/** Maximum number of UTF8 bytes per UTF16 character. */
public static final int MAX_UTF8_BYTES_PER_CHAR = 3;
/** Converts utf8 to utf16 and returns the number of 16 bit Java chars written. /** Converts utf8 to utf16 and returns the number of 16 bit Java chars written.
* Full characters are read, even if this reads past the length passed (and can result in * Full characters are read, even if this reads past the length passed (and can result in
* an ArrayOutOfBoundsException if invalid UTF8 is passed). Explicit checks for valid UTF8 are not performed. * an ArrayOutOfBoundsException if invalid UTF8 is passed). Explicit checks for valid UTF8 are not performed.
@ -121,6 +127,100 @@ public class ByteUtils {
return upto - resultOffset; return upto - resultOffset;
} }
/** Encodes a range of UTF16 chars as UTF8 and writes the result to the given OutputStream.
 * Bytes are staged in a buffer and flushed to the stream in chunks, so the memory needed
 * does not grow with the length of the input.
 *
 * <p>A valid surrogate pair is written as a single 4 byte sequence. An unpaired high surrogate,
 * or a low surrogate that does not follow a high surrogate, is replaced by the 3 byte
 * encoding of U+FFFD (EF BF BD).
 *
 * @param s the UTF16 source
 * @param offset index of the first char of {@code s} to encode
 * @param len number of chars to encode, starting at {@code offset}
 * @param fos the stream that receives the UTF8 bytes
 * @param scratch buffer used to stage bytes before they are written to {@code fos}; if it is
 *        {@code null} or shorter than 4 bytes a private buffer is used instead
 * @return the number of bytes written
 * @throws IOException if writing to {@code fos} fails
 */
public static int writeUTF16toUTF8(CharSequence s, int offset, int len, OutputStream fos, byte[] scratch) throws IOException {
  // One loop iteration can emit up to 4 bytes (a surrogate pair). A buffer shorter than that
  // would be overrun right after the flush below, so substitute a private one in that case.
  final byte[] buf = (scratch == null || scratch.length < 4) ? new byte[8192] : scratch;
  final int end = offset + len;
  int upto = 0, totalBytes = 0;
  for (int i = offset; i < end; i++) {
    final int code = s.charAt(i);
    if (upto > buf.length - 4) {
      // a code point may take up to 4 bytes and we may not have enough space left,
      // so flush what has been staged and start again at the front of the buffer
      totalBytes += upto;
      fos.write(buf, 0, upto);
      upto = 0;
    }
    if (code < 0x80) {
      buf[upto++] = (byte) code;
    } else if (code < 0x800) {
      buf[upto++] = (byte) (0xC0 | (code >> 6));
      buf[upto++] = (byte) (0x80 | (code & 0x3F));
    } else if (code < 0xD800 || code > 0xDFFF) {
      buf[upto++] = (byte) (0xE0 | (code >> 12));
      buf[upto++] = (byte) (0x80 | ((code >> 6) & 0x3F));
      buf[upto++] = (byte) (0x80 | (code & 0x3F));
    } else {
      // surrogate pair
      // confirm valid high surrogate that still has a successor inside the range
      if (code < 0xDC00 && (i < end - 1)) {
        int utf32 = s.charAt(i + 1);
        // confirm valid low surrogate and write pair
        if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
          // combine both halves into the supplementary code point
          utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF);
          i++; // the low surrogate has been consumed as well
          buf[upto++] = (byte) (0xF0 | (utf32 >> 18));
          buf[upto++] = (byte) (0x80 | ((utf32 >> 12) & 0x3F));
          buf[upto++] = (byte) (0x80 | ((utf32 >> 6) & 0x3F));
          buf[upto++] = (byte) (0x80 | (utf32 & 0x3F));
          continue;
        }
      }
      // replace unpaired surrogate or out-of-order low surrogate
      // with substitution character
      buf[upto++] = (byte) 0xEF;
      buf[upto++] = (byte) 0xBF;
      buf[upto++] = (byte) 0xBD;
    }
  }
  // write whatever is still staged after the last char
  totalBytes += upto;
  fos.write(buf, 0, upto);
  return totalBytes;
}
/**
 * Computes how many UTF8 bytes are required to encode a range of a UTF16 sequence,
 * without producing any output.
 *
 * <p>A high surrogate immediately followed (inside the range) by a low surrogate counts
 * as one 4 byte sequence; any other surrogate counts as 3 bytes.
 *
 * @param s the UTF16 source
 * @param offset index of the first char of {@code s} to measure
 * @param len number of chars to measure, starting at {@code offset}
 * @return the number of UTF8 bytes needed for the range
 */
public static int calcUTF16toUTF8Length(CharSequence s, int offset, int len) {
  final int limit = offset + len;
  int byteCount = 0;
  int pos = offset;
  while (pos < limit) {
    final char unit = s.charAt(pos++);
    if (unit < 0x80) {
      // single byte (ASCII)
      byteCount += 1;
    } else if (unit < 0x800) {
      byteCount += 2;
    } else if (Character.isHighSurrogate(unit)
        && pos < limit
        && Character.isLowSurrogate(s.charAt(pos))) {
      // well-formed surrogate pair: skip the low half, the pair takes 4 bytes
      pos++;
      byteCount += 4;
    } else {
      // ordinary three byte char, or a surrogate that is not part of a valid pair
      byteCount += 3;
    }
  }
  return byteCount;
}
} }

View File

@ -77,6 +77,8 @@ public class JavaBinCodec {
NAMED_LST = (byte) (6 << 5), // NamedList NAMED_LST = (byte) (6 << 5), // NamedList
EXTERN_STRING = (byte) (7 << 5); EXTERN_STRING = (byte) (7 << 5);
private static final int MAX_UTF8_SIZE_FOR_ARRAY_GROW_STRATEGY = 65536;
private static byte VERSION = 2; private static byte VERSION = 2;
private final ObjectResolver resolver; private final ObjectResolver resolver;
@ -614,12 +616,20 @@ public class JavaBinCodec {
return; return;
} }
int end = s.length(); int end = s.length();
int maxSize = end * 3; // 3 is enough, see SOLR-7971 int maxSize = end * ByteUtils.MAX_UTF8_BYTES_PER_CHAR;
if (maxSize <= MAX_UTF8_SIZE_FOR_ARRAY_GROW_STRATEGY) {
if (bytes == null || bytes.length < maxSize) bytes = new byte[maxSize]; if (bytes == null || bytes.length < maxSize) bytes = new byte[maxSize];
int sz = ByteUtils.UTF16toUTF8(s, 0, end, bytes, 0); int sz = ByteUtils.UTF16toUTF8(s, 0, end, bytes, 0);
writeTag(STR, sz); writeTag(STR, sz);
daos.write(bytes, 0, sz); daos.write(bytes, 0, sz);
} else {
// double pass logic for large strings, see SOLR-7971
int sz = ByteUtils.calcUTF16toUTF8Length(s, 0, end);
writeTag(STR, sz);
if (bytes == null || bytes.length < 8192) bytes = new byte[8192];
ByteUtils.writeUTF16toUTF8(s, 0, end, daos, bytes);
}
} }
byte[] bytes; byte[] bytes;