mirror of https://github.com/apache/lucene.git
SOLR-7971: JavaBinCodec now uses a double pass approach to write strings larger than 64KB to avoid allocating buffer memory equal to string's UTF8 size
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1701115 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
ae23ea91d9
commit
05a9c3bed8
|
@ -210,9 +210,10 @@ Optimizations
|
||||||
are more efficient especially when cluster has a mix of collections in stateFormat=1
|
are more efficient especially when cluster has a mix of collections in stateFormat=1
|
||||||
and stateFormat=2. (Scott Blum, shalin)
|
and stateFormat=2. (Scott Blum, shalin)
|
||||||
|
|
||||||
* SOLR-7971: Reduce memory allocated by JavaBinCodec to encode large strings by an amount
|
* SOLR-7971: Reduce memory allocated by JavaBinCodec to encode small strings by an amount
|
||||||
equal to the string.length().
|
equal to the string.length(). JavaBinCodec now uses a double pass approach to write strings
|
||||||
(yonik, Steve Rowe, shalin)
|
larger than 64KB to avoid allocating buffer memory equal to string's UTF8 size.
|
||||||
|
(yonik, Steve Rowe, Mikhail Khludnev, Noble Paul, shalin)
|
||||||
|
|
||||||
* SOLR-7983: Utils.toUTF8 uses larger buffer than necessary for holding UTF8 data. (shalin)
|
* SOLR-7983: Utils.toUTF8 uses larger buffer than necessary for holding UTF8 data. (shalin)
|
||||||
|
|
||||||
|
|
|
@ -17,10 +17,16 @@
|
||||||
|
|
||||||
package org.apache.solr.common.util;
|
package org.apache.solr.common.util;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.OutputStream;
|
||||||
|
|
||||||
import org.noggit.CharArr;
|
import org.noggit.CharArr;
|
||||||
|
|
||||||
public class ByteUtils {
|
public class ByteUtils {
|
||||||
|
|
||||||
|
/** Maximum number of UTF8 bytes per UTF16 character. */
|
||||||
|
public static final int MAX_UTF8_BYTES_PER_CHAR = 3;
|
||||||
|
|
||||||
/** Converts utf8 to utf16 and returns the number of 16 bit Java chars written.
|
/** Converts utf8 to utf16 and returns the number of 16 bit Java chars written.
|
||||||
* Full characters are read, even if this reads past the length passed (and can result in
|
* Full characters are read, even if this reads past the length passed (and can result in
|
||||||
* an ArrayOutOfBoundsException if invalid UTF8 is passed). Explicit checks for valid UTF8 are not performed.
|
* an ArrayOutOfBoundsException if invalid UTF8 is passed). Explicit checks for valid UTF8 are not performed.
|
||||||
|
@ -121,6 +127,100 @@ public class ByteUtils {
|
||||||
return upto - resultOffset;
|
return upto - resultOffset;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Writes UTF8 into the given OutputStream by first writing to the given scratch array
|
||||||
|
* and then writing the contents of the scratch array to the OutputStream. The given scratch byte array
|
||||||
|
* is used to buffer intermediate data before it is written to the byte buffer.
|
||||||
|
*
|
||||||
|
* @return the number of bytes written
|
||||||
|
*/
|
||||||
|
public static int writeUTF16toUTF8(CharSequence s, int offset, int len, OutputStream fos, byte[] scratch) throws IOException {
|
||||||
|
final int end = offset + len;
|
||||||
|
|
||||||
|
int upto = 0, totalBytes = 0;
|
||||||
|
for(int i=offset;i<end;i++) {
|
||||||
|
final int code = (int) s.charAt(i);
|
||||||
|
|
||||||
|
if (upto > scratch.length - 4) {
|
||||||
|
// a code point may take upto 4 bytes and we don't have enough space, so reset
|
||||||
|
totalBytes += upto;
|
||||||
|
fos.write(scratch, 0, upto);
|
||||||
|
upto = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (code < 0x80)
|
||||||
|
scratch[upto++] = (byte) code;
|
||||||
|
else if (code < 0x800) {
|
||||||
|
scratch[upto++] = (byte) (0xC0 | (code >> 6));
|
||||||
|
scratch[upto++] = (byte)(0x80 | (code & 0x3F));
|
||||||
|
} else if (code < 0xD800 || code > 0xDFFF) {
|
||||||
|
scratch[upto++] = (byte)(0xE0 | (code >> 12));
|
||||||
|
scratch[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F));
|
||||||
|
scratch[upto++] = (byte)(0x80 | (code & 0x3F));
|
||||||
|
} else {
|
||||||
|
// surrogate pair
|
||||||
|
// confirm valid high surrogate
|
||||||
|
if (code < 0xDC00 && (i < end-1)) {
|
||||||
|
int utf32 = (int) s.charAt(i+1);
|
||||||
|
// confirm valid low surrogate and write pair
|
||||||
|
if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
|
||||||
|
utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF);
|
||||||
|
i++;
|
||||||
|
scratch[upto++] = (byte)(0xF0 | (utf32 >> 18));
|
||||||
|
scratch[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F));
|
||||||
|
scratch[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F));
|
||||||
|
scratch[upto++] = (byte)(0x80 | (utf32 & 0x3F));
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// replace unpaired surrogate or out-of-order low surrogate
|
||||||
|
// with substitution character
|
||||||
|
scratch[upto++] = (byte) 0xEF;
|
||||||
|
scratch[upto++] = (byte) 0xBF;
|
||||||
|
scratch[upto++] = (byte) 0xBD;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
totalBytes += upto;
|
||||||
|
fos.write(scratch, 0, upto);
|
||||||
|
|
||||||
|
return totalBytes;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Calculates the number of UTF8 bytes necessary to write a UTF16 string.
|
||||||
|
*
|
||||||
|
* @return the number of bytes written
|
||||||
|
*/
|
||||||
|
public static int calcUTF16toUTF8Length(CharSequence s, int offset, int len) {
|
||||||
|
final int end = offset + len;
|
||||||
|
|
||||||
|
int res = 0;
|
||||||
|
for (int i = offset; i < end; i++) {
|
||||||
|
final int code = (int) s.charAt(i);
|
||||||
|
|
||||||
|
if (code < 0x80)
|
||||||
|
res++;
|
||||||
|
else if (code < 0x800) {
|
||||||
|
res += 2;
|
||||||
|
} else if (code < 0xD800 || code > 0xDFFF) {
|
||||||
|
res += 3;
|
||||||
|
} else {
|
||||||
|
// surrogate pair
|
||||||
|
// confirm valid high surrogate
|
||||||
|
if (code < 0xDC00 && (i < end - 1)) {
|
||||||
|
int utf32 = (int) s.charAt(i + 1);
|
||||||
|
// confirm valid low surrogate and write pair
|
||||||
|
if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
|
||||||
|
i++;
|
||||||
|
res += 4;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
res += 3;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -77,6 +77,8 @@ public class JavaBinCodec {
|
||||||
NAMED_LST = (byte) (6 << 5), // NamedList
|
NAMED_LST = (byte) (6 << 5), // NamedList
|
||||||
EXTERN_STRING = (byte) (7 << 5);
|
EXTERN_STRING = (byte) (7 << 5);
|
||||||
|
|
||||||
|
private static final int MAX_UTF8_SIZE_FOR_ARRAY_GROW_STRATEGY = 65536;
|
||||||
|
|
||||||
|
|
||||||
private static byte VERSION = 2;
|
private static byte VERSION = 2;
|
||||||
private final ObjectResolver resolver;
|
private final ObjectResolver resolver;
|
||||||
|
@ -614,12 +616,20 @@ public class JavaBinCodec {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
int end = s.length();
|
int end = s.length();
|
||||||
int maxSize = end * 3; // 3 is enough, see SOLR-7971
|
int maxSize = end * ByteUtils.MAX_UTF8_BYTES_PER_CHAR;
|
||||||
if (bytes == null || bytes.length < maxSize) bytes = new byte[maxSize];
|
|
||||||
int sz = ByteUtils.UTF16toUTF8(s, 0, end, bytes, 0);
|
|
||||||
|
|
||||||
writeTag(STR, sz);
|
if (maxSize <= MAX_UTF8_SIZE_FOR_ARRAY_GROW_STRATEGY) {
|
||||||
daos.write(bytes, 0, sz);
|
if (bytes == null || bytes.length < maxSize) bytes = new byte[maxSize];
|
||||||
|
int sz = ByteUtils.UTF16toUTF8(s, 0, end, bytes, 0);
|
||||||
|
writeTag(STR, sz);
|
||||||
|
daos.write(bytes, 0, sz);
|
||||||
|
} else {
|
||||||
|
// double pass logic for large strings, see SOLR-7971
|
||||||
|
int sz = ByteUtils.calcUTF16toUTF8Length(s, 0, end);
|
||||||
|
writeTag(STR, sz);
|
||||||
|
if (bytes == null || bytes.length < 8192) bytes = new byte[8192];
|
||||||
|
ByteUtils.writeUTF16toUTF8(s, 0, end, daos, bytes);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
byte[] bytes;
|
byte[] bytes;
|
||||||
|
|
Loading…
Reference in New Issue