LUCENE-6779: Reduce memory allocated by CompressingStoredFieldsWriter to write strings larger than 64kb by an amount equal to the string's UTF-8 size

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1703219 13f79535-47bb-0310-9956-ffa450edef68
Shalin Shekhar Mangar 2015-09-15 15:00:31 +00:00
parent fdefc995f9
commit e91c414ba0
7 changed files with 173 additions and 15 deletions

View File

@@ -114,6 +114,10 @@ Optimizations
   GeoPointTermsEnum to reduce GC pressure (Nick Knize via Mike
   McCandless)
 
+* LUCENE-6779: Reduce memory allocated by CompressingStoredFieldsWriter to write
+  strings larger than 64kb by an amount equal to the string's UTF-8 size.
+  (Dawid Weiss, Robert Muir, shalin)
+
Bug Fixes
* LUCENE-6730: Hyper-parameter c is ignored in term frequency NormalizationH1.
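
For context on the LUCENE-6779 entry above: UTF-8 needs at most UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR (3) bytes per UTF-16 code unit, so the old single-pass path grew a scratch buffer to string.length() * 3 before encoding, roughly three times the final size for ASCII-heavy text. Counting the exact UTF-8 length first makes that worst-case scratch buffer unnecessary. A minimal JDK-only sketch of such a counting pass (illustrative only; the committed code uses the new UnicodeUtil.calcUTF16toUTF8Length shown further down):

    // Counts UTF-8 bytes without allocating anything; this is what the
    // first pass of a double-pass encoder buys you.
    static int utf8Length(String s) {
      int bytes = 0;
      for (int i = 0; i < s.length(); ) {
        int cp = s.codePointAt(i);         // decodes surrogate pairs
        if (cp < 0x80) bytes += 1;         // ASCII
        else if (cp < 0x800) bytes += 2;   // up to U+07FF
        else if (cp < 0x10000) bytes += 3; // rest of the BMP (and unpaired surrogates)
        else bytes += 4;                   // supplementary planes
        i += Character.charCount(cp);      // advances 1 or 2 UTF-16 code units
      }
      return bytes;
    }

With the exact count known up front, the second pass can encode straight into the destination buffer, which is what the new GrowableByteArrayDataOutput.writeString below does once a string's worst-case size exceeds 64KB.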

View File

@@ -40,9 +40,7 @@ import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BitUtil;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.GrowableByteArrayDataOutput;
 import org.apache.lucene.util.IOUtils;
-import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.util.packed.PackedInts;
 
 /**
@@ -245,8 +243,6 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
     numChunks++;
   }
 
-  byte scratchBytes[] = new byte[16];
-
   @Override
   public void writeField(FieldInfo info, StorableField field)
       throws IOException {
@@ -293,11 +289,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
       bufferedDocs.writeVInt(bytes.length);
       bufferedDocs.writeBytes(bytes.bytes, bytes.offset, bytes.length);
     } else if (string != null) {
-      // this is just an optimized writeString() that re-uses scratchBytes.
-      scratchBytes = ArrayUtil.grow(scratchBytes, string.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR);
-      int length = UnicodeUtil.UTF16toUTF8(string, 0, string.length(), scratchBytes);
-      bufferedDocs.writeVInt(length);
-      bufferedDocs.writeBytes(scratchBytes, length);
+      bufferedDocs.writeString(string);
     } else {
       if (number instanceof Byte || number instanceof Short || number instanceof Integer) {
         bufferedDocs.writeZInt(number.intValue());

View File

@@ -35,8 +35,6 @@ import org.apache.lucene.index.Fields;
 import org.apache.lucene.index.IndexFileNames;
 import org.apache.lucene.index.MergeState;
 import org.apache.lucene.index.SegmentInfo;
-import org.apache.lucene.store.BufferedChecksumIndexInput;
-import org.apache.lucene.store.ChecksumIndexInput;
 import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
@@ -45,7 +43,6 @@ import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.GrowableByteArrayDataOutput;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.StringHelper;
 import org.apache.lucene.util.packed.BlockPackedWriter;

View File

@@ -1,4 +1,4 @@
-package org.apache.lucene.util;
+package org.apache.lucene.codecs.compressing;
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -17,7 +17,11 @@ package org.apache.lucene.util;
  * limitations under the License.
  */
 
+import java.io.IOException;
+
 import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.UnicodeUtil;
/**
* A {@link DataOutput} that can be used to build a byte[].
@@ -25,11 +29,17 @@ import org.apache.lucene.store.DataOutput;
  */
 public final class GrowableByteArrayDataOutput extends DataOutput {
 
+  /** Minimum UTF-8 byte size of a string above which a double pass over the string is used, to save memory during encoding */
+  static final int MIN_UTF8_SIZE_TO_ENABLE_DOUBLE_PASS_ENCODING = 65536;
+
   /** The bytes */
   public byte[] bytes;
 
   /** The length */
   public int length;
 
+  // scratch for utf8 encoding of small strings
+  byte[] scratchBytes = new byte[16];
+
   /** Create a {@link GrowableByteArrayDataOutput} with the given initial capacity. */
   public GrowableByteArrayDataOutput(int cp) {
     this.bytes = new byte[ArrayUtil.oversize(cp, 1)];
@@ -52,4 +62,22 @@ public final class GrowableByteArrayDataOutput extends DataOutput {
     length = newLength;
   }
 
+  @Override
+  public void writeString(String string) throws IOException {
+    int maxLen = string.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR;
+    if (maxLen <= MIN_UTF8_SIZE_TO_ENABLE_DOUBLE_PASS_ENCODING) {
+      // string is small enough that we don't need to save memory by falling back to the double-pass approach;
+      // this is just an optimized writeString() that re-uses scratchBytes.
+      scratchBytes = ArrayUtil.grow(scratchBytes, maxLen);
+      int len = UnicodeUtil.UTF16toUTF8(string, 0, string.length(), scratchBytes);
+      writeVInt(len);
+      writeBytes(scratchBytes, len);
+    } else {
+      // use a double-pass approach to avoid allocating a large intermediate buffer for string encoding
+      int numBytes = UnicodeUtil.calcUTF16toUTF8Length(string, 0, string.length());
+      writeVInt(numBytes);
+      bytes = ArrayUtil.grow(bytes, length + numBytes);
+      length = UnicodeUtil.UTF16toUTF8(string, 0, string.length(), bytes, length);
+    }
+  }
}
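
A quick usage sketch of the class after this change (hedged: the method and variable names here are made up for illustration; the threshold behavior follows the code above):

    // Hypothetical usage, mirroring how CompressingStoredFieldsWriter buffers field values.
    static byte[] encodeTwoStrings(String small, String huge) throws java.io.IOException {
      GrowableByteArrayDataOutput out = new GrowableByteArrayDataOutput(1 << 12);
      out.writeString(small); // worst case <= 64KB: single pass through the reused scratchBytes
      out.writeString(huge);  // worst case > 64KB: double pass, encoded in place, no scratch growth
      // each writeString emits a vInt byte count followed by the UTF-8 bytes,
      // so out.bytes[0..out.length) now holds both length-prefixed strings
      return java.util.Arrays.copyOf(out.bytes, out.length);
    }

Moving the optimization from CompressingStoredFieldsWriter.writeField into writeString itself means every user of this DataOutput gets both the scratch-buffer fast path and the large-string fallback.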

View File

@@ -179,11 +179,21 @@ public final class UnicodeUtil {
    * for length characters. It is the responsibility of the
    * caller to make sure that the destination array is large enough.
    */
-  // TODO: broken if incoming result.offset != 0
   public static int UTF16toUTF8(final CharSequence s, final int offset, final int length, byte[] out) {
+    return UTF16toUTF8(s, offset, length, out, 0);
+  }
+
+  /** Encode characters from this String, starting at offset
+   *  for length characters. Output to the destination array
+   *  will begin at {@code outOffset}. It is the responsibility of the
+   *  caller to make sure that the destination array is large enough.
+   *  <p>
+   *  note this method returns the final output offset (outOffset + number of bytes written)
+   */
+  public static int UTF16toUTF8(final CharSequence s, final int offset, final int length, byte[] out, int outOffset) {
     final int end = offset + length;
-    int upto = 0;
+    int upto = outOffset;
     for(int i=offset;i<end;i++) {
       final int code = (int) s.charAt(i);
@@ -223,6 +233,43 @@ public final class UnicodeUtil {
     return upto;
   }
 
+  /**
+   * Calculates the number of UTF-8 bytes necessary to write a UTF-16 string.
+   *
+   * @return the number of bytes that would be written
+   */
+  public static int calcUTF16toUTF8Length(final CharSequence s, final int offset, final int len) {
+    final int end = offset + len;
+    int res = 0;
+    for (int i = offset; i < end; i++) {
+      final int code = (int) s.charAt(i);
+      if (code < 0x80) {
+        res++;
+      } else if (code < 0x800) {
+        res += 2;
+      } else if (code < 0xD800 || code > 0xDFFF) {
+        res += 3;
+      } else {
+        // surrogate pair
+        // confirm valid high surrogate
+        if (code < 0xDC00 && (i < end - 1)) {
+          int utf32 = (int) s.charAt(i + 1);
+          // confirm valid low surrogate and write pair
+          if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
+            i++;
+            res += 4;
+            continue;
+          }
+        }
+        // unpaired surrogate: encoded as a 3-byte replacement character, matching UTF16toUTF8
+        res += 3;
+      }
+    }
+    return res;
+  }
// Only called from assert
/*
private static boolean matches(char[] source, int offset, int length, byte[] result, int upto) {
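
A worked example of the byte accounting in calcUTF16toUTF8Length and of the new outOffset overload (a test-style sketch, not part of the commit; the string literal is arbitrary):

    String s = "a\u00E9\u20AC\uD834\uDD1E"; // 'a' (1 byte), U+00E9 (2), U+20AC (3), U+1D11E as a surrogate pair (4)
    assert UnicodeUtil.calcUTF16toUTF8Length(s, 0, s.length()) == 10; // 1 + 2 + 3 + 4

    // the five-argument overload writes after existing content and returns the end offset
    byte[] buf = new byte[3 + 10];
    int end = UnicodeUtil.UTF16toUTF8(s, 0, s.length(), buf, 3); // encoding starts at offset 3
    assert end == 3 + 10; // outOffset + number of bytes written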

View File

@@ -0,0 +1,80 @@
+package org.apache.lucene.codecs.compressing;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
+import org.apache.lucene.util.UnicodeUtil;
+import org.junit.Test;
+
+/**
+ * Test for {@link GrowableByteArrayDataOutput}
+ */
+public class TestGrowableByteArrayDataOutput extends LuceneTestCase {
+
+  @Test
+  public void testWriteSmallStrings() throws Exception {
+    int minSizeForDoublePass = GrowableByteArrayDataOutput.MIN_UTF8_SIZE_TO_ENABLE_DOUBLE_PASS_ENCODING;
+
+    // a simple string encoding test
+    int num = atLeast(1000);
+    for (int i = 0; i < num; i++) {
+      // create a small string so that the single-pass approach is used
+      int length = TestUtil.nextInt(random(), 1, minSizeForDoublePass - 1);
+      String unicode = TestUtil.randomFixedByteLengthUnicodeString(random(), length);
+      byte[] utf8 = new byte[unicode.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR];
+      int len = UnicodeUtil.UTF16toUTF8(unicode, 0, unicode.length(), utf8);
+
+      GrowableByteArrayDataOutput dataOutput = new GrowableByteArrayDataOutput(1 << 8);
+      // explicitly write the utf8 length so that we know how many bytes it occupies
+      dataOutput.writeVInt(len);
+      int vintLen = dataOutput.length;
+      // now write the string, which internally writes the number of bytes as a vInt and then the utf8 bytes
+      dataOutput.writeString(unicode);
+      assertEquals("GrowableByteArrayDataOutput wrote the wrong length after encode", len + vintLen * 2, dataOutput.length);
+      for (int j = 0, k = vintLen * 2; j < len; j++, k++) {
+        assertEquals(utf8[j], dataOutput.bytes[k]);
+      }
+    }
+  }
+
+  @Test
+  public void testWriteLargeStrings() throws Exception {
+    int minSizeForDoublePass = GrowableByteArrayDataOutput.MIN_UTF8_SIZE_TO_ENABLE_DOUBLE_PASS_ENCODING;
+
+    int num = atLeast(1000);
+    for (int i = 0; i < num; i++) {
+      // create a string large enough that the double-pass approach is used
+      String unicode = TestUtil.randomRealisticUnicodeString(random(), minSizeForDoublePass, 10 * minSizeForDoublePass);
+      byte[] utf8 = new byte[unicode.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR];
+      int len = UnicodeUtil.UTF16toUTF8(unicode, 0, unicode.length(), utf8);
+
+      GrowableByteArrayDataOutput dataOutput = new GrowableByteArrayDataOutput(1 << 8);
+      // explicitly write the utf8 length so that we know how many bytes it occupies
+      dataOutput.writeVInt(len);
+      int vintLen = dataOutput.length;
+      // now write the string, which internally writes the number of bytes as a vInt and then the utf8 bytes
+      dataOutput.writeString(unicode);
+      assertEquals("GrowableByteArrayDataOutput wrote the wrong length after encode", len + vintLen * 2, dataOutput.length);
+      for (int j = 0, k = vintLen * 2; j < len; j++, k++) {
+        assertEquals(utf8[j], dataOutput.bytes[k]);
+      }
+    }
+  }
+}

View File

@@ -214,4 +214,14 @@ public class TestUnicodeUtil extends LuceneTestCase {
       assertEquals(cRef.toString(), unicode);
     }
   }
 
+  public void testCalcUTF16toUTF8Length() {
+    int num = atLeast(5000);
+    for (int i = 0; i < num; i++) {
+      String unicode = TestUtil.randomUnicodeString(random());
+      byte[] utf8 = new byte[unicode.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR];
+      int len = UnicodeUtil.UTF16toUTF8(unicode, 0, unicode.length(), utf8);
+      assertEquals(len, UnicodeUtil.calcUTF16toUTF8Length(unicode, 0, unicode.length()));
+    }
+  }
 }