mirror of https://github.com/apache/lucene.git
LUCENE-6779: Reduce memory allocated by CompressingStoredFieldsWriter to write strings larger than 64kb by an amount equal to string's utf8 size
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1703219 13f79535-47bb-0310-9956-ffa450edef68
parent fdefc995f9
commit e91c414ba0
@@ -114,6 +114,10 @@ Optimizations
  GeoPointTermsEnum to reduce GC pressure (Nick Knize via Mike
  McCandless)

* LUCENE-6779: Reduce memory allocated by CompressingStoredFieldsWriter to write
  strings larger than 64kb by an amount equal to string's utf8 size.
  (Dawid Weiss, Robert Muir, shalin)

Bug Fixes

* LUCENE-6730: Hyper-parameter c is ignored in term frequency NormalizationH1.
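For a sense of what the entry above saves: the old write path sized a temporary scratch buffer at string.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR (a worst case of 3 bytes per UTF-16 char), while the new path computes the exact UTF-8 length first and encodes straight into the output buffer. A rough illustrative sketch of that arithmetic, not code from this commit:

import org.apache.lucene.util.UnicodeUtil;

// Illustrative only: compares the worst-case scratch allocation of the old
// single-pass path with the exact UTF-8 size the new double-pass path needs.
public class ScratchSizeDemo {
  public static void main(String[] args) {
    String field = new String(new char[1 << 20]).replace('\0', 'a'); // ~1M ASCII chars
    int worstCase = field.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR;    // old scratch size (~3 MB)
    int exact = UnicodeUtil.calcUTF16toUTF8Length(field, 0, field.length()); // new exact size (~1 MB)
    System.out.println("worst case: " + worstCase + " bytes, exact: " + exact + " bytes");
  }
}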
@@ -40,9 +40,7 @@ import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BitUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.GrowableByteArrayDataOutput;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.packed.PackedInts;

/**
@@ -245,8 +243,6 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
      numChunks++;
    }

  byte scratchBytes[] = new byte[16];

  @Override
  public void writeField(FieldInfo info, StorableField field)
      throws IOException {
@@ -293,11 +289,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
      bufferedDocs.writeVInt(bytes.length);
      bufferedDocs.writeBytes(bytes.bytes, bytes.offset, bytes.length);
    } else if (string != null) {
      // this is just an optimized writeString() that re-uses scratchBytes.
      scratchBytes = ArrayUtil.grow(scratchBytes, string.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR);
      int length = UnicodeUtil.UTF16toUTF8(string, 0, string.length(), scratchBytes);
      bufferedDocs.writeVInt(length);
      bufferedDocs.writeBytes(scratchBytes, length);
      bufferedDocs.writeString(string);
    } else {
      if (number instanceof Byte || number instanceof Short || number instanceof Integer) {
        bufferedDocs.writeZInt(number.intValue());
@@ -35,8 +35,6 @@ import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.BufferedChecksumIndexInput;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
@@ -45,7 +43,6 @@ import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.GrowableByteArrayDataOutput;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.packed.BlockPackedWriter;
@@ -1,4 +1,4 @@
package org.apache.lucene.util;
package org.apache.lucene.codecs.compressing;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -17,7 +17,11 @@ package org.apache.lucene.util;
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.UnicodeUtil;

/**
 * A {@link DataOutput} that can be used to build a byte[].
@@ -25,11 +29,17 @@ import org.apache.lucene.store.DataOutput;
 */
public final class GrowableByteArrayDataOutput extends DataOutput {

  /** Minimum utf8 byte size of a string over which double pass over string is to save memory during encode */
  static final int MIN_UTF8_SIZE_TO_ENABLE_DOUBLE_PASS_ENCODING = 65536;

  /** The bytes */
  public byte[] bytes;
  /** The length */
  public int length;

  // scratch for utf8 encoding of small strings
  byte[] scratchBytes = new byte[16];

  /** Create a {@link GrowableByteArrayDataOutput} with the given initial capacity. */
  public GrowableByteArrayDataOutput(int cp) {
    this.bytes = new byte[ArrayUtil.oversize(cp, 1)];
@@ -52,4 +62,22 @@ public final class GrowableByteArrayDataOutput extends DataOutput {
    length = newLength;
  }

  @Override
  public void writeString(String string) throws IOException {
    int maxLen = string.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR;
    if (maxLen <= MIN_UTF8_SIZE_TO_ENABLE_DOUBLE_PASS_ENCODING) {
      // string is small enough that we don't need to save memory by falling back to double-pass approach
      // this is just an optimized writeString() that re-uses scratchBytes.
      scratchBytes = ArrayUtil.grow(scratchBytes, maxLen);
      int len = UnicodeUtil.UTF16toUTF8(string, 0, string.length(), scratchBytes);
      writeVInt(len);
      writeBytes(scratchBytes, len);
    } else {
      // use a double pass approach to avoid allocating a large intermediate buffer for string encoding
      int numBytes = UnicodeUtil.calcUTF16toUTF8Length(string, 0, string.length());
      writeVInt(numBytes);
      bytes = ArrayUtil.grow(bytes, length + numBytes);
      length = UnicodeUtil.UTF16toUTF8(string, 0, string.length(), bytes, length);
    }
  }
}
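A minimal usage sketch of the class after this change (not part of the commit); it relies only on the members visible in the hunks above: the constructor, writeString, and the public bytes/length fields. Strings whose worst-case UTF-8 size is at most 64k take the single-pass scratch path; anything larger is counted first and then encoded directly into bytes.

import java.io.IOException;
import org.apache.lucene.codecs.compressing.GrowableByteArrayDataOutput;

public class WriteStringDemo {
  public static void main(String[] args) throws IOException {
    GrowableByteArrayDataOutput out = new GrowableByteArrayDataOutput(1 << 8);
    // writeString emits the UTF-8 byte count as a vInt followed by the UTF-8 bytes.
    out.writeString("hello, stored fields");
    System.out.println("bytes buffered: " + out.length);
  }
}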
@@ -179,11 +179,21 @@ public final class UnicodeUtil {
   * for length characters. It is the responsibility of the
   * caller to make sure that the destination array is large enough.
   */
  // TODO: broken if incoming result.offset != 0
  public static int UTF16toUTF8(final CharSequence s, final int offset, final int length, byte[] out) {
    return UTF16toUTF8(s, offset, length, out, 0);
  }

  /** Encode characters from this String, starting at offset
   * for length characters. Output to the destination array
   * will begin at {@code outOffset}. It is the responsibility of the
   * caller to make sure that the destination array is large enough.
   * <p>
   * note this method returns the final output offset (outOffset + number of bytes written)
   */
  public static int UTF16toUTF8(final CharSequence s, final int offset, final int length, byte[] out, int outOffset) {
    final int end = offset + length;

    int upto = 0;
    int upto = outOffset;
    for(int i=offset;i<end;i++) {
      final int code = (int) s.charAt(i);
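A small sketch (not from the patch) of how the new outOffset overload above can be used to append several strings into one buffer; as the javadoc notes, the return value is the final output offset, so it can be fed back in as the next starting point:

import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.UnicodeUtil;

public class AppendEncodeDemo {
  public static void main(String[] args) {
    byte[] buf = new byte[16];
    int upto = 0;
    for (String s : new String[] {"foo", "bar", "baz"}) {
      // grow to the worst case, then encode in place starting at the current offset
      buf = ArrayUtil.grow(buf, upto + s.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR);
      upto = UnicodeUtil.UTF16toUTF8(s, 0, s.length(), buf, upto); // returns the new end offset
    }
    System.out.println("total utf8 bytes: " + upto);
  }
}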
@@ -223,6 +233,43 @@ public final class UnicodeUtil {
    return upto;
  }

  /**
   * Calculates the number of UTF8 bytes necessary to write a UTF16 string.
   *
   * @return the number of bytes written
   */
  public static int calcUTF16toUTF8Length(final CharSequence s, final int offset, final int len) {
    final int end = offset + len;

    int res = 0;
    for (int i = offset; i < end; i++) {
      final int code = (int) s.charAt(i);

      if (code < 0x80)
        res++;
      else if (code < 0x800) {
        res += 2;
      } else if (code < 0xD800 || code > 0xDFFF) {
        res += 3;
      } else {
        // surrogate pair
        // confirm valid high surrogate
        if (code < 0xDC00 && (i < end - 1)) {
          int utf32 = (int) s.charAt(i + 1);
          // confirm valid low surrogate and write pair
          if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
            i++;
            res += 4;
            continue;
          }
        }
        res += 3;
      }
    }

    return res;
  }

  // Only called from assert
  /*
  private static boolean matches(char[] source, int offset, int length, byte[] result, int upto) {
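For intuition, a quick sketch (not part of the patch) of the counts calcUTF16toUTF8Length produces for the four cases it distinguishes: ASCII, two-byte, three-byte, and surrogate-pair characters.

import org.apache.lucene.util.UnicodeUtil;

public class Utf8LengthDemo {
  public static void main(String[] args) {
    System.out.println(UnicodeUtil.calcUTF16toUTF8Length("A", 0, 1));            // 1 byte  (< 0x80)
    System.out.println(UnicodeUtil.calcUTF16toUTF8Length("\u00e9", 0, 1));       // 2 bytes (< 0x800)
    System.out.println(UnicodeUtil.calcUTF16toUTF8Length("\u20ac", 0, 1));       // 3 bytes (BMP, >= 0x800)
    System.out.println(UnicodeUtil.calcUTF16toUTF8Length("\ud83d\ude00", 0, 2)); // 4 bytes (surrogate pair)
  }
}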
@@ -0,0 +1,80 @@
package org.apache.lucene.codecs.compressing;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.UnicodeUtil;
import org.junit.Test;

/**
 * Test for {@link GrowableByteArrayDataOutput}
 */
public class TestGrowableByteArrayDataOutput extends LuceneTestCase {

  @Test
  public void testWriteSmallStrings() throws Exception {
    int minSizeForDoublePass = GrowableByteArrayDataOutput.MIN_UTF8_SIZE_TO_ENABLE_DOUBLE_PASS_ENCODING;

    // a simple string encoding test
    int num = atLeast(1000);
    for (int i = 0; i < num; i++) {
      // create a small string such that the single pass approach is used
      int length = TestUtil.nextInt(random(), 1, minSizeForDoublePass - 1);
      String unicode = TestUtil.randomFixedByteLengthUnicodeString(random(), length);
      byte[] utf8 = new byte[unicode.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR];
      int len = UnicodeUtil.UTF16toUTF8(unicode, 0, unicode.length(), utf8);

      GrowableByteArrayDataOutput dataOutput = new GrowableByteArrayDataOutput(1 << 8);
      // explicitly write utf8 len so that we know how many bytes it occupies
      dataOutput.writeVInt(len);
      int vintLen = dataOutput.length;
      // now write the string which will internally write number of bytes as a vint and then utf8 bytes
      dataOutput.writeString(unicode);

      assertEquals("GrowableByteArrayDataOutput wrote the wrong length after encode", len + vintLen * 2, dataOutput.length);
      for (int j = 0, k = vintLen * 2; j < len; j++, k++) {
        assertEquals(utf8[j], dataOutput.bytes[k]);
      }
    }
  }

  @Test
  public void testWriteLargeStrings() throws Exception {
    int minSizeForDoublePass = GrowableByteArrayDataOutput.MIN_UTF8_SIZE_TO_ENABLE_DOUBLE_PASS_ENCODING;

    int num = atLeast(1000);
    for (int i = 0; i < num; i++) {
      String unicode = TestUtil.randomRealisticUnicodeString(random(), minSizeForDoublePass, 10 * minSizeForDoublePass);
      byte[] utf8 = new byte[unicode.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR];
      int len = UnicodeUtil.UTF16toUTF8(unicode, 0, unicode.length(), utf8);

      GrowableByteArrayDataOutput dataOutput = new GrowableByteArrayDataOutput(1 << 8);
      // explicitly write utf8 len so that we know how many bytes it occupies
      dataOutput.writeVInt(len);
      int vintLen = dataOutput.length;
      // now write the string which will internally write number of bytes as a vint and then utf8 bytes
      dataOutput.writeString(unicode);

      assertEquals("GrowableByteArrayDataOutput wrote the wrong length after encode", len + vintLen * 2, dataOutput.length);
      for (int j = 0, k = vintLen * 2; j < len; j++, k++) {
        assertEquals(utf8[j], dataOutput.bytes[k]);
      }
    }
  }
}
@@ -214,4 +214,14 @@ public class TestUnicodeUtil extends LuceneTestCase {
      assertEquals(cRef.toString(), unicode);
    }
  }

  public void testCalcUTF16toUTF8Length() {
    int num = atLeast(5000);
    for (int i = 0; i < num; i++) {
      String unicode = TestUtil.randomUnicodeString(random());
      byte[] utf8 = new byte[unicode.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR];
      int len = UnicodeUtil.UTF16toUTF8(unicode, 0, unicode.length(), utf8);
      assertEquals(len, UnicodeUtil.calcUTF16toUTF8Length(unicode, 0, unicode.length()));
    }
  }
}