LUCENE-6779: Reduce memory allocated by CompressingStoredFieldsWriter to write strings larger than 64kb by an amount equal to the string's UTF-8 size

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1703219 13f79535-47bb-0310-9956-ffa450edef68
Shalin Shekhar Mangar 2015-09-15 15:00:31 +00:00
parent fdefc995f9
commit e91c414ba0
7 changed files with 173 additions and 15 deletions

View File

@@ -114,6 +114,10 @@ Optimizations
   GeoPointTermsEnum to reduce GC pressure (Nick Knize via Mike
   McCandless)
 
+* LUCENE-6779: Reduce memory allocated by CompressingStoredFieldsWriter to write
+  strings larger than 64kb by an amount equal to the string's UTF-8 size.
+  (Dawid Weiss, Robert Muir, shalin)
+
Bug Fixes
* LUCENE-6730: Hyper-parameter c is ignored in term frequency NormalizationH1.
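
For context on the LUCENE-6779 entry above: UTF-8 needs at most UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR (3) bytes per UTF-16 code unit, so the old single-pass path grew a scratch buffer to string.length() * 3 before encoding, roughly three times the final size for ASCII-heavy text. Counting the exact UTF-8 length first makes that worst-case scratch buffer unnecessary. A minimal JDK-only sketch of such a counting pass (illustrative only; the committed code uses the new UnicodeUtil.calcUTF16toUTF8Length shown further down):

    // Counts UTF-8 bytes without allocating anything; this is what the
    // first pass of a double-pass encoder buys you.
    static int utf8Length(String s) {
      int bytes = 0;
      for (int i = 0; i < s.length(); ) {
        int cp = s.codePointAt(i);         // decodes surrogate pairs
        if (cp < 0x80) bytes += 1;         // ASCII
        else if (cp < 0x800) bytes += 2;   // up to U+07FF
        else if (cp < 0x10000) bytes += 3; // rest of the BMP (and unpaired surrogates)
        else bytes += 4;                   // supplementary planes
        i += Character.charCount(cp);      // advances 1 or 2 UTF-16 code units
      }
      return bytes;
    }

With the exact count known up front, the second pass can encode straight into the destination buffer, which is what the new GrowableByteArrayDataOutput.writeString below does once a string's worst-case size exceeds 64KB.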

View File

@@ -40,9 +40,7 @@ import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BitUtil;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.GrowableByteArrayDataOutput;
 import org.apache.lucene.util.IOUtils;
-import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.util.packed.PackedInts;
 
 /**
@@ -245,8 +243,6 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
     numChunks++;
   }
 
-  byte scratchBytes[] = new byte[16];
-
   @Override
   public void writeField(FieldInfo info, StorableField field)
       throws IOException {
@@ -293,11 +289,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
       bufferedDocs.writeVInt(bytes.length);
       bufferedDocs.writeBytes(bytes.bytes, bytes.offset, bytes.length);
     } else if (string != null) {
-      // this is just an optimized writeString() that re-uses scratchBytes.
-      scratchBytes = ArrayUtil.grow(scratchBytes, string.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR);
-      int length = UnicodeUtil.UTF16toUTF8(string, 0, string.length(), scratchBytes);
-      bufferedDocs.writeVInt(length);
-      bufferedDocs.writeBytes(scratchBytes, length);
+      bufferedDocs.writeString(string);
     } else {
       if (number instanceof Byte || number instanceof Short || number instanceof Integer) {
         bufferedDocs.writeZInt(number.intValue());

View File

@@ -35,8 +35,6 @@ import org.apache.lucene.index.Fields;
 import org.apache.lucene.index.IndexFileNames;
 import org.apache.lucene.index.MergeState;
 import org.apache.lucene.index.SegmentInfo;
-import org.apache.lucene.store.BufferedChecksumIndexInput;
-import org.apache.lucene.store.ChecksumIndexInput;
 import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
@@ -45,7 +43,6 @@ import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.GrowableByteArrayDataOutput;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.StringHelper;
 import org.apache.lucene.util.packed.BlockPackedWriter;

View File

@@ -1,4 +1,4 @@
-package org.apache.lucene.util;
+package org.apache.lucene.codecs.compressing;
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -17,7 +17,11 @@ package org.apache.lucene.util;
  * limitations under the License.
  */
 
+import java.io.IOException;
+
 import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.UnicodeUtil;
/**
* A {@link DataOutput} that can be used to build a byte[].
@@ -25,11 +29,17 @@ import org.apache.lucene.store.DataOutput;
  */
 public final class GrowableByteArrayDataOutput extends DataOutput {
 
+  /** Minimum UTF-8 byte size of a string above which a double pass over the string is used, to save memory during encoding */
+  static final int MIN_UTF8_SIZE_TO_ENABLE_DOUBLE_PASS_ENCODING = 65536;
+
   /** The bytes */
   public byte[] bytes;
 
   /** The length */
   public int length;
 
+  // scratch for utf8 encoding of small strings
+  byte[] scratchBytes = new byte[16];
+
   /** Create a {@link GrowableByteArrayDataOutput} with the given initial capacity. */
   public GrowableByteArrayDataOutput(int cp) {
     this.bytes = new byte[ArrayUtil.oversize(cp, 1)];
@@ -52,4 +62,22 @@ public final class GrowableByteArrayDataOutput extends DataOutput {
     length = newLength;
   }
 
+  @Override
+  public void writeString(String string) throws IOException {
+    int maxLen = string.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR;
+    if (maxLen <= MIN_UTF8_SIZE_TO_ENABLE_DOUBLE_PASS_ENCODING) {
+      // string is small enough that we don't need to save memory by falling back to the double-pass approach;
+      // this is just an optimized writeString() that re-uses scratchBytes.
+      scratchBytes = ArrayUtil.grow(scratchBytes, maxLen);
+      int len = UnicodeUtil.UTF16toUTF8(string, 0, string.length(), scratchBytes);
+      writeVInt(len);
+      writeBytes(scratchBytes, len);
+    } else {
+      // use a double-pass approach to avoid allocating a large intermediate buffer for string encoding
+      int numBytes = UnicodeUtil.calcUTF16toUTF8Length(string, 0, string.length());
+      writeVInt(numBytes);
+      bytes = ArrayUtil.grow(bytes, length + numBytes);
+      length = UnicodeUtil.UTF16toUTF8(string, 0, string.length(), bytes, length);
+    }
+  }
}
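
A quick usage sketch of the class after this change (hedged: the method and variable names here are made up for illustration; the threshold behavior follows the code above):

    // Hypothetical usage, mirroring how CompressingStoredFieldsWriter buffers field values.
    static byte[] encodeTwoStrings(String small, String huge) throws java.io.IOException {
      GrowableByteArrayDataOutput out = new GrowableByteArrayDataOutput(1 << 12);
      out.writeString(small); // worst case <= 64KB: single pass through the reused scratchBytes
      out.writeString(huge);  // worst case > 64KB: double pass, encoded in place, no scratch growth
      // each writeString emits a vInt byte count followed by the UTF-8 bytes,
      // so out.bytes[0..out.length) now holds both length-prefixed strings
      return java.util.Arrays.copyOf(out.bytes, out.length);
    }

Moving the optimization from CompressingStoredFieldsWriter.writeField into writeString itself means every user of this DataOutput gets both the scratch-buffer fast path and the large-string fallback.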

View File

@@ -179,11 +179,21 @@ public final class UnicodeUtil {
    * for length characters. It is the responsibility of the
    * caller to make sure that the destination array is large enough.
    */
-  // TODO: broken if incoming result.offset != 0
   public static int UTF16toUTF8(final CharSequence s, final int offset, final int length, byte[] out) {
+    return UTF16toUTF8(s, offset, length, out, 0);
+  }
+
+  /** Encode characters from this String, starting at offset
+   *  for length characters. Output to the destination array
+   *  will begin at {@code outOffset}. It is the responsibility of the
+   *  caller to make sure that the destination array is large enough.
+   *  <p>
+   *  note this method returns the final output offset (outOffset + number of bytes written)
+   */
+  public static int UTF16toUTF8(final CharSequence s, final int offset, final int length, byte[] out, int outOffset) {
     final int end = offset + length;
-    int upto = 0;
+    int upto = outOffset;
     for(int i=offset;i<end;i++) {
       final int code = (int) s.charAt(i);
@@ -223,6 +233,43 @@ public final class UnicodeUtil {
     return upto;
   }
 
+  /**
+   * Calculates the number of UTF-8 bytes necessary to write a UTF-16 string.
+   *
+   * @return the number of bytes that would be written
+   */
+  public static int calcUTF16toUTF8Length(final CharSequence s, final int offset, final int len) {
+    final int end = offset + len;
+    int res = 0;
+    for (int i = offset; i < end; i++) {
+      final int code = (int) s.charAt(i);
+      if (code < 0x80) {
+        res++;
+      } else if (code < 0x800) {
+        res += 2;
+      } else if (code < 0xD800 || code > 0xDFFF) {
+        res += 3;
+      } else {
+        // surrogate pair
+        // confirm valid high surrogate
+        if (code < 0xDC00 && (i < end - 1)) {
+          int utf32 = (int) s.charAt(i + 1);
+          // confirm valid low surrogate and write pair
+          if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
+            i++;
+            res += 4;
+            continue;
+          }
+        }
+        // unpaired surrogate: encoded as a 3-byte replacement character, matching UTF16toUTF8
+        res += 3;
+      }
+    }
+    return res;
+  }
// Only called from assert
/*
private static boolean matches(char[] source, int offset, int length, byte[] result, int upto) {
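
A worked example of the byte accounting in calcUTF16toUTF8Length and of the new outOffset overload (a test-style sketch, not part of the commit; the string literal is arbitrary):

    String s = "a\u00E9\u20AC\uD834\uDD1E"; // 'a' (1 byte), U+00E9 (2), U+20AC (3), U+1D11E as a surrogate pair (4)
    assert UnicodeUtil.calcUTF16toUTF8Length(s, 0, s.length()) == 10; // 1 + 2 + 3 + 4

    // the five-argument overload writes after existing content and returns the end offset
    byte[] buf = new byte[3 + 10];
    int end = UnicodeUtil.UTF16toUTF8(s, 0, s.length(), buf, 3); // encoding starts at offset 3
    assert end == 3 + 10; // outOffset + number of bytes written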

View File

@@ -0,0 +1,80 @@
+package org.apache.lucene.codecs.compressing;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
+import org.apache.lucene.util.UnicodeUtil;
+import org.junit.Test;
+
+/**
+ * Test for {@link GrowableByteArrayDataOutput}
+ */
+public class TestGrowableByteArrayDataOutput extends LuceneTestCase {
+
+  @Test
+  public void testWriteSmallStrings() throws Exception {
+    int minSizeForDoublePass = GrowableByteArrayDataOutput.MIN_UTF8_SIZE_TO_ENABLE_DOUBLE_PASS_ENCODING;
+
+    // a simple string encoding test
+    int num = atLeast(1000);
+    for (int i = 0; i < num; i++) {
+      // create a small string so that the single-pass approach is used
+      int length = TestUtil.nextInt(random(), 1, minSizeForDoublePass - 1);
+      String unicode = TestUtil.randomFixedByteLengthUnicodeString(random(), length);
+      byte[] utf8 = new byte[unicode.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR];
+      int len = UnicodeUtil.UTF16toUTF8(unicode, 0, unicode.length(), utf8);
+
+      GrowableByteArrayDataOutput dataOutput = new GrowableByteArrayDataOutput(1 << 8);
+      // explicitly write the utf8 length so that we know how many bytes it occupies
+      dataOutput.writeVInt(len);
+      int vintLen = dataOutput.length;
+      // now write the string, which internally writes the number of bytes as a vInt and then the utf8 bytes
+      dataOutput.writeString(unicode);
+      assertEquals("GrowableByteArrayDataOutput wrote the wrong length after encode", len + vintLen * 2, dataOutput.length);
+      for (int j = 0, k = vintLen * 2; j < len; j++, k++) {
+        assertEquals(utf8[j], dataOutput.bytes[k]);
+      }
+    }
+  }
+
+  @Test
+  public void testWriteLargeStrings() throws Exception {
+    int minSizeForDoublePass = GrowableByteArrayDataOutput.MIN_UTF8_SIZE_TO_ENABLE_DOUBLE_PASS_ENCODING;
+
+    int num = atLeast(1000);
+    for (int i = 0; i < num; i++) {
+      // create a string large enough that the double-pass approach is used
+      String unicode = TestUtil.randomRealisticUnicodeString(random(), minSizeForDoublePass, 10 * minSizeForDoublePass);
+      byte[] utf8 = new byte[unicode.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR];
+      int len = UnicodeUtil.UTF16toUTF8(unicode, 0, unicode.length(), utf8);
+
+      GrowableByteArrayDataOutput dataOutput = new GrowableByteArrayDataOutput(1 << 8);
+      // explicitly write the utf8 length so that we know how many bytes it occupies
+      dataOutput.writeVInt(len);
+      int vintLen = dataOutput.length;
+      // now write the string, which internally writes the number of bytes as a vInt and then the utf8 bytes
+      dataOutput.writeString(unicode);
+      assertEquals("GrowableByteArrayDataOutput wrote the wrong length after encode", len + vintLen * 2, dataOutput.length);
+      for (int j = 0, k = vintLen * 2; j < len; j++, k++) {
+        assertEquals(utf8[j], dataOutput.bytes[k]);
+      }
+    }
+  }
+}

View File

@@ -214,4 +214,14 @@ public class TestUnicodeUtil extends LuceneTestCase {
       assertEquals(cRef.toString(), unicode);
     }
   }
 
+  public void testCalcUTF16toUTF8Length() {
+    int num = atLeast(5000);
+    for (int i = 0; i < num; i++) {
+      String unicode = TestUtil.randomUnicodeString(random());
+      byte[] utf8 = new byte[unicode.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR];
+      int len = UnicodeUtil.UTF16toUTF8(unicode, 0, unicode.length(), utf8);
+      assertEquals(len, UnicodeUtil.calcUTF16toUTF8Length(unicode, 0, unicode.length()));
+    }
+  }
 }