LUCENE-4554: CompressingStoredFieldsFormat: don't write the original length at the beginning of the chunk with LZ4 compressors

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1408476 13f79535-47bb-0310-9956-ffa450edef68
2012-11-12 22:17:13 +00:00 · 2012-11-12 22:17:13 +00:00 · f1fda6880b
parent 6794693f30
commit f1fda6880b
5 changed files with 150 additions and 138 deletions
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java
@ -197,11 +197,12 @@ final class CompressingStoredFieldsReader extends StoredFieldsReader {
          + ", numDocs=" + numDocs);
    }

-    final int numStoredFields, offset, length;
+    final int numStoredFields, offset, length, totalLength;
    if (chunkDocs == 1) {
      numStoredFields = fieldsStream.readVInt();
      offset = 0;
      length = fieldsStream.readVInt();
+      totalLength = length;
    } else {
      final int bitsPerStoredFields = fieldsStream.readVInt();
      if (bitsPerStoredFields == 0) {
@ -219,10 +220,10 @@ final class CompressingStoredFieldsReader extends StoredFieldsReader {
      if (bitsPerLength == 0) {
        length = fieldsStream.readVInt();
        offset = (docID - docBase) * length;
+        totalLength = chunkDocs * length;
      } else if (bitsPerStoredFields > 31) {
        throw new CorruptIndexException("bitsPerLength=" + bitsPerLength);
      } else {
-        final long filePointer = fieldsStream.getFilePointer();
        final PackedInts.ReaderIterator it = PackedInts.getReaderIteratorNoHeader(fieldsStream, PackedInts.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerLength, 1);
        int off = 0;
        for (int i = 0; i < docID - docBase; ++i) {
@ -230,7 +231,11 @@ final class CompressingStoredFieldsReader extends StoredFieldsReader {
        }
        offset = off;
        length = (int) it.next();
-        fieldsStream.seek(filePointer + PackedInts.Format.PACKED.byteCount(packedIntsVersion, chunkDocs, bitsPerLength));
+        off += length;
+        for (int i = docID - docBase + 1; i < chunkDocs; ++i) {
+          off += it.next();
+        }
+        totalLength = off;
      }
    }

@ -242,13 +247,14 @@ final class CompressingStoredFieldsReader extends StoredFieldsReader {
      return;
    }

-    decompressor.decompress(fieldsStream, offset, length, bytes);
+    decompressor.decompress(fieldsStream, totalLength, offset, length, bytes);
+    assert bytes.length == length;

    final ByteArrayDataInput documentInput = new ByteArrayDataInput(bytes.bytes, bytes.offset, bytes.length);
    for (int fieldIDX = 0; fieldIDX < numStoredFields; fieldIDX++) {
      final long infoAndBits = documentInput.readVLong();
      final int fieldNumber = (int) (infoAndBits >>> TYPE_BITS);
-      FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber);
+      final FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber);

      final int bits = (int) (infoAndBits & TYPE_MASK);
      assert bits <= NUMERIC_DOUBLE: "bits=" + Integer.toHexString(bits);
@ -365,12 +371,23 @@ final class CompressingStoredFieldsReader extends StoredFieldsReader {
      }
    }

+    // total length of the chunk
+    private int totalLength() {
+      int totalLength = 0;
+      for (int i = 0; i < chunkDocs; ++i) {
+        totalLength += lengths[i];
+      }
+      return totalLength;
+    }
+
    /**
     * Decompress the chunk.
     */
    void decompress() throws IOException {
      // decompress data
-      decompressor.decompress(fieldsStream, bytes);
+      final int totalLength = totalLength();
+      decompressor.decompress(fieldsStream, totalLength, 0, totalLength, bytes);
+      assert bytes.length == totalLength;
      if (bytes.length != chunkSize()) {
        throw new CorruptIndexException("Corrupted: expected chunk size = " + chunkSize() + ", got " + bytes.length);
      }
@ -380,7 +397,8 @@ final class CompressingStoredFieldsReader extends StoredFieldsReader {
     * Copy compressed data.
     */
    void copyCompressedData(DataOutput out) throws IOException {
-      decompressor.copyCompressedData(fieldsStream, out);
+      final int totalLength = totalLength();
+      decompressor.copyCompressedData(fieldsStream, totalLength, out);
    }

  }
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/compressing/CompressionMode.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/compressing/CompressionMode.java
@ -22,6 +22,7 @@ import java.util.zip.DataFormatException;
 import java.util.zip.Deflater;
 import java.util.zip.Inflater;

+import org.apache.lucene.index.CorruptIndexException;
 import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.util.ArrayUtil;
@ -94,6 +95,7 @@ public enum CompressionMode {

  };

+  /** Get a {@link CompressionMode} according to its id. */
  public static CompressionMode byId(int id) {
    for (CompressionMode mode : CompressionMode.values()) {
      if (mode.getId() == id) {
@ -132,85 +134,25 @@ public enum CompressionMode {
  private static final Decompressor LZ4_DECOMPRESSOR = new Decompressor() {

    @Override
-    public void decompress(DataInput in, BytesRef bytes) throws IOException {
-      final int decompressedLen = in.readVInt();
-      if (bytes.bytes.length < decompressedLen + 8) {
-        bytes.bytes = ArrayUtil.grow(bytes.bytes, decompressedLen + 8);
+    public void decompress(DataInput in, int originalLength, int offset, int length, BytesRef bytes) throws IOException {
+      assert offset + length <= originalLength;
+      // add 7 padding bytes, this is not necessary but can help decompression run faster
+      if (bytes.bytes.length < originalLength + 7) {
+        bytes.bytes = new byte[ArrayUtil.oversize(originalLength + 7, 1)];
      }
-      LZ4.decompress(in, decompressedLen, bytes);
-      if (bytes.length != decompressedLen) {
-        throw new IOException("Corrupted");
+      final int decompressedLength = LZ4.decompress(in, offset + length, bytes.bytes, 0);
+      if (decompressedLength > originalLength) {
+        throw new CorruptIndexException("Corrupted: lengths mismatch: " + decompressedLength + " > " + originalLength);
      }
+      bytes.offset = offset;
+      bytes.length = length;
    }

    @Override
-    public void decompress(DataInput in, int offset, int length, BytesRef bytes) throws IOException {
-      final int decompressedLen = in.readVInt();
-      if (offset > decompressedLen) {
-        bytes.length = 0;
-        return;
-      }
-      if (bytes.bytes.length < decompressedLen) {
-        bytes.bytes = ArrayUtil.grow(bytes.bytes, decompressedLen);
-      }
-      LZ4.decompress(in, offset + length, bytes);
-      bytes.offset = offset;
-      if (offset + length >= decompressedLen) {
-        if (bytes.length != decompressedLen) {
-          throw new IOException("Corrupted");
-        }
-        bytes.length = decompressedLen - offset;
-      } else {
-        bytes.length = length;
-      }
-    }
-
-    public void copyCompressedData(DataInput in, DataOutput out) throws IOException {
-      final int decompressedLen = in.readVInt();
-      out.writeVInt(decompressedLen);
-      if (decompressedLen == 0) {
-        out.writeByte((byte) 0); // the token
-        return;
-      }
-      int n = 0;
-      while (n < decompressedLen) {
-        // literals
-        final byte token = in.readByte();
-        out.writeByte(token);
-        int literalLen = (token & 0xFF) >>> 4;
-        if (literalLen == 0x0F) {
-          byte len;
-          while ((len = in.readByte()) == (byte) 0xFF) {
-            literalLen += 0xFF;
-            out.writeByte(len);
-          }
-          literalLen += len & 0xFF;
-          out.writeByte(len);
-        }
-        out.copyBytes(in, literalLen);
-        n += literalLen;
-        if (n >= decompressedLen) {
-          break;
-        }
-
-        // matchs
-        out.copyBytes(in, 2); // match dec
-        int matchLen = token & 0x0F;
-        if (matchLen == 0x0F) {
-          byte len;
-          while ((len = in.readByte()) == (byte) 0xFF) {
-            matchLen += 0xFF;
-            out.writeByte(len);
-          }
-          matchLen += len & 0xFF;
-          out.writeByte(len);
-        }
-        matchLen += LZ4.MIN_MATCH;
-        n += matchLen;
-      }
-
-      if (n != decompressedLen) {
-        throw new IOException("Currupted compressed stream: expected " + decompressedLen + " bytes, but got at least" + n);
+    public void copyCompressedData(DataInput in, int originalLength, DataOutput out) throws IOException {
+      final int copied = LZ4.copyCompressedData(in, originalLength, out);
+      if (copied != originalLength) {
+        throw new CorruptIndexException("Currupted compressed stream: expected " + originalLength + " bytes, but got at least" + copied);
      }
    }

@ -226,7 +168,6 @@ public enum CompressionMode {
    @Override
    public void compress(byte[] bytes, int off, int len, DataOutput out)
        throws IOException {
-      out.writeVInt(len);
      LZ4.compress(bytes, off, len, out);
    }

@ -237,7 +178,6 @@ public enum CompressionMode {
    @Override
    public void compress(byte[] bytes, int off, int len, DataOutput out)
        throws IOException {
-      out.writeVInt(len);
      LZ4.compressHC(bytes, off, len, out);
    }

@ -254,21 +194,22 @@ public enum CompressionMode {
    }

    @Override
-    public void decompress(DataInput in, BytesRef bytes) throws IOException {
-      bytes.offset = bytes.length = 0;
-
+    public void decompress(DataInput in, int originalLength, int offset, int length, BytesRef bytes) throws IOException {
+      assert offset + length <= originalLength;
+      if (length == 0) {
+        bytes.length = 0;
+        return;
+      }
      final int compressedLength = in.readVInt();
      if (compressedLength > compressed.length) {
-        compressed = ArrayUtil.grow(compressed, compressedLength);
+        compressed = new byte[ArrayUtil.oversize(compressedLength, 1)];
      }
      in.readBytes(compressed, 0, compressedLength);

      decompressor.reset();
      decompressor.setInput(compressed, 0, compressedLength);
-      if (decompressor.needsInput()) {
-        return;
-      }

+      bytes.offset = bytes.length = 0;
      while (true) {
        final int count;
        try {
@ -284,10 +225,15 @@ public enum CompressionMode {
          bytes.bytes = ArrayUtil.grow(bytes.bytes);
        }
      }
+      if (bytes.length != originalLength) {
+        throw new CorruptIndexException("Lengths mismatch: " + bytes.length + " != " + originalLength);
+      }
+      bytes.offset = offset;
+      bytes.length = length;
    }

    @Override
-    public void copyCompressedData(DataInput in, DataOutput out) throws IOException {
+    public void copyCompressedData(DataInput in, int originalLength, DataOutput out) throws IOException {
      final int compressedLength = in.readVInt();
      out.writeVInt(compressedLength);
      out.copyBytes(in, compressedLength);
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/compressing/Decompressor.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/compressing/Decompressor.java
@ -29,26 +29,24 @@ import org.apache.lucene.util.BytesRef;
 abstract class Decompressor implements Cloneable {

  /**
-   * Decompress bytes. This method is free to resize <code>bytes</code> in case
-   * it is too small to hold all the decompressed data.
+   * Decompress bytes that were stored between offsets <code>offset</code> and
+   * <code>offset+length</code> in the original stream from the compressed
+   * stream <code>in</code> to <code>bytes</code>. After returning, the length
+   * of <code>bytes</code> (<code>bytes.length</code>) must be equal to
+   * <code>length</code>. Implementations of this method are free to resize
+   * <code>bytes</code> depending on their needs.
+   *
+   * @param in the input that stores the compressed stream
+   * @param originalLength the length of the original data (before compression)
+   * @param offset bytes before this offset do not need to be decompressed
+   * @param length bytes after <code>offset+length</code> do not need to be decompressed
+   * @param bytes a {@link BytesRef} where to store the decompressed data
   */
-  public abstract void decompress(DataInput in, BytesRef bytes) throws IOException;
+  public abstract void decompress(DataInput in, int originalLength, int offset, int length, BytesRef bytes) throws IOException;

-  /**
-   * Method to use if you are only interested into <code>length</code>
-   * decompressed bytes starting at offset <code>offset</code>. Some compression
-   * codecs might have optimizations for this special case.
-   */
-  public void decompress(DataInput in, int offset, int length, BytesRef bytes) throws IOException {
-    decompress(in, bytes);
-    if (bytes.length < offset + length) {
-      throw new IndexOutOfBoundsException((offset + length) + " > " + bytes.length);
-    }
-    bytes.offset += offset;
-    bytes.length = length;
-  }
-
-  public abstract void copyCompressedData(DataInput in, DataOutput out) throws IOException;
+  /** Copy a compressed stream whose original length is
+   * <code>originalLength</code> from <code>in</code> to <code>out</code>. */
+  public abstract void copyCompressedData(DataInput in, int originalLength, DataOutput out) throws IOException;

  @Override
  public abstract Decompressor clone();
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/compressing/LZ4.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/compressing/LZ4.java
@ -22,7 +22,6 @@ import java.util.Arrays;

 import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.DataOutput;
-import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.packed.PackedInts;

 /**
@ -78,17 +77,15 @@ class LZ4 {
  }

  /**
-   * Decompress at least <code>decompressedLen</code> bytes into <code>destBytes</code>.
-   * Please note that <code>destBytes</code> must be large enough to be able to hold
-   * <b>all</b> decompressed data plus 8 bytes (meaning that you need to know the total
-   * decompressed length).
+   * Decompress at least <code>decompressedLen</code> bytes into
+   * <code>dest[dOff:]</code>. Please note that <code>dest</code> must be large
+   * enough to be able to hold <b>all</b> decompressed data (meaning that you
+   * need to know the total decompressed length).
   */
-  public static void decompress(DataInput compressed, int decompressedLen, BytesRef destBytes) throws IOException {
-    final byte[] dest = destBytes.bytes;
+  public static int decompress(DataInput compressed, int decompressedLen, byte[] dest, int dOff) throws IOException {
    final int destEnd = dest.length;
-    int dOff = 0;

-    while (dOff < decompressedLen) {
+    do {
      // literals
      final int token = compressed.readByte() & 0xFF;
      int literalLen = token >>> 4;
@ -135,9 +132,9 @@ class LZ4 {
        System.arraycopy(dest, dOff - matchDec, dest, dOff, fastLen);
        dOff += matchLen;
      }
-    }
-    destBytes.offset = 0;
-    destBytes.length = dOff;
+    } while (dOff < decompressedLen);
+
+    return dOff;
  }

  private static void encodeLen(int l, DataOutput out) throws IOException {
@ -186,8 +183,7 @@ class LZ4 {

  /**
   * Compress <code>bytes[off:off+len]</code> into <code>out</code> using
-   * 2<sup>hashLog</sup> bytes of memory. Higher values of <code>hashLog</code>
-   * improve the compression ratio.
+   * at most 16KB of memory.
   */
  public static void compress(byte[] bytes, int off, int len, DataOutput out) throws IOException {

@ -357,6 +353,13 @@ class LZ4 {

  }

+  /**
+   * Compress <code>bytes[off:off+len]</code> into <code>out</code>. Compared to
+   * {@link LZ4#compress(byte[], int, int, DataOutput)}, this method is slower,
+   * uses more memory (~ 256KB), but should provide better compression ratios
+   * (especially on large inputs) because it chooses the best match among up to
+   * 256 candidates and then performs trade-offs to fix overlapping matches.
+   */
  public static void compressHC(byte[] src, int srcOff, int srcLen, DataOutput out) throws IOException {

    final int srcEnd = srcOff + srcLen;
@ -503,4 +506,51 @@ class LZ4 {
    encodeLastLiterals(src, anchor, srcEnd - anchor, out);
  }

+  /** Copy bytes from <code>in</code> to <code>out</code> where
+   *  <code>in</code> is a LZ4-encoded stream. This method copies enough bytes
+   *  so that <code>out</code> can be used later on to restore the first
+   *  <code>length</code> bytes of the stream. This method always reads at
+   *  least one byte from <code>in</code> so make sure not to call this method
+   *  if <code>in</code> reached the end of the stream, even if
+   *  <code>length=0</code>. */
+  public static int copyCompressedData(DataInput in, int length, DataOutput out) throws IOException {
+    int n = 0;
+    do {
+      // literals
+      final byte token = in.readByte();
+      out.writeByte(token);
+      int literalLen = (token & 0xFF) >>> 4;
+      if (literalLen == 0x0F) {
+        byte len;
+        while ((len = in.readByte()) == (byte) 0xFF) {
+          literalLen += 0xFF;
+          out.writeByte(len);
+        }
+        literalLen += len & 0xFF;
+        out.writeByte(len);
+      }
+      out.copyBytes(in, literalLen);
+      n += literalLen;
+      if (n >= length) {
+        break;
+      }
+
+      // matchs
+      out.copyBytes(in, 2); // match dec
+      int matchLen = token & 0x0F;
+      if (matchLen == 0x0F) {
+        byte len;
+        while ((len = in.readByte()) == (byte) 0xFF) {
+          matchLen += 0xFF;
+          out.writeByte(len);
+        }
+        matchLen += len & 0xFF;
+        out.writeByte(len);
+      }
+      matchLen += MIN_MATCH;
+      n += matchLen;
+    } while (n < length);
+    return n;
+  }
+
 }
--- a/lucene/codecs/src/test/org/apache/lucene/codecs/compressing/AbstractTestCompressionMode.java
+++ b/lucene/codecs/src/test/org/apache/lucene/codecs/compressing/AbstractTestCompressionMode.java
@ -62,38 +62,38 @@ public abstract class AbstractTestCompressionMode extends LuceneTestCase {
    return Arrays.copyOf(compressed, compressedLen);
  }

-  byte[] decompress(byte[] compressed) throws IOException {
+  byte[] decompress(byte[] compressed, int originalLength) throws IOException {
    Decompressor decompressor = mode.newDecompressor();
-    return decompress(decompressor, compressed);
+    return decompress(decompressor, compressed, originalLength);
  }

-  static byte[] decompress(Decompressor decompressor, byte[] compressed) throws IOException {
+  static byte[] decompress(Decompressor decompressor, byte[] compressed, int originalLength) throws IOException {
    final BytesRef bytes = new BytesRef();
-    decompressor.decompress(new ByteArrayDataInput(compressed), bytes);
+    decompressor.decompress(new ByteArrayDataInput(compressed), originalLength, 0, originalLength, bytes);
    return Arrays.copyOfRange(bytes.bytes, bytes.offset, bytes.offset + bytes.length);
  }

-  byte[] decompress(byte[] compressed, int offset, int length) throws IOException {
+  byte[] decompress(byte[] compressed, int originalLength, int offset, int length) throws IOException {
    Decompressor decompressor = mode.newDecompressor();
    final BytesRef bytes = new BytesRef();
-    decompressor.decompress(new ByteArrayDataInput(compressed), offset, length, bytes);
+    decompressor.decompress(new ByteArrayDataInput(compressed), originalLength, offset, length, bytes);
    return Arrays.copyOfRange(bytes.bytes, bytes.offset, bytes.offset + bytes.length);
  }

-  static byte[] copyCompressedData(Decompressor decompressor, byte[] compressed) throws IOException {
+  static byte[] copyCompressedData(Decompressor decompressor, byte[] compressed, int originalLength) throws IOException {
    GrowableByteArrayDataOutput out = new GrowableByteArrayDataOutput(compressed.length);
-    decompressor.copyCompressedData(new ByteArrayDataInput(compressed), out);
+    decompressor.copyCompressedData(new ByteArrayDataInput(compressed), originalLength, out);
    return Arrays.copyOf(out.bytes, out.length);
  }

-  byte[] copyCompressedData(byte[] compressed) throws IOException {
-    return copyCompressedData(mode.newDecompressor(), compressed);
+  byte[] copyCompressedData(byte[] compressed, int originalLength) throws IOException {
+    return copyCompressedData(mode.newDecompressor(), compressed, originalLength);
  }

  public void testDecompress() throws IOException {
    final byte[] decompressed = randomArray();
    final byte[] compressed = compress(decompressed);
-    final byte[] restored = decompress(compressed);
+    final byte[] restored = decompress(compressed, decompressed.length);
    assertArrayEquals(decompressed, restored);
  }

@ -109,7 +109,7 @@ public abstract class AbstractTestCompressionMode extends LuceneTestCase {
        offset = random().nextInt(decompressed.length);
        length = random().nextInt(decompressed.length - offset);
      }
-      final byte[] restored = decompress(compressed, offset, length);
+      final byte[] restored = decompress(compressed, decompressed.length, offset, length);
      assertArrayEquals(Arrays.copyOfRange(decompressed, offset, offset + length), restored);
    }
  }
@ -117,14 +117,14 @@ public abstract class AbstractTestCompressionMode extends LuceneTestCase {
  public void testCopyCompressedData() throws IOException {
    final byte[] decompressed = randomArray();
    final byte[] compressed = compress(decompressed);
-    assertArrayEquals(compressed, copyCompressedData(compressed));
+    assertArrayEquals(compressed, copyCompressedData(compressed, decompressed.length));
  }

  public void test(byte[] decompressed) throws IOException {
    final byte[] compressed = compress(decompressed);
-    final byte[] restored = decompress(compressed);
+    final byte[] restored = decompress(compressed, decompressed.length);
    assertEquals(decompressed.length, restored.length);
-    assertArrayEquals(compressed, copyCompressedData(compressed));
+    assertArrayEquals(compressed, copyCompressedData(compressed, decompressed.length));
  }

  public void testEmptySequence() throws IOException {