mirror of https://github.com/apache/lucene.git
LUCENE-10536: Slightly better compression of doc values' terms dictionaries. (#838)
Doc values terms dictionaries keep the first term of each block uncompressed so that binary searches across blocks remain reasonably efficient. The suffixes of the other 63 terms in a block are compressed together with LZ4 to take advantage of redundancy across suffixes. This change improves compression a bit by using the first (uncompressed) term of each block as a dictionary when compressing the suffixes of the other 63 terms. This especially helps compress the first few suffixes of a block, when there is not yet much context that can be leveraged to find duplicates.
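In isolation, the trick is: keep the block's first term uncompressed (so block-level binary search stays cheap), copy that same term to the front of the compression buffer, and LZ4-compress only the suffixes of the remaining 63 terms while letting the compressor reference the first term as a dictionary. Below is a minimal sketch of that layout; it reuses the LZ4.compressWithDictionary call from the diff, but the helper class, its parameters, and the buffer sizing are illustrative assumptions, not the actual codec code.

import java.io.IOException;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.compress.LZ4;

final class TermsDictBlockSketch {
  // Hypothetical helper: writes one terms-dict block as [first term, uncompressed]
  // followed by the LZ4-compressed suffixes, compressed against the first term as a dictionary.
  static void writeBlock(BytesRef firstTerm, byte[] suffixes, int suffixesLen, DataOutput data)
      throws IOException {
    // The first term goes to the output as-is so binary search across blocks stays possible.
    data.writeVInt(firstTerm.length);
    data.writeBytes(firstTerm.bytes, firstTerm.offset, firstTerm.length);

    // Buffer layout: [ first term (dictionary) | suffixes of the other terms ]
    byte[] buffer = new byte[firstTerm.length + suffixesLen];
    ByteArrayDataOutput out = new ByteArrayDataOutput(buffer);
    out.writeBytes(firstTerm.bytes, firstTerm.offset, firstTerm.length);
    int dictLength = firstTerm.length;
    out.writeBytes(suffixes, 0, suffixesLen);

    // Only the suffixes are compressed; LZ4 matches may point back into the dictionary prefix.
    int uncompressedLength = out.getPosition() - dictLength;
    data.writeVInt(uncompressedLength);
    LZ4.compressWithDictionary(
        buffer, 0, dictLength, uncompressedLength, data, new LZ4.FastCompressionHashTable());
  }
}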
parent 96036bca9f
commit 8f89db8048
@@ -141,6 +141,10 @@ Optimizations
 * LUCENE-8836: Speed up calls to TermsEnum#lookupOrd on doc values terms enums
   and sequences of increasing ords. (Bruno Roustant, Adrien Grand)
 
+* LUCENE-10536: Doc values terms dictionaries now use the first (uncompressed)
+  term of each block as a dictionary when compressing suffixes of the other 63
+  terms of the block. (Adrien Grand)
+
 * LUCENE-10411: Add nearest neighbors vectors support to ExitableDirectoryReader.
   (Zach Chen, Adrien Grand, Julie Tibshirani, Tomoko Uchida)
 
@@ -565,18 +565,26 @@ final class Lucene90DocValuesConsumer extends DocValuesConsumer {
 
     LZ4.FastCompressionHashTable ht = new LZ4.FastCompressionHashTable();
     ByteArrayDataOutput bufferedOutput = new ByteArrayDataOutput(termsDictBuffer);
+    int dictLength = 0;
 
     for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
       if ((ord & blockMask) == 0) {
-        if (bufferedOutput.getPosition() > 0) {
-          maxBlockLength =
-              Math.max(maxBlockLength, compressAndGetTermsDictBlockLength(bufferedOutput, ht));
+        if (ord != 0) {
+          // flush the previous block
+          final int uncompressedLength =
+              compressAndGetTermsDictBlockLength(bufferedOutput, dictLength, ht);
+          maxBlockLength = Math.max(maxBlockLength, uncompressedLength);
           bufferedOutput.reset(termsDictBuffer);
         }
 
         writer.add(data.getFilePointer() - start);
+        // Write the first term both to the index output, and to the buffer where we'll use it as a
+        // dictionary for compression
         data.writeVInt(term.length);
         data.writeBytes(term.bytes, term.offset, term.length);
+        bufferedOutput = maybeGrowBuffer(bufferedOutput, term.length);
+        bufferedOutput.writeBytes(term.bytes, term.offset, term.length);
+        dictLength = term.length;
       } else {
         final int prefixLength = StringHelper.bytesDifference(previous.get(), term);
         final int suffixLength = term.length - prefixLength;
@@ -598,9 +606,10 @@ final class Lucene90DocValuesConsumer extends DocValuesConsumer {
       ++ord;
     }
     // Compress and write out the last block
-    if (bufferedOutput.getPosition() > 0) {
-      maxBlockLength =
-          Math.max(maxBlockLength, compressAndGetTermsDictBlockLength(bufferedOutput, ht));
+    if (bufferedOutput.getPosition() > dictLength) {
+      final int uncompressedLength =
+          compressAndGetTermsDictBlockLength(bufferedOutput, dictLength, ht);
+      maxBlockLength = Math.max(maxBlockLength, uncompressedLength);
     }
 
     writer.finish();
@@ -619,15 +628,12 @@ final class Lucene90DocValuesConsumer extends DocValuesConsumer {
   }
 
   private int compressAndGetTermsDictBlockLength(
-      ByteArrayDataOutput bufferedOutput, LZ4.FastCompressionHashTable ht) throws IOException {
-    int uncompressedLength = bufferedOutput.getPosition();
+      ByteArrayDataOutput bufferedOutput, int dictLength, LZ4.FastCompressionHashTable ht)
+      throws IOException {
+    int uncompressedLength = bufferedOutput.getPosition() - dictLength;
     data.writeVInt(uncompressedLength);
-    long before = data.getFilePointer();
-    LZ4.compress(termsDictBuffer, 0, uncompressedLength, data, ht);
-    int compressedLength = (int) (data.getFilePointer() - before);
-    // Block length will be used for creating buffer for decompression, one corner case is that
-    // compressed length might be bigger than un-compressed length, so just return the bigger one.
-    return Math.max(uncompressedLength, compressedLength);
+    LZ4.compressWithDictionary(termsDictBuffer, 0, dictLength, uncompressedLength, data, ht);
+    return uncompressedLength;
   }
 
   private ByteArrayDataOutput maybeGrowBuffer(ByteArrayDataOutput bufferedOutput, int termLength) {
@@ -1076,8 +1076,9 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
       indexBytes = data.slice("terms-index", entry.termsIndexOffset, entry.termsIndexLength);
       term = new BytesRef(entry.maxTermLength);
 
+      // add the max term length for the dictionary
       // add 7 padding bytes can help decompression run faster.
-      int bufferSize = entry.maxBlockLength + LZ4_DECOMPRESSOR_PADDING;
+      int bufferSize = entry.maxBlockLength + entry.maxTermLength + LZ4_DECOMPRESSOR_PADDING;
       blockBuffer = new BytesRef(new byte[bufferSize], 0, bufferSize);
     }
 
@@ -1235,9 +1236,11 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
       if (offset < entry.termsDataLength - 1) {
         // Avoid decompress again if we are reading a same block.
         if (currentCompressedBlockStart != offset) {
-          int decompressLength = bytes.readVInt();
-          // Decompress the remaining of current block
-          LZ4.decompress(bytes, decompressLength, blockBuffer.bytes, 0);
+          blockBuffer.offset = term.length;
+          blockBuffer.length = bytes.readVInt();
+          // Decompress the remaining of current block, using the first term as a dictionary
+          System.arraycopy(term.bytes, 0, blockBuffer.bytes, 0, blockBuffer.offset);
+          LZ4.decompress(bytes, blockBuffer.length, blockBuffer.bytes, blockBuffer.offset);
           currentCompressedBlockStart = offset;
           currentCompressedBlockEnd = bytes.getFilePointer();
         } else {
@@ -1246,7 +1249,8 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
       }
 
       // Reset the buffer.
-      blockInput = new ByteArrayDataInput(blockBuffer.bytes, 0, blockBuffer.length);
+      blockInput =
+          new ByteArrayDataInput(blockBuffer.bytes, blockBuffer.offset, blockBuffer.length);
     }
   }
 
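For symmetry, here is a minimal sketch (not part of the commit) of the read-side step the producer changes above imply: the block buffer is seeded with the first term at offset 0 and the suffixes are decompressed right after it, so LZ4 back-references can reach into the term. The helper class and its parameters are hypothetical; only the LZ4.decompress call and the buffer layout mirror the diff.

import java.io.IOException;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.compress.LZ4;

final class TermsDictBlockReaderSketch {
  // Hypothetical helper: fills blockBuffer with [ first term | decompressed suffixes ],
  // matching the layout produced on the write side. blockBuffer.bytes is assumed to be
  // sized for the max term length plus the max block length plus LZ4 decompressor padding.
  static void readBlock(DataInput bytes, BytesRef firstTerm, BytesRef blockBuffer)
      throws IOException {
    blockBuffer.offset = firstTerm.length; // suffixes start right after the dictionary
    blockBuffer.length = bytes.readVInt(); // uncompressed length of the suffixes
    // Seed the front of the buffer with the first term so LZ4 matches can reference it.
    System.arraycopy(firstTerm.bytes, firstTerm.offset, blockBuffer.bytes, 0, firstTerm.length);
    LZ4.decompress(bytes, blockBuffer.length, blockBuffer.bytes, blockBuffer.offset);
  }
}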
@@ -928,4 +928,34 @@ public class TestLucene90DocValuesFormat extends BaseCompressingDocValuesFormatTestCase {
       assertEquals(terms.get(i), te.term());
     }
   }
 
+  // Exercise the logic that leverages the first term of a block as a dictionary for suffixes of
+  // other terms
+  public void testTermsEnumDictionary() throws IOException {
+    Directory directory = newDirectory();
+    IndexWriterConfig conf = newIndexWriterConfig();
+    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, conf);
+    Document doc = new Document();
+    SortedDocValuesField field = new SortedDocValuesField("field", new BytesRef("abc0defghijkl"));
+    doc.add(field);
+    iwriter.addDocument(doc);
+    field.setBytesValue(new BytesRef("abc1defghijkl"));
+    iwriter.addDocument(doc);
+    field.setBytesValue(new BytesRef("abc2defghijkl"));
+    iwriter.addDocument(doc);
+    iwriter.forceMerge(1);
+    iwriter.close();
+
+    IndexReader reader = DirectoryReader.open(directory);
+    LeafReader leafReader = getOnlyLeafReader(reader);
+    SortedDocValues values = leafReader.getSortedDocValues("field");
+    TermsEnum termsEnum = values.termsEnum();
+    assertEquals(new BytesRef("abc0defghijkl"), termsEnum.next());
+    assertEquals(new BytesRef("abc1defghijkl"), termsEnum.next());
+    assertEquals(new BytesRef("abc2defghijkl"), termsEnum.next());
+    assertNull(termsEnum.next());
+
+    reader.close();
+    directory.close();
+  }
 }