LUCENE-9663: Add compression to terms dict from SortedSet/Sorted DocValues.

Closes #2302
jaison 2021-02-09 11:47:16 +01:00 committed by Bruno Roustant
parent 227ef3b397
commit 5856c0f176
5 changed files with 481 additions and 21 deletions
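The compressed terms dictionary is tied to the codec's BEST_COMPRESSION mode, as exercised by the new TestDocValuesCompression test below. A minimal sketch of opting in from application code (the directory and field name are illustrative, not part of this commit):

import org.apache.lucene.codecs.lucene90.Lucene90Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;

public class CompressedTermsDictExample {
  public static void main(String[] args) throws Exception {
    try (Directory dir = new ByteBuffersDirectory()) {
      IndexWriterConfig config = new IndexWriterConfig();
      // BEST_COMPRESSION now also selects the compressed terms dict in
      // Lucene80DocValuesFormat once a field crosses the cardinality threshold.
      config.setCodec(new Lucene90Codec(Lucene90Codec.Mode.BEST_COMPRESSION));
      try (IndexWriter writer = new IndexWriter(dir, config)) {
        Document doc = new Document();
        doc.add(new SortedSetDocValuesField("category", new BytesRef("books")));
        writer.addDocument(doc);
      }
    }
  }
}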

CHANGES.txt

@@ -186,6 +186,9 @@ Improvements
* LUCENE-9674: Implement faster advance on VectorValues using binary search.
(Anand Kotriwal, Mike Sokolov)
* LUCENE-9663: Add compression to terms dict from SortedSet/Sorted DocValues.
(Jaison Bi via Bruno Roustant)
Bug fixes
* LUCENE-9686: Fix read past EOF handling in DirectIODirectory. (Zach Chen,

Lucene80DocValuesConsumer.java

@@ -43,9 +43,11 @@ import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.SortedSetSelector;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.store.ByteBuffersIndexOutput;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil;
@@ -66,6 +68,7 @@ final class Lucene80DocValuesConsumer extends DocValuesConsumer implements Closeable
IndexOutput data, meta;
final int maxDoc;
private final SegmentWriteState state;
private byte[] termsDictBuffer;
/** expert: Creates a new writer */
public Lucene80DocValuesConsumer(
@@ -77,6 +80,9 @@ final class Lucene80DocValuesConsumer extends DocValuesConsumer implements Closeable
Lucene80DocValuesFormat.Mode mode)
throws IOException {
this.mode = mode;
if (Lucene80DocValuesFormat.Mode.BEST_COMPRESSION == this.mode) {
this.termsDictBuffer = new byte[1 << 14];
}
boolean success = false;
try {
this.state = state;
@@ -736,15 +742,26 @@ final class Lucene80DocValuesConsumer extends DocValuesConsumer implements Closeable
private void addTermsDict(SortedSetDocValues values) throws IOException {
final long size = values.getValueCount();
meta.writeVLong(size);
meta.writeInt(Lucene80DocValuesFormat.TERMS_DICT_BLOCK_SHIFT);
boolean compress =
Lucene80DocValuesFormat.Mode.BEST_COMPRESSION == mode
&& values.getValueCount()
> Lucene80DocValuesFormat.TERMS_DICT_BLOCK_COMPRESSION_THRESHOLD;
int code, blockMask, shift;
if (compress) {
code = Lucene80DocValuesFormat.TERMS_DICT_BLOCK_LZ4_CODE;
blockMask = Lucene80DocValuesFormat.TERMS_DICT_BLOCK_LZ4_MASK;
shift = Lucene80DocValuesFormat.TERMS_DICT_BLOCK_LZ4_SHIFT;
} else {
code = shift = Lucene80DocValuesFormat.TERMS_DICT_BLOCK_SHIFT;
blockMask = Lucene80DocValuesFormat.TERMS_DICT_BLOCK_MASK;
}
meta.writeInt(code);
meta.writeInt(DIRECT_MONOTONIC_BLOCK_SHIFT);
ByteBuffersDataOutput addressBuffer = new ByteBuffersDataOutput();
ByteBuffersIndexOutput addressOutput =
new ByteBuffersIndexOutput(addressBuffer, "temp", "temp");
meta.writeInt(DIRECT_MONOTONIC_BLOCK_SHIFT);
long numBlocks =
(size + Lucene80DocValuesFormat.TERMS_DICT_BLOCK_MASK)
>>> Lucene80DocValuesFormat.TERMS_DICT_BLOCK_SHIFT;
long numBlocks = (size + blockMask) >>> shift;
DirectMonotonicWriter writer =
DirectMonotonicWriter.getInstance(
meta, addressOutput, numBlocks, DIRECT_MONOTONIC_BLOCK_SHIFT);
@@ -752,10 +769,24 @@ final class Lucene80DocValuesConsumer extends DocValuesConsumer implements Closeable
BytesRefBuilder previous = new BytesRefBuilder();
long ord = 0;
long start = data.getFilePointer();
int maxLength = 0;
int maxLength = 0, maxBlockLength = 0;
TermsEnum iterator = values.termsEnum();
LZ4.FastCompressionHashTable ht = null;
ByteArrayDataOutput bufferedOutput = null;
if (compress) {
ht = new LZ4.FastCompressionHashTable();
bufferedOutput = new ByteArrayDataOutput(termsDictBuffer);
}
for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
if ((ord & Lucene80DocValuesFormat.TERMS_DICT_BLOCK_MASK) == 0) {
if ((ord & blockMask) == 0) {
if (compress && bufferedOutput.getPosition() > 0) {
maxBlockLength =
Math.max(maxBlockLength, compressAndGetTermsDictBlockLength(bufferedOutput, ht));
bufferedOutput.reset(termsDictBuffer);
}
writer.add(data.getFilePointer() - start);
data.writeVInt(term.length);
data.writeBytes(term.bytes, term.offset, term.length);
@@ -763,22 +794,40 @@ final class Lucene80DocValuesConsumer extends DocValuesConsumer implements Closeable
final int prefixLength = StringHelper.bytesDifference(previous.get(), term);
final int suffixLength = term.length - prefixLength;
assert suffixLength > 0; // terms are unique
data.writeByte((byte) (Math.min(prefixLength, 15) | (Math.min(15, suffixLength - 1) << 4)));
DataOutput blockOutput;
if (compress) {
// Will write at most (suffixLength + 1 byte + 2 vints) = suffixLength + 11 bytes. Grow the buffer if needed.
bufferedOutput = maybeGrowBuffer(bufferedOutput, suffixLength + 11);
blockOutput = bufferedOutput;
} else {
blockOutput = data;
}
blockOutput.writeByte(
(byte) (Math.min(prefixLength, 15) | (Math.min(15, suffixLength - 1) << 4)));
if (prefixLength >= 15) {
data.writeVInt(prefixLength - 15);
blockOutput.writeVInt(prefixLength - 15);
}
if (suffixLength >= 16) {
data.writeVInt(suffixLength - 16);
blockOutput.writeVInt(suffixLength - 16);
}
data.writeBytes(term.bytes, term.offset + prefixLength, term.length - prefixLength);
blockOutput.writeBytes(term.bytes, term.offset + prefixLength, suffixLength);
}
maxLength = Math.max(maxLength, term.length);
previous.copyBytes(term);
++ord;
}
// Compress and write out the last block
if (compress && bufferedOutput.getPosition() > 0) {
maxBlockLength =
Math.max(maxBlockLength, compressAndGetTermsDictBlockLength(bufferedOutput, ht));
}
writer.finish();
meta.writeInt(maxLength);
if (compress) {
// Write one more int to store the max block length (compressed terms dict only).
meta.writeInt(maxBlockLength);
}
meta.writeLong(start);
meta.writeLong(data.getFilePointer() - start);
start = data.getFilePointer();
@@ -790,6 +839,27 @@ final class Lucene80DocValuesConsumer extends DocValuesConsumer implements Closeable
writeTermsIndex(values);
}
private int compressAndGetTermsDictBlockLength(
ByteArrayDataOutput bufferedOutput, LZ4.FastCompressionHashTable ht) throws IOException {
int uncompressedLength = bufferedOutput.getPosition();
data.writeVInt(uncompressedLength);
long before = data.getFilePointer();
LZ4.compress(termsDictBuffer, 0, uncompressedLength, data, ht);
int compressedLength = (int) (data.getFilePointer() - before);
// The block length is used to size the decompression buffer. In the corner case where the
// compressed length is bigger than the uncompressed length, return the bigger of the two.
return Math.max(uncompressedLength, compressedLength);
}
private ByteArrayDataOutput maybeGrowBuffer(ByteArrayDataOutput bufferedOutput, int termLength) {
int pos = bufferedOutput.getPosition(), originalLength = termsDictBuffer.length;
if (pos + termLength >= originalLength - 1) {
termsDictBuffer = ArrayUtil.grow(termsDictBuffer, originalLength + termLength);
bufferedOutput = new ByteArrayDataOutput(termsDictBuffer, pos, termsDictBuffer.length - pos);
}
return bufferedOutput;
}
private void writeTermsIndex(SortedSetDocValues values) throws IOException {
final long size = values.getValueCount();
meta.writeInt(Lucene80DocValuesFormat.TERMS_DICT_REVERSE_INDEX_SHIFT);

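On the writer side above, each block holds 1 << TERMS_DICT_BLOCK_LZ4_SHIFT = 64 terms: the first term of a block is written to data uncompressed (vint length plus bytes), while the prefix-coded suffixes of the remaining 63 terms are buffered and flushed as a single LZ4 block preceded by its vint uncompressed length. A minimal round-trip sketch using the same org.apache.lucene.util.compress.LZ4 API (the buffer contents are illustrative):

import java.nio.charset.StandardCharsets;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.util.compress.LZ4;

public class TermsDictBlockRoundTrip {
  public static void main(String[] args) throws Exception {
    // One buffered block of prefix-coded suffixes (contents illustrative).
    byte[] block = "suffixes of up to 63 terms".getBytes(StandardCharsets.UTF_8);

    // Writer side, as in compressAndGetTermsDictBlockLength: LZ4-compress the buffer.
    ByteBuffersDataOutput out = new ByteBuffersDataOutput();
    LZ4.compress(block, 0, block.length, out, new LZ4.FastCompressionHashTable());

    // Reader side, as in TermsDict: decompress into a buffer sized with
    // LZ4_DECOMPRESSOR_PADDING (7) extra bytes.
    byte[] restored = new byte[block.length + 7];
    LZ4.decompress(new ByteArrayDataInput(out.toArrayCopy()), block.length, restored, 0);
  }
}
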
Lucene80DocValuesFormat.java

@@ -203,6 +203,15 @@ public final class Lucene80DocValuesFormat extends DocValuesFormat {
static final int TERMS_DICT_BLOCK_SIZE = 1 << TERMS_DICT_BLOCK_SHIFT;
static final int TERMS_DICT_BLOCK_MASK = TERMS_DICT_BLOCK_SIZE - 1;
static final int TERMS_DICT_BLOCK_COMPRESSION_THRESHOLD = 32;
static final int TERMS_DICT_BLOCK_LZ4_SHIFT = 6;
static final int TERMS_DICT_BLOCK_LZ4_SIZE = 1 << TERMS_DICT_BLOCK_LZ4_SHIFT;
static final int TERMS_DICT_BLOCK_LZ4_MASK = TERMS_DICT_BLOCK_LZ4_SIZE - 1;
static final int TERMS_DICT_COMPRESSOR_LZ4_CODE = 1;
// Write a special code so we know this is an LZ4-compressed block.
static final int TERMS_DICT_BLOCK_LZ4_CODE =
TERMS_DICT_BLOCK_LZ4_SHIFT << 16 | TERMS_DICT_COMPRESSOR_LZ4_CODE;
static final int TERMS_DICT_REVERSE_INDEX_SHIFT = 10;
static final int TERMS_DICT_REVERSE_INDEX_SIZE = 1 << TERMS_DICT_REVERSE_INDEX_SHIFT;
static final int TERMS_DICT_REVERSE_INDEX_MASK = TERMS_DICT_REVERSE_INDEX_SIZE - 1;
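Note that the special code cannot collide with a plain block shift: an uncompressed dictionary stores the shift itself (TERMS_DICT_BLOCK_SHIFT, a small value), while the compressed one stores

// TERMS_DICT_BLOCK_LZ4_CODE = TERMS_DICT_BLOCK_LZ4_SHIFT << 16 | TERMS_DICT_COMPRESSOR_LZ4_CODE
//                           = 6 << 16 | 1 = 0x60001 = 393217

so readTermDict in the producer can branch on this single int unambiguously.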

Lucene80DocValuesProducer.java

@@ -38,7 +38,9 @@ import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.TermsEnum.SeekStatus;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.RandomAccessInput;
import org.apache.lucene.util.BytesRef;
@@ -285,12 +287,24 @@ final class Lucene80DocValuesProducer extends DocValuesProducer implements Closeable
private static void readTermDict(IndexInput meta, TermsDictEntry entry) throws IOException {
entry.termsDictSize = meta.readVLong();
entry.termsDictBlockShift = meta.readInt();
int termsDictBlockCode = meta.readInt();
if (Lucene80DocValuesFormat.TERMS_DICT_BLOCK_LZ4_CODE == termsDictBlockCode) {
// This is an LZ4-compressed block.
entry.compressed = true;
entry.termsDictBlockShift = Lucene80DocValuesFormat.TERMS_DICT_BLOCK_LZ4_SHIFT;
} else {
entry.termsDictBlockShift = termsDictBlockCode;
}
final int blockShift = meta.readInt();
final long addressesSize =
(entry.termsDictSize + (1L << entry.termsDictBlockShift) - 1) >>> entry.termsDictBlockShift;
entry.termsAddressesMeta = DirectMonotonicReader.loadMeta(meta, addressesSize, blockShift);
entry.maxTermLength = meta.readInt();
// Read one more int for the compressed terms dict.
if (entry.compressed) {
entry.maxBlockLength = meta.readInt();
}
entry.termsDataOffset = meta.readLong();
entry.termsDataLength = meta.readLong();
entry.termsAddressesOffset = meta.readLong();
@@ -375,6 +389,9 @@ final class Lucene80DocValuesProducer extends DocValuesProducer implements Closeable
long termsIndexLength;
long termsIndexAddressesOffset;
long termsIndexAddressesLength;
boolean compressed;
int maxBlockLength;
}
private static class SortedEntry extends TermsDictEntry {
@@ -1149,6 +1166,7 @@ final class Lucene80DocValuesProducer extends DocValuesProducer implements Closeable
}
private static class TermsDict extends BaseTermsEnum {
static final int LZ4_DECOMPRESSOR_PADDING = 7;
final TermsDictEntry entry;
final LongValues blockAddresses;
@@ -1159,6 +1177,11 @@ final class Lucene80DocValuesProducer extends DocValuesProducer implements Closeable
final BytesRef term;
long ord = -1;
BytesRef blockBuffer = null;
ByteArrayDataInput blockInput = null;
long currentCompressedBlockStart = -1;
long currentCompressedBlockEnd = -1;
TermsDict(TermsDictEntry entry, IndexInput data) throws IOException {
this.entry = entry;
RandomAccessInput addressesSlice =
@@ -1172,6 +1195,12 @@ final class Lucene80DocValuesProducer extends DocValuesProducer implements Closeable
DirectMonotonicReader.getInstance(entry.termsIndexAddressesMeta, indexAddressesSlice);
indexBytes = data.slice("terms-index", entry.termsIndexOffset, entry.termsIndexLength);
term = new BytesRef(entry.maxTermLength);
if (entry.compressed) {
// Adding 7 padding bytes helps decompression run faster.
int bufferSize = entry.maxBlockLength + LZ4_DECOMPRESSOR_PADDING;
blockBuffer = new BytesRef(new byte[bufferSize], 0, bufferSize);
}
}
@Override
@@ -1179,21 +1208,27 @@ final class Lucene80DocValuesProducer extends DocValuesProducer implements Closeable
if (++ord >= entry.termsDictSize) {
return null;
}
if ((ord & blockMask) == 0L) {
term.length = bytes.readVInt();
bytes.readBytes(term.bytes, 0, term.length);
if (this.entry.compressed) {
decompressBlock();
} else {
term.length = bytes.readVInt();
bytes.readBytes(term.bytes, 0, term.length);
}
} else {
final int token = Byte.toUnsignedInt(bytes.readByte());
DataInput input = this.entry.compressed ? blockInput : bytes;
final int token = Byte.toUnsignedInt(input.readByte());
int prefixLength = token & 0x0F;
int suffixLength = 1 + (token >>> 4);
if (prefixLength == 15) {
prefixLength += bytes.readVInt();
prefixLength += input.readVInt();
}
if (suffixLength == 16) {
suffixLength += bytes.readVInt();
suffixLength += input.readVInt();
}
term.length = prefixLength + suffixLength;
bytes.readBytes(term.bytes, prefixLength, suffixLength);
input.readBytes(term.bytes, prefixLength, suffixLength);
}
return term;
}
@@ -1292,8 +1327,13 @@ final class Lucene80DocValuesProducer extends DocValuesProducer implements Closeable
final long blockAddress = blockAddresses.get(block);
this.ord = block << entry.termsDictBlockShift;
bytes.seek(blockAddress);
term.length = bytes.readVInt();
bytes.readBytes(term.bytes, 0, term.length);
if (this.entry.compressed) {
decompressBlock();
} else {
term.length = bytes.readVInt();
bytes.readBytes(term.bytes, 0, term.length);
}
while (true) {
int cmp = term.compareTo(text);
if (cmp == 0) {
@@ -1307,6 +1347,30 @@ final class Lucene80DocValuesProducer extends DocValuesProducer implements Closeable
}
}
private void decompressBlock() throws IOException {
// The first term of each block is kept uncompressed, so there is no need to decompress
// the block when a seek only needs to look at the first term.
term.length = bytes.readVInt();
bytes.readBytes(term.bytes, 0, term.length);
long offset = bytes.getFilePointer();
if (offset < entry.termsDataLength - 1) {
// Avoid decompressing again if we are reading the same block.
if (currentCompressedBlockStart != offset) {
int decompressLength = bytes.readVInt();
// Decompress the remainder of the current block
LZ4.decompress(bytes, decompressLength, blockBuffer.bytes, 0);
currentCompressedBlockStart = offset;
currentCompressedBlockEnd = bytes.getFilePointer();
} else {
// Skip decompression, but re-seek to the end of the block.
bytes.seek(currentCompressedBlockEnd);
}
// Reset the buffer.
blockInput = new ByteArrayDataInput(blockBuffer.bytes, 0, blockBuffer.length);
}
}
@Override
public BytesRef term() throws IOException {
return term;

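The token decoded in next() above packs both lengths into one byte: the low 4 bits hold min(prefixLength, 15) and the high 4 bits hold min(15, suffixLength - 1), with vint overflow values following. A worked example with illustrative lengths:

// A term sharing prefixLength = 20 bytes with the previous term, plus a
// suffixLength = 3 byte suffix, is encoded as:
//   token = min(20, 15) | (min(15, 3 - 1) << 4) = 15 | (2 << 4) = 0x2F
// The low nibble is 15, so a vint (20 - 15 = 5) follows; the high nibble 2 < 15
// needs no vint. The reader recovers prefixLength = 15 + 5 = 20 and
// suffixLength = 1 + (0x2F >>> 4) = 3, then reads the 3 suffix bytes.
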
TestDocValuesCompression.java

@@ -0,0 +1,314 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene80;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.lucene90.Lucene90Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
public class TestDocValuesCompression extends LuceneTestCase {
private final Codec bestSpeed = new Lucene90Codec(Lucene90Codec.Mode.BEST_SPEED);
private final Codec bestCompression = new Lucene90Codec(Lucene90Codec.Mode.BEST_COMPRESSION);
public void testTermsDictCompressionForLowCardinalityFields() throws IOException {
final int CARDINALITY = Lucene80DocValuesFormat.TERMS_DICT_BLOCK_COMPRESSION_THRESHOLD - 1;
Set<String> valuesSet = new HashSet<>();
for (int i = 0; i < CARDINALITY; ++i) {
final int length = TestUtil.nextInt(random(), 10, 30);
String value = TestUtil.randomSimpleString(random(), length);
valuesSet.add(value);
}
List<String> values = new ArrayList<>(valuesSet);
long sizeForBestSpeed = writeAndGetDocValueFileSize(bestSpeed, values);
long sizeForBestCompression = writeAndGetDocValueFileSize(bestCompression, values);
// Ensure terms dict data was not compressed for low-cardinality fields.
assertEquals(sizeForBestSpeed, sizeForBestCompression);
}
public void testTermsDictCompressionForHighCardinalityFields() throws IOException {
final int CARDINALITY = Lucene80DocValuesFormat.TERMS_DICT_BLOCK_COMPRESSION_THRESHOLD << 1;
Set<String> valuesSet = new HashSet<>();
for (int i = 0; i < CARDINALITY; ++i) {
final int length = TestUtil.nextInt(random(), 10, 30);
String value = TestUtil.randomSimpleString(random(), length);
// Add common suffix for better compression ratio.
valuesSet.add(value + "_CommonPartBetterForCompression");
}
List<String> values = new ArrayList<>(valuesSet);
long sizeForBestSpeed = writeAndGetDocValueFileSize(bestSpeed, values);
long sizeForBestCompression = writeAndGetDocValueFileSize(bestCompression, values);
// Compression happened.
assertTrue(sizeForBestCompression < sizeForBestSpeed);
}
public void testReseekAfterSkipDecompression() throws IOException {
final int CARDINALITY = (Lucene80DocValuesFormat.TERMS_DICT_BLOCK_LZ4_SIZE << 1) + 11;
Set<String> valueSet = new HashSet<>(CARDINALITY);
for (int i = 0; i < CARDINALITY; i++) {
valueSet.add(TestUtil.randomSimpleString(random(), 64));
}
List<String> values = new ArrayList<>(valueSet);
Collections.sort(values);
// Create one non-existent value lying just between block 1 and block 2.
String nonexistentValue =
values.get(Lucene80DocValuesFormat.TERMS_DICT_BLOCK_LZ4_SIZE - 1)
+ TestUtil.randomSimpleString(random(), 64, 128);
int docValues = values.size();
try (Directory directory = newDirectory()) {
Analyzer analyzer = new StandardAnalyzer();
IndexWriterConfig config = new IndexWriterConfig(analyzer);
config.setCodec(bestCompression);
config.setUseCompoundFile(false);
IndexWriter writer = new IndexWriter(directory, config);
for (int i = 0; i < 280; i++) {
Document doc = new Document();
doc.add(new StringField("id", "Doc" + i, Field.Store.NO));
doc.add(new SortedDocValuesField("sdv", new BytesRef(values.get(i % docValues))));
writer.addDocument(doc);
}
writer.commit();
writer.forceMerge(1);
DirectoryReader dReader = DirectoryReader.open(writer);
writer.close();
LeafReader reader = getOnlyLeafReader(dReader);
// Check values count.
SortedDocValues ssdvMulti = reader.getSortedDocValues("sdv");
assertEquals(docValues, ssdvMulti.getValueCount());
// Seek to first block.
int ord1 = ssdvMulti.lookupTerm(new BytesRef(values.get(0)));
assertTrue(ord1 >= 0);
int ord2 = ssdvMulti.lookupTerm(new BytesRef(values.get(1)));
assertTrue(ord2 >= ord1);
// Ensure re-seek logic is correct after skip-decompression.
int nonexistentOrd2 = ssdvMulti.lookupTerm(new BytesRef(nonexistentValue));
assertTrue(nonexistentOrd2 < 0);
dReader.close();
}
}
public void testLargeTermsCompression() throws IOException {
final int CARDINALITY = Lucene80DocValuesFormat.TERMS_DICT_BLOCK_COMPRESSION_THRESHOLD << 1;
Set<String> valuesSet = new HashSet<>();
for (int i = 0; i < CARDINALITY; ++i) {
final int length = TestUtil.nextInt(random(), 512, 1024);
valuesSet.add(TestUtil.randomSimpleString(random(), length));
}
int valuesCount = valuesSet.size();
List<String> values = new ArrayList<>(valuesSet);
try (Directory directory = newDirectory()) {
Analyzer analyzer = new StandardAnalyzer();
IndexWriterConfig config = new IndexWriterConfig(analyzer);
config.setCodec(bestCompression);
config.setUseCompoundFile(false);
IndexWriter writer = new IndexWriter(directory, config);
for (int i = 0; i < 256; i++) {
Document doc = new Document();
doc.add(new StringField("id", "Doc" + i, Field.Store.NO));
doc.add(new SortedDocValuesField("sdv", new BytesRef(values.get(i % valuesCount))));
writer.addDocument(doc);
}
writer.commit();
writer.forceMerge(1);
DirectoryReader ireader = DirectoryReader.open(writer);
writer.close();
LeafReader reader = getOnlyLeafReader(ireader);
// Check values count.
SortedDocValues ssdvMulti = reader.getSortedDocValues("sdv");
assertEquals(valuesCount, ssdvMulti.getValueCount());
ireader.close();
}
}
// Ensure an old (uncompressed) segment can be merged with a new compressed segment.
public void testMergeWithUncompressedSegment() throws IOException {
final int CARDINALITY = Lucene80DocValuesFormat.TERMS_DICT_BLOCK_COMPRESSION_THRESHOLD << 1;
Set<String> valuesSet = new HashSet<>();
for (int i = 0; i < CARDINALITY; ++i) {
final int length = TestUtil.nextInt(random(), 10, 30);
valuesSet.add(TestUtil.randomSimpleString(random(), length));
}
List<String> values = new ArrayList<>(valuesSet);
int valuesCount = values.size();
try (Directory directory = newDirectory()) {
// 1. Write 256 documents without terms dict compression.
Analyzer analyzer = new StandardAnalyzer();
IndexWriterConfig config = new IndexWriterConfig(analyzer);
config.setCodec(bestSpeed);
config.setUseCompoundFile(false);
IndexWriter writer = new IndexWriter(directory, config);
for (int i = 0; i < 256; i++) {
Document doc = new Document();
doc.add(new StringField("id", "Doc" + i, Field.Store.NO));
doc.add(new SortedSetDocValuesField("ssdv", new BytesRef(values.get(i % valuesCount))));
doc.add(
new SortedSetDocValuesField("ssdv", new BytesRef(values.get((i + 1) % valuesCount))));
doc.add(
new SortedSetDocValuesField("ssdv", new BytesRef(values.get((i + 2) % valuesCount))));
doc.add(new SortedDocValuesField("sdv", new BytesRef(values.get(i % valuesCount))));
writer.addDocument(doc);
}
writer.commit();
DirectoryReader ireader = DirectoryReader.open(writer);
assertEquals(256, ireader.numDocs());
LeafReader reader = getOnlyLeafReader(ireader);
SortedSetDocValues ssdv = reader.getSortedSetDocValues("ssdv");
assertEquals(valuesCount, ssdv.getValueCount());
SortedDocValues sdv = reader.getSortedDocValues("sdv");
assertEquals(valuesCount, sdv.getValueCount());
ireader.close();
writer.close();
// 2. Add another 100 documents with terms dict compression enabled.
config = new IndexWriterConfig(analyzer);
config.setCodec(bestCompression);
config.setUseCompoundFile(false);
writer = new IndexWriter(directory, config);
// Add 2 new values.
valuesSet.add(TestUtil.randomSimpleString(random(), 10));
valuesSet.add(TestUtil.randomSimpleString(random(), 10));
values = new ArrayList<>(valuesSet);
valuesCount = valuesSet.size();
for (int i = 256; i < 356; i++) {
Document doc = new Document();
doc.add(new StringField("id", "Doc" + i, Field.Store.NO));
doc.add(new SortedSetDocValuesField("ssdv", new BytesRef(values.get(i % valuesCount))));
doc.add(new SortedDocValuesField("sdv", new BytesRef(values.get(i % valuesCount))));
writer.addDocument(doc);
}
writer.commit();
writer.forceMerge(1);
ireader = DirectoryReader.open(writer);
assertEquals(356, ireader.numDocs());
reader = getOnlyLeafReader(ireader);
ssdv = reader.getSortedSetDocValues("ssdv");
assertEquals(valuesCount, ssdv.getValueCount());
ireader.close();
writer.close();
}
}
private static long writeAndGetDocValueFileSize(Codec codec, List<String> values)
throws IOException {
int valuesCount = values.size();
long dvdFileSize = -1;
try (Directory directory = newDirectory()) {
Analyzer analyzer = new StandardAnalyzer();
IndexWriterConfig config = new IndexWriterConfig(analyzer);
config.setCodec(codec);
config.setUseCompoundFile(false);
IndexWriter writer = new IndexWriter(directory, config);
for (int i = 0; i < 256; i++) {
Document doc = new Document();
doc.add(new StringField("id", "Doc" + i, Field.Store.NO));
// Multi value sorted-set field.
doc.add(
new SortedSetDocValuesField("ssdv_multi_", new BytesRef(values.get(i % valuesCount))));
doc.add(
new SortedSetDocValuesField(
"ssdv_multi_", new BytesRef(values.get((i + 1) % valuesCount))));
doc.add(
new SortedSetDocValuesField(
"ssdv_multi_", new BytesRef(values.get((i + 2) % valuesCount))));
// Single value sorted-set field.
doc.add(
new SortedSetDocValuesField("ssdv_single_", new BytesRef(values.get(i % valuesCount))));
// Sorted field.
doc.add(new SortedDocValuesField("sdv", new BytesRef(values.get(i % valuesCount))));
writer.addDocument(doc);
}
writer.commit();
writer.forceMerge(1);
DirectoryReader ireader = DirectoryReader.open(writer);
writer.close();
LeafReader reader = getOnlyLeafReader(ireader);
// Check values count.
SortedSetDocValues ssdvMulti = reader.getSortedSetDocValues("ssdv_multi_");
assertEquals(valuesCount, ssdvMulti.getValueCount());
for (int i = 0; i < valuesCount; i++) {
BytesRef term = ssdvMulti.lookupOrd(i);
assertTrue(term.bytes.length > 0);
}
for (int i = 0; i < valuesCount; i++) {
for (int j = 0; j < 3; j++) {
assertTrue(ssdvMulti.lookupTerm(new BytesRef(values.get((i + j) % valuesCount))) >= 0);
}
}
SortedSetDocValues ssdvSingle = reader.getSortedSetDocValues("ssdv_single_");
assertEquals(valuesCount, ssdvSingle.getValueCount());
for (int i = 0; i < valuesCount; i++) {
assertTrue(ssdvSingle.lookupTerm(new BytesRef(values.get(i % valuesCount))) >= 0);
}
SortedDocValues sdv = reader.getSortedDocValues("sdv");
assertEquals(valuesCount, sdv.getValueCount());
for (int i = 0; i < valuesCount; i++) {
assertTrue(sdv.lookupTerm(new BytesRef(values.get(i % valuesCount))) >= 0);
}
dvdFileSize = docValueFileSize(directory);
assertTrue(dvdFileSize > 0);
ireader.close();
}
return dvdFileSize;
}
static long docValueFileSize(Directory d) throws IOException {
for (String file : d.listAll()) {
if (file.endsWith(Lucene80DocValuesFormat.DATA_EXTENSION)) {
return d.fileLength(file);
}
}
return -1;
}
}