From 5856c0f176c27b9ea683c63439960dd41e3e45f2 Mon Sep 17 00:00:00 2001 From: jaison Date: Tue, 9 Feb 2021 11:47:16 +0100 Subject: [PATCH] LUCENE-9663: Add compression to terms dict from SortedSet/Sorted DocValues. Closes #2302 --- lucene/CHANGES.txt | 3 + .../lucene80/Lucene80DocValuesConsumer.java | 94 +++++- .../lucene80/Lucene80DocValuesFormat.java | 9 + .../lucene80/Lucene80DocValuesProducer.java | 82 ++++- .../lucene80/TestDocValuesCompression.java | 314 ++++++++++++++++++ 5 files changed, 481 insertions(+), 21 deletions(-) create mode 100644 lucene/core/src/test/org/apache/lucene/codecs/lucene80/TestDocValuesCompression.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 337f3769fdc..dfb6946d66d 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -186,6 +186,9 @@ Improvements * LUCENE-9674: Implement faster advance on VectorValues using binary search. (Anand Kotriwal, Mike Sokolov) +* LUCENE-9663: Adding compression to terms dict from SortedSet/Sorted DocValues. + (Jaison Bi via Bruno Roustant) + Bug fixes * LUCENE-9686: Fix read past EOF handling in DirectIODirectory. (Zach Chen, diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80DocValuesConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80DocValuesConsumer.java index 761e2d9afb4..118ba037a33 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80DocValuesConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80DocValuesConsumer.java @@ -43,9 +43,11 @@ import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.SortedSetSelector; +import org.apache.lucene.store.ByteArrayDataOutput; import org.apache.lucene.store.ByteBuffersDataOutput; import org.apache.lucene.store.ByteBuffersIndexOutput; import org.apache.lucene.store.ChecksumIndexInput; +import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.ArrayUtil; @@ -66,6 +68,7 @@ final class Lucene80DocValuesConsumer extends DocValuesConsumer implements Close IndexOutput data, meta; final int maxDoc; private final SegmentWriteState state; + private byte[] termsDictBuffer; /** expert: Creates a new writer */ public Lucene80DocValuesConsumer( @@ -77,6 +80,9 @@ final class Lucene80DocValuesConsumer extends DocValuesConsumer implements Close Lucene80DocValuesFormat.Mode mode) throws IOException { this.mode = mode; + if (Lucene80DocValuesFormat.Mode.BEST_COMPRESSION == this.mode) { + this.termsDictBuffer = new byte[1 << 14]; + } boolean success = false; try { this.state = state; @@ -736,15 +742,26 @@ final class Lucene80DocValuesConsumer extends DocValuesConsumer implements Close private void addTermsDict(SortedSetDocValues values) throws IOException { final long size = values.getValueCount(); meta.writeVLong(size); - meta.writeInt(Lucene80DocValuesFormat.TERMS_DICT_BLOCK_SHIFT); + boolean compress = + Lucene80DocValuesFormat.Mode.BEST_COMPRESSION == mode + && values.getValueCount() + > Lucene80DocValuesFormat.TERMS_DICT_BLOCK_COMPRESSION_THRESHOLD; + int code, blockMask, shift; + if (compress) { + code = Lucene80DocValuesFormat.TERMS_DICT_BLOCK_LZ4_CODE; + blockMask = Lucene80DocValuesFormat.TERMS_DICT_BLOCK_LZ4_MASK; + shift = Lucene80DocValuesFormat.TERMS_DICT_BLOCK_LZ4_SHIFT; + } else { + code = shift = Lucene80DocValuesFormat.TERMS_DICT_BLOCK_SHIFT; + 
blockMask = Lucene80DocValuesFormat.TERMS_DICT_BLOCK_MASK; + } + meta.writeInt(code); + meta.writeInt(DIRECT_MONOTONIC_BLOCK_SHIFT); ByteBuffersDataOutput addressBuffer = new ByteBuffersDataOutput(); ByteBuffersIndexOutput addressOutput = new ByteBuffersIndexOutput(addressBuffer, "temp", "temp"); - meta.writeInt(DIRECT_MONOTONIC_BLOCK_SHIFT); - long numBlocks = - (size + Lucene80DocValuesFormat.TERMS_DICT_BLOCK_MASK) - >>> Lucene80DocValuesFormat.TERMS_DICT_BLOCK_SHIFT; + long numBlocks = (size + blockMask) >>> shift; DirectMonotonicWriter writer = DirectMonotonicWriter.getInstance( meta, addressOutput, numBlocks, DIRECT_MONOTONIC_BLOCK_SHIFT); @@ -752,10 +769,24 @@ final class Lucene80DocValuesConsumer extends DocValuesConsumer implements Close BytesRefBuilder previous = new BytesRefBuilder(); long ord = 0; long start = data.getFilePointer(); - int maxLength = 0; + int maxLength = 0, maxBlockLength = 0; TermsEnum iterator = values.termsEnum(); + + LZ4.FastCompressionHashTable ht = null; + ByteArrayDataOutput bufferedOutput = null; + if (compress) { + ht = new LZ4.FastCompressionHashTable(); + bufferedOutput = new ByteArrayDataOutput(termsDictBuffer); + } + for (BytesRef term = iterator.next(); term != null; term = iterator.next()) { - if ((ord & Lucene80DocValuesFormat.TERMS_DICT_BLOCK_MASK) == 0) { + if ((ord & blockMask) == 0) { + if (compress && bufferedOutput.getPosition() > 0) { + maxBlockLength = + Math.max(maxBlockLength, compressAndGetTermsDictBlockLength(bufferedOutput, ht)); + bufferedOutput.reset(termsDictBuffer); + } + writer.add(data.getFilePointer() - start); data.writeVInt(term.length); data.writeBytes(term.bytes, term.offset, term.length); @@ -763,22 +794,40 @@ final class Lucene80DocValuesConsumer extends DocValuesConsumer implements Close final int prefixLength = StringHelper.bytesDifference(previous.get(), term); final int suffixLength = term.length - prefixLength; assert suffixLength > 0; // terms are unique - - data.writeByte((byte) (Math.min(prefixLength, 15) | (Math.min(15, suffixLength - 1) << 4))); + DataOutput blockOutput; + if (compress) { + // Will write (suffixLength + 1 byte + 2 vint) bytes. Grow the buffer in need. + bufferedOutput = maybeGrowBuffer(bufferedOutput, suffixLength + 11); + blockOutput = bufferedOutput; + } else { + blockOutput = data; + } + blockOutput.writeByte( + (byte) (Math.min(prefixLength, 15) | (Math.min(15, suffixLength - 1) << 4))); if (prefixLength >= 15) { - data.writeVInt(prefixLength - 15); + blockOutput.writeVInt(prefixLength - 15); } if (suffixLength >= 16) { - data.writeVInt(suffixLength - 16); + blockOutput.writeVInt(suffixLength - 16); } - data.writeBytes(term.bytes, term.offset + prefixLength, term.length - prefixLength); + blockOutput.writeBytes(term.bytes, term.offset + prefixLength, suffixLength); } maxLength = Math.max(maxLength, term.length); previous.copyBytes(term); ++ord; } + // Compress and write out the last block + if (compress && bufferedOutput.getPosition() > 0) { + maxBlockLength = + Math.max(maxBlockLength, compressAndGetTermsDictBlockLength(bufferedOutput, ht)); + } + writer.finish(); meta.writeInt(maxLength); + if (compress) { + // Write one more int for storing max block length. For compressed terms dict only. 
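        // maxBlockLength is the largest max(uncompressedLength, compressedLength) seen across all
        // blocks; the producer uses it, plus 7 padding bytes, to size a single reusable
        // decompression buffer. On disk, each compressed block stores its first term uncompressed
        // (vInt length + bytes), then a vInt uncompressed length, then the LZ4-compressed,
        // prefix-coded suffixes of the remaining terms of the 64-term block.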
+ meta.writeInt(maxBlockLength); + } meta.writeLong(start); meta.writeLong(data.getFilePointer() - start); start = data.getFilePointer(); @@ -790,6 +839,27 @@ final class Lucene80DocValuesConsumer extends DocValuesConsumer implements Close writeTermsIndex(values); } + private int compressAndGetTermsDictBlockLength( + ByteArrayDataOutput bufferedOutput, LZ4.FastCompressionHashTable ht) throws IOException { + int uncompressedLength = bufferedOutput.getPosition(); + data.writeVInt(uncompressedLength); + long before = data.getFilePointer(); + LZ4.compress(termsDictBuffer, 0, uncompressedLength, data, ht); + int compressedLength = (int) (data.getFilePointer() - before); + // Block length will be used for creating buffer for decompression, one corner case is that + // compressed length might be bigger than un-compressed length, so just return the bigger one. + return Math.max(uncompressedLength, compressedLength); + } + + private ByteArrayDataOutput maybeGrowBuffer(ByteArrayDataOutput bufferedOutput, int termLength) { + int pos = bufferedOutput.getPosition(), originalLength = termsDictBuffer.length; + if (pos + termLength >= originalLength - 1) { + termsDictBuffer = ArrayUtil.grow(termsDictBuffer, originalLength + termLength); + bufferedOutput = new ByteArrayDataOutput(termsDictBuffer, pos, termsDictBuffer.length - pos); + } + return bufferedOutput; + } + private void writeTermsIndex(SortedSetDocValues values) throws IOException { final long size = values.getValueCount(); meta.writeInt(Lucene80DocValuesFormat.TERMS_DICT_REVERSE_INDEX_SHIFT); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80DocValuesFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80DocValuesFormat.java index daa41c547e3..c8d59cc2870 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80DocValuesFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80DocValuesFormat.java @@ -203,6 +203,15 @@ public final class Lucene80DocValuesFormat extends DocValuesFormat { static final int TERMS_DICT_BLOCK_SIZE = 1 << TERMS_DICT_BLOCK_SHIFT; static final int TERMS_DICT_BLOCK_MASK = TERMS_DICT_BLOCK_SIZE - 1; + static final int TERMS_DICT_BLOCK_COMPRESSION_THRESHOLD = 32; + static final int TERMS_DICT_BLOCK_LZ4_SHIFT = 6; + static final int TERMS_DICT_BLOCK_LZ4_SIZE = 1 << TERMS_DICT_BLOCK_LZ4_SHIFT; + static final int TERMS_DICT_BLOCK_LZ4_MASK = TERMS_DICT_BLOCK_LZ4_SIZE - 1; + static final int TERMS_DICT_COMPRESSOR_LZ4_CODE = 1; + // Writing a special code so we know this is a LZ4-compressed block. 
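  // The code packs the block shift and the compressor id into one int:
  // (TERMS_DICT_BLOCK_LZ4_SHIFT << 16) | TERMS_DICT_COMPRESSOR_LZ4_CODE = (6 << 16) | 1 = 393217.
  // An uncompressed terms dict writes its plain block shift (the existing TERMS_DICT_BLOCK_SHIFT,
  // a small value) into the same meta slot, so the producer can tell the two layouts apart with a
  // single equality check.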
+ static final int TERMS_DICT_BLOCK_LZ4_CODE = + TERMS_DICT_BLOCK_LZ4_SHIFT << 16 | TERMS_DICT_COMPRESSOR_LZ4_CODE; + static final int TERMS_DICT_REVERSE_INDEX_SHIFT = 10; static final int TERMS_DICT_REVERSE_INDEX_SIZE = 1 << TERMS_DICT_REVERSE_INDEX_SHIFT; static final int TERMS_DICT_REVERSE_INDEX_MASK = TERMS_DICT_REVERSE_INDEX_SIZE - 1; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80DocValuesProducer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80DocValuesProducer.java index db579a0dc21..5813435d16a 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80DocValuesProducer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80DocValuesProducer.java @@ -38,7 +38,9 @@ import org.apache.lucene.index.SortedNumericDocValues; import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.TermsEnum.SeekStatus; +import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.ChecksumIndexInput; +import org.apache.lucene.store.DataInput; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.RandomAccessInput; import org.apache.lucene.util.BytesRef; @@ -285,12 +287,24 @@ final class Lucene80DocValuesProducer extends DocValuesProducer implements Close private static void readTermDict(IndexInput meta, TermsDictEntry entry) throws IOException { entry.termsDictSize = meta.readVLong(); - entry.termsDictBlockShift = meta.readInt(); + int termsDictBlockCode = meta.readInt(); + if (Lucene80DocValuesFormat.TERMS_DICT_BLOCK_LZ4_CODE == termsDictBlockCode) { + // This is a LZ4 compressed block. + entry.compressed = true; + entry.termsDictBlockShift = Lucene80DocValuesFormat.TERMS_DICT_BLOCK_LZ4_SHIFT; + } else { + entry.termsDictBlockShift = termsDictBlockCode; + } + final int blockShift = meta.readInt(); final long addressesSize = (entry.termsDictSize + (1L << entry.termsDictBlockShift) - 1) >>> entry.termsDictBlockShift; entry.termsAddressesMeta = DirectMonotonicReader.loadMeta(meta, addressesSize, blockShift); entry.maxTermLength = meta.readInt(); + // Read one more int for compressed term dict. 
+ if (entry.compressed) { + entry.maxBlockLength = meta.readInt(); + } entry.termsDataOffset = meta.readLong(); entry.termsDataLength = meta.readLong(); entry.termsAddressesOffset = meta.readLong(); @@ -375,6 +389,9 @@ final class Lucene80DocValuesProducer extends DocValuesProducer implements Close long termsIndexLength; long termsIndexAddressesOffset; long termsIndexAddressesLength; + + boolean compressed; + int maxBlockLength; } private static class SortedEntry extends TermsDictEntry { @@ -1149,6 +1166,7 @@ final class Lucene80DocValuesProducer extends DocValuesProducer implements Close } private static class TermsDict extends BaseTermsEnum { + static final int LZ4_DECOMPRESSOR_PADDING = 7; final TermsDictEntry entry; final LongValues blockAddresses; @@ -1159,6 +1177,11 @@ final class Lucene80DocValuesProducer extends DocValuesProducer implements Close final BytesRef term; long ord = -1; + BytesRef blockBuffer = null; + ByteArrayDataInput blockInput = null; + long currentCompressedBlockStart = -1; + long currentCompressedBlockEnd = -1; + TermsDict(TermsDictEntry entry, IndexInput data) throws IOException { this.entry = entry; RandomAccessInput addressesSlice = @@ -1172,6 +1195,12 @@ final class Lucene80DocValuesProducer extends DocValuesProducer implements Close DirectMonotonicReader.getInstance(entry.termsIndexAddressesMeta, indexAddressesSlice); indexBytes = data.slice("terms-index", entry.termsIndexOffset, entry.termsIndexLength); term = new BytesRef(entry.maxTermLength); + + if (entry.compressed) { + // add 7 padding bytes can help decompression run faster. + int bufferSize = entry.maxBlockLength + LZ4_DECOMPRESSOR_PADDING; + blockBuffer = new BytesRef(new byte[bufferSize], 0, bufferSize); + } } @Override @@ -1179,21 +1208,27 @@ final class Lucene80DocValuesProducer extends DocValuesProducer implements Close if (++ord >= entry.termsDictSize) { return null; } + if ((ord & blockMask) == 0L) { - term.length = bytes.readVInt(); - bytes.readBytes(term.bytes, 0, term.length); + if (this.entry.compressed) { + decompressBlock(); + } else { + term.length = bytes.readVInt(); + bytes.readBytes(term.bytes, 0, term.length); + } } else { - final int token = Byte.toUnsignedInt(bytes.readByte()); + DataInput input = this.entry.compressed ? 
blockInput : bytes; + final int token = Byte.toUnsignedInt(input.readByte()); int prefixLength = token & 0x0F; int suffixLength = 1 + (token >>> 4); if (prefixLength == 15) { - prefixLength += bytes.readVInt(); + prefixLength += input.readVInt(); } if (suffixLength == 16) { - suffixLength += bytes.readVInt(); + suffixLength += input.readVInt(); } term.length = prefixLength + suffixLength; - bytes.readBytes(term.bytes, prefixLength, suffixLength); + input.readBytes(term.bytes, prefixLength, suffixLength); } return term; } @@ -1292,8 +1327,13 @@ final class Lucene80DocValuesProducer extends DocValuesProducer implements Close final long blockAddress = blockAddresses.get(block); this.ord = block << entry.termsDictBlockShift; bytes.seek(blockAddress); - term.length = bytes.readVInt(); - bytes.readBytes(term.bytes, 0, term.length); + if (this.entry.compressed) { + decompressBlock(); + } else { + term.length = bytes.readVInt(); + bytes.readBytes(term.bytes, 0, term.length); + } + while (true) { int cmp = term.compareTo(text); if (cmp == 0) { @@ -1307,6 +1347,30 @@ final class Lucene80DocValuesProducer extends DocValuesProducer implements Close } } + private void decompressBlock() throws IOException { + // The first term is kept uncompressed, so no need to decompress block if only + // look up the first term when doing seek block. + term.length = bytes.readVInt(); + bytes.readBytes(term.bytes, 0, term.length); + long offset = bytes.getFilePointer(); + if (offset < entry.termsDataLength - 1) { + // Avoid decompress again if we are reading a same block. + if (currentCompressedBlockStart != offset) { + int decompressLength = bytes.readVInt(); + // Decompress the remaining of current block + LZ4.decompress(bytes, decompressLength, blockBuffer.bytes, 0); + currentCompressedBlockStart = offset; + currentCompressedBlockEnd = bytes.getFilePointer(); + } else { + // Skip decompression but need to re-seek to block end. + bytes.seek(currentCompressedBlockEnd); + } + + // Reset the buffer. + blockInput = new ByteArrayDataInput(blockBuffer.bytes, 0, blockBuffer.length); + } + } + @Override public BytesRef term() throws IOException { return term; diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene80/TestDocValuesCompression.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene80/TestDocValuesCompression.java new file mode 100644 index 00000000000..fb5b66a3ecb --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene80/TestDocValuesCompression.java @@ -0,0 +1,314 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.codecs.lucene80; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.lucene90.Lucene90Codec; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.SortedDocValuesField; +import org.apache.lucene.document.SortedSetDocValuesField; +import org.apache.lucene.document.StringField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.SortedDocValues; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.TestUtil; + +public class TestDocValuesCompression extends LuceneTestCase { + private final Codec bestSpeed = new Lucene90Codec(Lucene90Codec.Mode.BEST_SPEED); + private final Codec bestCompression = new Lucene90Codec(Lucene90Codec.Mode.BEST_COMPRESSION); + + public void testTermsDictCompressionForLowCardinalityFields() throws IOException { + final int CARDINALITY = Lucene80DocValuesFormat.TERMS_DICT_BLOCK_COMPRESSION_THRESHOLD - 1; + Set valuesSet = new HashSet<>(); + for (int i = 0; i < CARDINALITY; ++i) { + final int length = TestUtil.nextInt(random(), 10, 30); + String value = TestUtil.randomSimpleString(random(), length); + valuesSet.add(value); + } + + List values = new ArrayList<>(valuesSet); + long sizeForBestSpeed = writeAndGetDocValueFileSize(bestSpeed, values); + long sizeForBestCompression = writeAndGetDocValueFileSize(bestCompression, values); + + // Ensure terms dict data was not compressed for low-cardinality fields. + assertEquals(sizeForBestSpeed, sizeForBestCompression); + } + + public void testTermsDictCompressionForHighCardinalityFields() throws IOException { + final int CARDINALITY = Lucene80DocValuesFormat.TERMS_DICT_BLOCK_COMPRESSION_THRESHOLD << 1; + Set valuesSet = new HashSet<>(); + for (int i = 0; i < CARDINALITY; ++i) { + final int length = TestUtil.nextInt(random(), 10, 30); + String value = TestUtil.randomSimpleString(random(), length); + // Add common suffix for better compression ratio. + valuesSet.add(value + "_CommonPartBetterForCompression"); + } + + List values = new ArrayList<>(valuesSet); + long sizeForBestSpeed = writeAndGetDocValueFileSize(bestSpeed, values); + long sizeForBestCompression = writeAndGetDocValueFileSize(bestCompression, values); + + // Compression happened. + assertTrue(sizeForBestCompression < sizeForBestSpeed); + } + + public void testReseekAfterSkipDecompression() throws IOException { + final int CARDINALITY = (Lucene80DocValuesFormat.TERMS_DICT_BLOCK_LZ4_SIZE << 1) + 11; + Set valueSet = new HashSet<>(CARDINALITY); + for (int i = 0; i < CARDINALITY; i++) { + valueSet.add(TestUtil.randomSimpleString(random(), 64)); + } + List values = new ArrayList<>(valueSet); + Collections.sort(values); + // Create one non-existent value just between block-1 and block-2. 
+ String nonexistentValue = + values.get(Lucene80DocValuesFormat.TERMS_DICT_BLOCK_LZ4_SIZE - 1) + + TestUtil.randomSimpleString(random(), 64, 128); + int docValues = values.size(); + + try (Directory directory = newDirectory()) { + Analyzer analyzer = new StandardAnalyzer(); + IndexWriterConfig config = new IndexWriterConfig(analyzer); + config.setCodec(bestCompression); + config.setUseCompoundFile(false); + IndexWriter writer = new IndexWriter(directory, config); + for (int i = 0; i < 280; i++) { + Document doc = new Document(); + doc.add(new StringField("id", "Doc" + i, Field.Store.NO)); + doc.add(new SortedDocValuesField("sdv", new BytesRef(values.get(i % docValues)))); + writer.addDocument(doc); + } + writer.commit(); + writer.forceMerge(1); + DirectoryReader dReader = DirectoryReader.open(writer); + writer.close(); + + LeafReader reader = getOnlyLeafReader(dReader); + // Check values count. + SortedDocValues ssdvMulti = reader.getSortedDocValues("sdv"); + assertEquals(docValues, ssdvMulti.getValueCount()); + + // Seek to first block. + int ord1 = ssdvMulti.lookupTerm(new BytesRef(values.get(0))); + assertTrue(ord1 >= 0); + int ord2 = ssdvMulti.lookupTerm(new BytesRef(values.get(1))); + assertTrue(ord2 >= ord1); + // Ensure re-seek logic is correct after skip-decompression. + int nonexistentOrd2 = ssdvMulti.lookupTerm(new BytesRef(nonexistentValue)); + assertTrue(nonexistentOrd2 < 0); + dReader.close(); + } + } + + public void testLargeTermsCompression() throws IOException { + final int CARDINALITY = Lucene80DocValuesFormat.TERMS_DICT_BLOCK_COMPRESSION_THRESHOLD << 1; + Set valuesSet = new HashSet<>(); + for (int i = 0; i < CARDINALITY; ++i) { + final int length = TestUtil.nextInt(random(), 512, 1024); + valuesSet.add(TestUtil.randomSimpleString(random(), length)); + } + int valuesCount = valuesSet.size(); + List values = new ArrayList<>(valuesSet); + + try (Directory directory = newDirectory()) { + Analyzer analyzer = new StandardAnalyzer(); + IndexWriterConfig config = new IndexWriterConfig(analyzer); + config.setCodec(bestCompression); + config.setUseCompoundFile(false); + IndexWriter writer = new IndexWriter(directory, config); + for (int i = 0; i < 256; i++) { + Document doc = new Document(); + doc.add(new StringField("id", "Doc" + i, Field.Store.NO)); + doc.add(new SortedDocValuesField("sdv", new BytesRef(values.get(i % valuesCount)))); + writer.addDocument(doc); + } + writer.commit(); + writer.forceMerge(1); + DirectoryReader ireader = DirectoryReader.open(writer); + writer.close(); + + LeafReader reader = getOnlyLeafReader(ireader); + // Check values count. + SortedDocValues ssdvMulti = reader.getSortedDocValues("sdv"); + assertEquals(valuesCount, ssdvMulti.getValueCount()); + ireader.close(); + } + } + + // Ensure the old segment can be merged together with the new compressed segment. + public void testMergeWithUncompressedSegment() throws IOException { + final int CARDINALITY = Lucene80DocValuesFormat.TERMS_DICT_BLOCK_COMPRESSION_THRESHOLD << 1; + Set valuesSet = new HashSet<>(); + for (int i = 0; i < CARDINALITY; ++i) { + final int length = TestUtil.nextInt(random(), 10, 30); + // Add common suffix for better compression ratio. + valuesSet.add(TestUtil.randomSimpleString(random(), length)); + } + List values = new ArrayList<>(valuesSet); + int valuesCount = values.size(); + + try (Directory directory = newDirectory()) { + // 1. Write 256 documents without terms dict compression. 
+ Analyzer analyzer = new StandardAnalyzer(); + IndexWriterConfig config = new IndexWriterConfig(analyzer); + config.setCodec(bestSpeed); + config.setUseCompoundFile(false); + IndexWriter writer = new IndexWriter(directory, config); + for (int i = 0; i < 256; i++) { + Document doc = new Document(); + doc.add(new StringField("id", "Doc" + i, Field.Store.NO)); + doc.add(new SortedSetDocValuesField("ssdv", new BytesRef(values.get(i % valuesCount)))); + doc.add( + new SortedSetDocValuesField("ssdv", new BytesRef(values.get((i + 1) % valuesCount)))); + doc.add( + new SortedSetDocValuesField("ssdv", new BytesRef(values.get((i + 2) % valuesCount)))); + doc.add(new SortedDocValuesField("sdv", new BytesRef(values.get(i % valuesCount)))); + writer.addDocument(doc); + } + writer.commit(); + DirectoryReader ireader = DirectoryReader.open(writer); + assertEquals(256, ireader.numDocs()); + LeafReader reader = getOnlyLeafReader(ireader); + SortedSetDocValues ssdv = reader.getSortedSetDocValues("ssdv"); + assertEquals(valuesCount, ssdv.getValueCount()); + SortedDocValues sdv = reader.getSortedDocValues("sdv"); + assertEquals(valuesCount, sdv.getValueCount()); + ireader.close(); + writer.close(); + + // 2. Add another 100 documents, and enabling terms dict compression. + config = new IndexWriterConfig(analyzer); + config.setCodec(bestCompression); + config.setUseCompoundFile(false); + writer = new IndexWriter(directory, config); + // Add 2 new values. + valuesSet.add(TestUtil.randomSimpleString(random(), 10)); + valuesSet.add(TestUtil.randomSimpleString(random(), 10)); + values = new ArrayList<>(valuesSet); + valuesCount = valuesSet.size(); + + for (int i = 256; i < 356; i++) { + Document doc = new Document(); + doc.add(new StringField("id", "Doc" + i, Field.Store.NO)); + doc.add(new SortedSetDocValuesField("ssdv", new BytesRef(values.get(i % valuesCount)))); + doc.add(new SortedDocValuesField("sdv", new BytesRef(values.get(i % valuesCount)))); + writer.addDocument(doc); + } + writer.commit(); + writer.forceMerge(1); + ireader = DirectoryReader.open(writer); + assertEquals(356, ireader.numDocs()); + reader = getOnlyLeafReader(ireader); + ssdv = reader.getSortedSetDocValues("ssdv"); + assertEquals(valuesCount, ssdv.getValueCount()); + ireader.close(); + writer.close(); + } + } + + private static long writeAndGetDocValueFileSize(Codec codec, List values) + throws IOException { + int valuesCount = values.size(); + long dvdFileSize = -1; + try (Directory directory = newDirectory()) { + Analyzer analyzer = new StandardAnalyzer(); + IndexWriterConfig config = new IndexWriterConfig(analyzer); + config.setCodec(codec); + config.setUseCompoundFile(false); + IndexWriter writer = new IndexWriter(directory, config); + for (int i = 0; i < 256; i++) { + Document doc = new Document(); + doc.add(new StringField("id", "Doc" + i, Field.Store.NO)); + // Multi value sorted-set field. + doc.add( + new SortedSetDocValuesField("ssdv_multi_", new BytesRef(values.get(i % valuesCount)))); + doc.add( + new SortedSetDocValuesField( + "ssdv_multi_", new BytesRef(values.get((i + 1) % valuesCount)))); + doc.add( + new SortedSetDocValuesField( + "ssdv_multi_", new BytesRef(values.get((i + 2) % valuesCount)))); + // Single value sorted-set field. + doc.add( + new SortedSetDocValuesField("ssdv_single_", new BytesRef(values.get(i % valuesCount)))); + // Sorted field. 
+ doc.add(new SortedDocValuesField("sdv", new BytesRef(values.get(i % valuesCount)))); + writer.addDocument(doc); + } + writer.commit(); + writer.forceMerge(1); + DirectoryReader ireader = DirectoryReader.open(writer); + writer.close(); + + LeafReader reader = getOnlyLeafReader(ireader); + // Check values count. + SortedSetDocValues ssdvMulti = reader.getSortedSetDocValues("ssdv_multi_"); + assertEquals(valuesCount, ssdvMulti.getValueCount()); + for (int i = 0; i < valuesCount; i++) { + BytesRef term = ssdvMulti.lookupOrd(i); + assertTrue(term.bytes.length > 0); + } + for (int i = 0; i < valuesCount; i++) { + for (int j = 0; j < 3; j++) { + assertTrue(ssdvMulti.lookupTerm(new BytesRef(values.get((i + j) % valuesCount))) >= 0); + } + } + + SortedSetDocValues ssdvSingle = reader.getSortedSetDocValues("ssdv_single_"); + assertEquals(valuesCount, ssdvSingle.getValueCount()); + for (int i = 0; i < valuesCount; i++) { + assertTrue(ssdvSingle.lookupTerm(new BytesRef(values.get(i % valuesCount))) >= 0); + } + + SortedDocValues sdv = reader.getSortedDocValues("sdv"); + assertEquals(valuesCount, sdv.getValueCount()); + for (int i = 0; i < valuesCount; i++) { + assertTrue(sdv.lookupTerm(new BytesRef(values.get(i % valuesCount))) >= 0); + } + + dvdFileSize = docValueFileSize(directory); + assertTrue(dvdFileSize > 0); + ireader.close(); + } + + return dvdFileSize; + } + + static long docValueFileSize(Directory d) throws IOException { + for (String file : d.listAll()) { + if (file.endsWith(Lucene80DocValuesFormat.DATA_EXTENSION)) { + return d.fileLength(file); + } + } + return -1; + } +}
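For reference, both the compressed and uncompressed paths above rely on the same shared-prefix
coding: each non-first term of a block packs its prefix length and suffix length into one token
byte, with vInt overflows for long prefixes or suffixes. The following is a minimal, self-contained
sketch of that coding; the class and method names (TermBlockCodingSketch, writeSuffix, readSuffix)
are illustrative only and mirror the consumer's addTermsDict loop and the producer's
TermsDict.next().

import java.io.IOException;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.StringHelper;

final class TermBlockCodingSketch {

  // Writer side: one token byte packs both lengths; long prefixes/suffixes overflow into vInts.
  // Mirrors the inner loop of Lucene80DocValuesConsumer#addTermsDict.
  static void writeSuffix(DataOutput out, BytesRef previous, BytesRef term) throws IOException {
    final int prefixLength = StringHelper.bytesDifference(previous, term);
    final int suffixLength = term.length - prefixLength; // terms are unique, so suffixLength > 0
    // Low nibble: min(prefixLength, 15). High nibble: min(suffixLength - 1, 15).
    out.writeByte((byte) (Math.min(prefixLength, 15) | (Math.min(15, suffixLength - 1) << 4)));
    if (prefixLength >= 15) {
      out.writeVInt(prefixLength - 15); // overflow of the low nibble
    }
    if (suffixLength >= 16) {
      out.writeVInt(suffixLength - 16); // overflow of the high nibble
    }
    out.writeBytes(term.bytes, term.offset + prefixLength, suffixLength);
    // Worst case per term: 1 token byte + two 5-byte vInts + the suffix bytes, which is why
    // maybeGrowBuffer reserves suffixLength + 11 bytes.
  }

  // Reader side: mirrors TermsDict#next() in Lucene80DocValuesProducer. The previous term's
  // bytes are already in term.bytes (offset 0), so only the suffix needs to be read.
  static void readSuffix(DataInput in, BytesRef term) throws IOException {
    final int token = Byte.toUnsignedInt(in.readByte());
    int prefixLength = token & 0x0F;
    int suffixLength = 1 + (token >>> 4);
    if (prefixLength == 15) {
      prefixLength += in.readVInt();
    }
    if (suffixLength == 16) {
      suffixLength += in.readVInt();
    }
    term.length = prefixLength + suffixLength;
    in.readBytes(term.bytes, prefixLength, suffixLength);
  }
}

Capping both nibbles at 15 keeps the common case to a single byte of per-term overhead, leaving
mostly suffix bytes in the block buffer that the BEST_COMPRESSION path then hands to LZ4.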
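The compressed path wraps those prefix-coded suffixes in a per-block LZ4 envelope. A minimal
round-trip of one such envelope could look like the sketch below, assuming the same LZ4 helper the
consumer and producer use (org.apache.lucene.util.compress.LZ4) and the 7-byte destination padding
from TermsDict.LZ4_DECOMPRESSOR_PADDING; TermsDictBlockLz4Sketch is a hypothetical name for
illustration only.

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.util.compress.LZ4;

public class TermsDictBlockLz4Sketch {
  public static void main(String[] args) throws IOException {
    byte[] block = "suffix bytes of the 63 remaining terms in one 64-term block"
        .getBytes(StandardCharsets.UTF_8);

    // Writer side (see compressAndGetTermsDictBlockLength): vInt uncompressed length, LZ4 bytes.
    ByteBuffersDataOutput data = new ByteBuffersDataOutput();
    data.writeVInt(block.length);
    LZ4.compress(block, 0, block.length, data, new LZ4.FastCompressionHashTable());

    // Reader side (see decompressBlock): read the length, decompress into a padded buffer.
    ByteArrayDataInput in = new ByteArrayDataInput(data.toArrayCopy());
    int uncompressedLength = in.readVInt();
    byte[] dest = new byte[uncompressedLength + 7]; // padding helps decompression run faster
    LZ4.decompress(in, uncompressedLength, dest, 0);

    System.out.println(new String(dest, 0, uncompressedLength, StandardCharsets.UTF_8));
  }
}

In the real producer the destination buffer is allocated once per TermsDict from maxBlockLength,
and a block is only re-decompressed when the seek moves to a different block address.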