LUCENE-10536: Slightly better compression of doc values' terms dictionaries. (#838)

Doc values terms dictionaries keep the first term of each 64-term block
uncompressed so that binary searches across blocks can be performed reasonably
efficiently. The suffixes of the other 63 terms are compressed together using
LZ4 to take advantage of redundancy across suffixes. This change improves
compression a bit by using the first (uncompressed) term of each block as a
dictionary when compressing the suffixes of the other 63 terms. This especially
helps with the first few suffixes of a block, which otherwise have little
preceding context to leverage for finding duplicates.
Adrien Grand committed 2022-05-12 10:32:58 +02:00 (via GitHub)
commit 8f89db8048, parent 96036bca9f
4 changed files with 63 additions and 19 deletions
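
To make the mechanics concrete, here is a minimal round-trip sketch using Lucene's
own LZ4 helper (org.apache.lucene.util.compress.LZ4), which is what the diffs below
call into. This is illustrative code, not part of the commit: the class name and
example byte strings are made up, and the real format prefix-codes the suffixes
rather than compressing a raw string, but the LZ4.compressWithDictionary and
LZ4.decompress calls are the same ones the consumer and producer use.

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.util.compress.LZ4;

public class TermsDictCompressionSketch { // hypothetical class, for illustration only
  public static void main(String[] args) throws IOException {
    byte[] firstTerm = "abc0defghijkl".getBytes(StandardCharsets.UTF_8);
    byte[] suffixes = "1defghijkl2defghijkl".getBytes(StandardCharsets.UTF_8);

    // Writer side: keep [dictionary | data] contiguous in one buffer, the way the
    // consumer keeps the block's first term at the start of termsDictBuffer.
    byte[] buffer = new byte[firstTerm.length + suffixes.length];
    System.arraycopy(firstTerm, 0, buffer, 0, firstTerm.length);
    System.arraycopy(suffixes, 0, buffer, firstTerm.length, suffixes.length);

    ByteBuffersDataOutput out = new ByteBuffersDataOutput();
    LZ4.FastCompressionHashTable ht = new LZ4.FastCompressionHashTable();
    // Compress only the suffix bytes; the first term acts as a preset dictionary.
    LZ4.compressWithDictionary(buffer, 0, firstTerm.length, suffixes.length, out, ht);

    // Reader side: copy the (separately stored, uncompressed) first term to the head
    // of the destination buffer, then decompress right behind it so that LZ4 match
    // copies can reach back into the dictionary bytes. 7 padding bytes at the end,
    // as in the producer, help decompression run faster.
    byte[] dest = new byte[firstTerm.length + suffixes.length + 7];
    System.arraycopy(firstTerm, 0, dest, 0, firstTerm.length);
    ByteArrayDataInput in = new ByteArrayDataInput(out.toArrayCopy());
    LZ4.decompress(in, suffixes.length, dest, firstTerm.length);

    // Prints "1defghijkl2defghijkl"
    System.out.println(new String(dest, firstTerm.length, suffixes.length, StandardCharsets.UTF_8));
  }
}

The key property is that LZ4 match copies during decompression may reference bytes
before the destination offset, so prepending the first term makes it act as a
dictionary at no extra storage cost: it is stored uncompressed anyway.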

lucene/CHANGES.txt

@@ -141,6 +141,10 @@ Optimizations
 * LUCENE-8836: Speed up calls to TermsEnum#lookupOrd on doc values terms enums
   and sequences of increasing ords. (Bruno Roustant, Adrien Grand)

+* LUCENE-10536: Doc values terms dictionaries now use the first (uncompressed)
+  term of each block as a dictionary when compressing suffixes of the other 63
+  terms of the block. (Adrien Grand)
+
 * LUCENE-10411: Add nearest neighbors vectors support to ExitableDirectoryReader.
   (Zach Chen, Adrien Grand, Julie Tibshirani, Tomoko Uchida)

lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesConsumer.java

@@ -565,18 +565,26 @@ final class Lucene90DocValuesConsumer extends DocValuesConsumer {
     LZ4.FastCompressionHashTable ht = new LZ4.FastCompressionHashTable();
     ByteArrayDataOutput bufferedOutput = new ByteArrayDataOutput(termsDictBuffer);
+    int dictLength = 0;
     for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
       if ((ord & blockMask) == 0) {
-        if (bufferedOutput.getPosition() > 0) {
-          maxBlockLength =
-              Math.max(maxBlockLength, compressAndGetTermsDictBlockLength(bufferedOutput, ht));
+        if (ord != 0) {
+          // flush the previous block
+          final int uncompressedLength =
+              compressAndGetTermsDictBlockLength(bufferedOutput, dictLength, ht);
+          maxBlockLength = Math.max(maxBlockLength, uncompressedLength);
           bufferedOutput.reset(termsDictBuffer);
         }
         writer.add(data.getFilePointer() - start);
+        // Write the first term both to the index output, and to the buffer where we'll use it as a
+        // dictionary for compression
         data.writeVInt(term.length);
         data.writeBytes(term.bytes, term.offset, term.length);
+        bufferedOutput = maybeGrowBuffer(bufferedOutput, term.length);
+        bufferedOutput.writeBytes(term.bytes, term.offset, term.length);
+        dictLength = term.length;
       } else {
         final int prefixLength = StringHelper.bytesDifference(previous.get(), term);
         final int suffixLength = term.length - prefixLength;
@@ -598,9 +606,10 @@ final class Lucene90DocValuesConsumer extends DocValuesConsumer {
       ++ord;
     }
     // Compress and write out the last block
-    if (bufferedOutput.getPosition() > 0) {
-      maxBlockLength =
-          Math.max(maxBlockLength, compressAndGetTermsDictBlockLength(bufferedOutput, ht));
+    if (bufferedOutput.getPosition() > dictLength) {
+      final int uncompressedLength =
+          compressAndGetTermsDictBlockLength(bufferedOutput, dictLength, ht);
+      maxBlockLength = Math.max(maxBlockLength, uncompressedLength);
     }
     writer.finish();
@@ -619,15 +628,12 @@ final class Lucene90DocValuesConsumer extends DocValuesConsumer {
   }

   private int compressAndGetTermsDictBlockLength(
-      ByteArrayDataOutput bufferedOutput, LZ4.FastCompressionHashTable ht) throws IOException {
-    int uncompressedLength = bufferedOutput.getPosition();
+      ByteArrayDataOutput bufferedOutput, int dictLength, LZ4.FastCompressionHashTable ht)
+      throws IOException {
+    int uncompressedLength = bufferedOutput.getPosition() - dictLength;
     data.writeVInt(uncompressedLength);
-    long before = data.getFilePointer();
-    LZ4.compress(termsDictBuffer, 0, uncompressedLength, data, ht);
-    int compressedLength = (int) (data.getFilePointer() - before);
-    // Block length will be used for creating buffer for decompression, one corner case is that
-    // compressed length might be bigger than un-compressed length, so just return the bigger one.
-    return Math.max(uncompressedLength, compressedLength);
+    LZ4.compressWithDictionary(termsDictBuffer, 0, dictLength, uncompressedLength, data, ht);
+    return uncompressedLength;
   }

   private ByteArrayDataOutput maybeGrowBuffer(ByteArrayDataOutput bufferedOutput, int termLength) {
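
Taken together, the writer-side changes give each 64-term block roughly the
following on-disk shape (a sketch inferred from the hunks above, not a formal
description of the format):

    vInt(firstTerm.length), firstTerm bytes   -- uncompressed; doubles as the LZ4 dictionary
    vInt(uncompressedLength)                  -- length of the suffix data before compression
    LZ4-compressed prefix-coded suffixes of the other 63 terms, with firstTerm as dictionary

Note that compressAndGetTermsDictBlockLength now returns only the uncompressed
length, which is what maxBlockLength needs to track for sizing the reader's
decompression buffer.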

lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java

@@ -1076,8 +1076,9 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
       indexBytes = data.slice("terms-index", entry.termsIndexOffset, entry.termsIndexLength);
       term = new BytesRef(entry.maxTermLength);

+      // add the max term length for the dictionary
       // add 7 padding bytes can help decompression run faster.
-      int bufferSize = entry.maxBlockLength + LZ4_DECOMPRESSOR_PADDING;
+      int bufferSize = entry.maxBlockLength + entry.maxTermLength + LZ4_DECOMPRESSOR_PADDING;
       blockBuffer = new BytesRef(new byte[bufferSize], 0, bufferSize);
     }
@@ -1235,9 +1236,11 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
         if (offset < entry.termsDataLength - 1) {
           // Avoid decompress again if we are reading a same block.
           if (currentCompressedBlockStart != offset) {
-            int decompressLength = bytes.readVInt();
-            // Decompress the remaining of current block
-            LZ4.decompress(bytes, decompressLength, blockBuffer.bytes, 0);
+            blockBuffer.offset = term.length;
+            blockBuffer.length = bytes.readVInt();
+            // Decompress the remaining of current block, using the first term as a dictionary
+            System.arraycopy(term.bytes, 0, blockBuffer.bytes, 0, blockBuffer.offset);
+            LZ4.decompress(bytes, blockBuffer.length, blockBuffer.bytes, blockBuffer.offset);
             currentCompressedBlockStart = offset;
             currentCompressedBlockEnd = bytes.getFilePointer();
           } else {
@@ -1246,7 +1249,8 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
           }

           // Reset the buffer.
-          blockInput = new ByteArrayDataInput(blockBuffer.bytes, 0, blockBuffer.length);
+          blockInput =
+              new ByteArrayDataInput(blockBuffer.bytes, blockBuffer.offset, blockBuffer.length);
         }
       }
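
On the read side, blockBuffer is laid out as [ first term | decompressed suffix
data | padding ]: the first term is copied to the head of the buffer so that LZ4
match copies can reach back into it, and blockBuffer.offset marks where the
suffix data begins. This is why the buffer is sized
entry.maxBlockLength + entry.maxTermLength + LZ4_DECOMPRESSOR_PADDING in the
constructor hunk above.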

lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90DocValuesFormat.java

@@ -928,4 +928,34 @@ public class TestLucene90DocValuesFormat extends BaseCompressingDocValuesFormatTestCase {
       assertEquals(terms.get(i), te.term());
     }
   }
+
+  // Exercise the logic that leverages the first term of a block as a dictionary for suffixes of
+  // other terms
+  public void testTermsEnumDictionary() throws IOException {
+    Directory directory = newDirectory();
+    IndexWriterConfig conf = newIndexWriterConfig();
+    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, conf);
+    Document doc = new Document();
+    SortedDocValuesField field = new SortedDocValuesField("field", new BytesRef("abc0defghijkl"));
+    doc.add(field);
+    iwriter.addDocument(doc);
+    field.setBytesValue(new BytesRef("abc1defghijkl"));
+    iwriter.addDocument(doc);
+    field.setBytesValue(new BytesRef("abc2defghijkl"));
+    iwriter.addDocument(doc);
+    iwriter.forceMerge(1);
+    iwriter.close();
+
+    IndexReader reader = DirectoryReader.open(directory);
+    LeafReader leafReader = getOnlyLeafReader(reader);
+    SortedDocValues values = leafReader.getSortedDocValues("field");
+    TermsEnum termsEnum = values.termsEnum();
+    assertEquals(new BytesRef("abc0defghijkl"), termsEnum.next());
+    assertEquals(new BytesRef("abc1defghijkl"), termsEnum.next());
+    assertEquals(new BytesRef("abc2defghijkl"), termsEnum.next());
+    assertNull(termsEnum.next());
+
+    reader.close();
+    directory.close();
+  }
 }