mirror of https://github.com/apache/lucene.git
LUCENE-10536: Slightly better compression of doc values' terms dictionaries. (#838)
Doc values terms dictionaries keep the first term of each block uncompressed so that they can somewhat efficiently perform binary searches across blocks. Suffixes of the other 63 terms are compressed together using LZ4 to leverage redundancy across suffixes. This change improves compression a bit by using the first (uncompressed) term of each block as a dictionary when compressing suffixes of the 63 other terms. This helps with compressing the first few suffixes when there's not much context yet that can be leveraged to find duplicates.
This commit is contained in:
parent
96036bca9f
commit
8f89db8048
|
@ -141,6 +141,10 @@ Optimizations
|
|||
* LUCENE-8836: Speed up calls to TermsEnum#lookupOrd on doc values terms enums
|
||||
and sequences of increasing ords. (Bruno Roustant, Adrien Grand)
|
||||
|
||||
* LUCENE-10536: Doc values terms dictionaries now use the first (uncompressed)
|
||||
term of each block as a dictionary when compressing suffixes of the other 63
|
||||
terms of the block. (Adrien Grand)
|
||||
|
||||
* LUCENE-10411: Add nearest neighbors vectors support to ExitableDirectoryReader.
|
||||
(Zach Chen, Adrien Grand, Julie Tibshirani, Tomoko Uchida)
|
||||
|
||||
|
|
|
@ -565,18 +565,26 @@ final class Lucene90DocValuesConsumer extends DocValuesConsumer {
|
|||
|
||||
LZ4.FastCompressionHashTable ht = new LZ4.FastCompressionHashTable();
|
||||
ByteArrayDataOutput bufferedOutput = new ByteArrayDataOutput(termsDictBuffer);
|
||||
int dictLength = 0;
|
||||
|
||||
for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
|
||||
if ((ord & blockMask) == 0) {
|
||||
if (bufferedOutput.getPosition() > 0) {
|
||||
maxBlockLength =
|
||||
Math.max(maxBlockLength, compressAndGetTermsDictBlockLength(bufferedOutput, ht));
|
||||
if (ord != 0) {
|
||||
// flush the previous block
|
||||
final int uncompressedLength =
|
||||
compressAndGetTermsDictBlockLength(bufferedOutput, dictLength, ht);
|
||||
maxBlockLength = Math.max(maxBlockLength, uncompressedLength);
|
||||
bufferedOutput.reset(termsDictBuffer);
|
||||
}
|
||||
|
||||
writer.add(data.getFilePointer() - start);
|
||||
// Write the first term both to the index output, and to the buffer where we'll use it as a
|
||||
// dictionary for compression
|
||||
data.writeVInt(term.length);
|
||||
data.writeBytes(term.bytes, term.offset, term.length);
|
||||
bufferedOutput = maybeGrowBuffer(bufferedOutput, term.length);
|
||||
bufferedOutput.writeBytes(term.bytes, term.offset, term.length);
|
||||
dictLength = term.length;
|
||||
} else {
|
||||
final int prefixLength = StringHelper.bytesDifference(previous.get(), term);
|
||||
final int suffixLength = term.length - prefixLength;
|
||||
|
@ -598,9 +606,10 @@ final class Lucene90DocValuesConsumer extends DocValuesConsumer {
|
|||
++ord;
|
||||
}
|
||||
// Compress and write out the last block
|
||||
if (bufferedOutput.getPosition() > 0) {
|
||||
maxBlockLength =
|
||||
Math.max(maxBlockLength, compressAndGetTermsDictBlockLength(bufferedOutput, ht));
|
||||
if (bufferedOutput.getPosition() > dictLength) {
|
||||
final int uncompressedLength =
|
||||
compressAndGetTermsDictBlockLength(bufferedOutput, dictLength, ht);
|
||||
maxBlockLength = Math.max(maxBlockLength, uncompressedLength);
|
||||
}
|
||||
|
||||
writer.finish();
|
||||
|
@ -619,15 +628,12 @@ final class Lucene90DocValuesConsumer extends DocValuesConsumer {
|
|||
}
|
||||
|
||||
private int compressAndGetTermsDictBlockLength(
|
||||
ByteArrayDataOutput bufferedOutput, LZ4.FastCompressionHashTable ht) throws IOException {
|
||||
int uncompressedLength = bufferedOutput.getPosition();
|
||||
ByteArrayDataOutput bufferedOutput, int dictLength, LZ4.FastCompressionHashTable ht)
|
||||
throws IOException {
|
||||
int uncompressedLength = bufferedOutput.getPosition() - dictLength;
|
||||
data.writeVInt(uncompressedLength);
|
||||
long before = data.getFilePointer();
|
||||
LZ4.compress(termsDictBuffer, 0, uncompressedLength, data, ht);
|
||||
int compressedLength = (int) (data.getFilePointer() - before);
|
||||
// Block length will be used for creating buffer for decompression, one corner case is that
|
||||
// compressed length might be bigger than un-compressed length, so just return the bigger one.
|
||||
return Math.max(uncompressedLength, compressedLength);
|
||||
LZ4.compressWithDictionary(termsDictBuffer, 0, dictLength, uncompressedLength, data, ht);
|
||||
return uncompressedLength;
|
||||
}
|
||||
|
||||
private ByteArrayDataOutput maybeGrowBuffer(ByteArrayDataOutput bufferedOutput, int termLength) {
|
||||
|
|
|
@ -1076,8 +1076,9 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
|
|||
indexBytes = data.slice("terms-index", entry.termsIndexOffset, entry.termsIndexLength);
|
||||
term = new BytesRef(entry.maxTermLength);
|
||||
|
||||
// add the max term length for the dictionary
|
||||
// adding 7 padding bytes can help decompression run faster.
|
||||
int bufferSize = entry.maxBlockLength + LZ4_DECOMPRESSOR_PADDING;
|
||||
int bufferSize = entry.maxBlockLength + entry.maxTermLength + LZ4_DECOMPRESSOR_PADDING;
|
||||
blockBuffer = new BytesRef(new byte[bufferSize], 0, bufferSize);
|
||||
}
|
||||
|
||||
|
@ -1235,9 +1236,11 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
|
|||
if (offset < entry.termsDataLength - 1) {
|
||||
// Avoid decompressing again if we are reading the same block.
|
||||
if (currentCompressedBlockStart != offset) {
|
||||
int decompressLength = bytes.readVInt();
|
||||
// Decompress the remaining of current block
|
||||
LZ4.decompress(bytes, decompressLength, blockBuffer.bytes, 0);
|
||||
blockBuffer.offset = term.length;
|
||||
blockBuffer.length = bytes.readVInt();
|
||||
// Decompress the remainder of the current block, using the first term as a dictionary
|
||||
System.arraycopy(term.bytes, 0, blockBuffer.bytes, 0, blockBuffer.offset);
|
||||
LZ4.decompress(bytes, blockBuffer.length, blockBuffer.bytes, blockBuffer.offset);
|
||||
currentCompressedBlockStart = offset;
|
||||
currentCompressedBlockEnd = bytes.getFilePointer();
|
||||
} else {
|
||||
|
@ -1246,7 +1249,8 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
|
|||
}
|
||||
|
||||
// Reset the buffer.
|
||||
blockInput = new ByteArrayDataInput(blockBuffer.bytes, 0, blockBuffer.length);
|
||||
blockInput =
|
||||
new ByteArrayDataInput(blockBuffer.bytes, blockBuffer.offset, blockBuffer.length);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -928,4 +928,34 @@ public class TestLucene90DocValuesFormat extends BaseCompressingDocValuesFormatT
|
|||
assertEquals(terms.get(i), te.term());
|
||||
}
|
||||
}
|
||||
|
||||
// Exercise the logic that leverages the first term of a block as a dictionary for suffixes of
|
||||
// other terms
|
||||
public void testTermsEnumDictionary() throws IOException {
|
||||
Directory directory = newDirectory();
|
||||
IndexWriterConfig conf = newIndexWriterConfig();
|
||||
RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, conf);
|
||||
Document doc = new Document();
|
||||
SortedDocValuesField field = new SortedDocValuesField("field", new BytesRef("abc0defghijkl"));
|
||||
doc.add(field);
|
||||
iwriter.addDocument(doc);
|
||||
field.setBytesValue(new BytesRef("abc1defghijkl"));
|
||||
iwriter.addDocument(doc);
|
||||
field.setBytesValue(new BytesRef("abc2defghijkl"));
|
||||
iwriter.addDocument(doc);
|
||||
iwriter.forceMerge(1);
|
||||
iwriter.close();
|
||||
|
||||
IndexReader reader = DirectoryReader.open(directory);
|
||||
LeafReader leafReader = getOnlyLeafReader(reader);
|
||||
SortedDocValues values = leafReader.getSortedDocValues("field");
|
||||
TermsEnum termsEnum = values.termsEnum();
|
||||
assertEquals(new BytesRef("abc0defghijkl"), termsEnum.next());
|
||||
assertEquals(new BytesRef("abc1defghijkl"), termsEnum.next());
|
||||
assertEquals(new BytesRef("abc2defghijkl"), termsEnum.next());
|
||||
assertNull(termsEnum.next());
|
||||
|
||||
reader.close();
|
||||
directory.close();
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue