LUCENE-9525: Better handle small documents with Lucene87StoredFieldsFormat. (#1876)

Instead of configuring a dictionary size and a block size, the format
now tries to have 10 sub blocks per bigger block, and adapts the size of
the dictionary and of the sub blocks to this overall block size.
This commit is contained in:
Adrien Grand 2020-09-16 13:09:00 +02:00 committed by GitHub
parent 93094ef7e4
commit ad71bee016
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 36 additions and 60 deletions

View File

@@ -21,6 +21,7 @@ import java.util.zip.DataFormatException;
import java.util.zip.Deflater;
import java.util.zip.Inflater;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.SuppressForbidden;
/**
@@ -39,16 +40,14 @@ interface BugfixDeflater_JDK8252739 {
* on a {@code Deflater}.
* */
@SuppressForbidden(reason = "Works around bug, so it must call forbidden method")
public static BugfixDeflater_JDK8252739 createBugfix(Deflater deflater, int dictLength) {
if (dictLength < 0) {
throw new IllegalArgumentException("dictLength must be >= 0");
}
public static BugfixDeflater_JDK8252739 createBugfix(Deflater deflater) {
if (IS_BUGGY_JDK) {
final byte[] dictBytesScratch = new byte[dictLength];
final BytesRefBuilder dictBytesScratch = new BytesRefBuilder();
return (dictBytes, off, len) -> {
if (off > 0) {
System.arraycopy(dictBytes, off, dictBytesScratch, 0, len);
deflater.setDictionary(dictBytesScratch, 0, len);
dictBytesScratch.grow(len);
System.arraycopy(dictBytes, off, dictBytesScratch.bytes(), 0, len);
deflater.setDictionary(dictBytesScratch.bytes(), 0, len);
} else {
deflater.setDictionary(dictBytes, off, len);
}

View File

@@ -39,20 +39,20 @@ import org.apache.lucene.util.BytesRef;
*/
public final class DeflateWithPresetDictCompressionMode extends CompressionMode {
private final int dictLength, subBlockLength;
// Shoot for 10 sub blocks
private static final int NUM_SUB_BLOCKS = 10;
// And a dictionary whose size is about 6x smaller than sub blocks
private static final int DICT_SIZE_FACTOR = 6;
/** Sole constructor. */
public DeflateWithPresetDictCompressionMode(int dictLength, int subBlockLength) {
this.dictLength = dictLength;
this.subBlockLength = subBlockLength;
}
public DeflateWithPresetDictCompressionMode() {}
@Override
public Compressor newCompressor() {
// notes:
// 3 is the highest level that doesn't have lazy match evaluation
// 6 is the default, higher than that is just a waste of cpu
return new DeflateWithPresetDictCompressor(6, dictLength, subBlockLength);
return new DeflateWithPresetDictCompressor(6);
}
@Override
@@ -155,18 +155,15 @@ public final class DeflateWithPresetDictCompressionMode extends CompressionMode
private static class DeflateWithPresetDictCompressor extends Compressor {
private final int dictLength, blockLength;
final Deflater compressor;
final BugfixDeflater_JDK8252739 deflaterBugfix;
byte[] compressed;
boolean closed;
DeflateWithPresetDictCompressor(int level, int dictLength, int blockLength) {
DeflateWithPresetDictCompressor(int level) {
compressor = new Deflater(level, true);
deflaterBugfix = BugfixDeflater_JDK8252739.createBugfix(compressor, dictLength);
deflaterBugfix = BugfixDeflater_JDK8252739.createBugfix(compressor);
compressed = new byte[64];
this.dictLength = dictLength;
this.blockLength = blockLength;
}
private void doCompress(byte[] bytes, int off, int len, DataOutput out) throws IOException {
@@ -198,7 +195,8 @@ public final class DeflateWithPresetDictCompressionMode extends CompressionMode
@Override
public void compress(byte[] bytes, int off, int len, DataOutput out) throws IOException {
final int dictLength = Math.min(this.dictLength, len);
final int dictLength = len / (NUM_SUB_BLOCKS * DICT_SIZE_FACTOR);
final int blockLength = (len - dictLength + NUM_SUB_BLOCKS - 1) / NUM_SUB_BLOCKS;
out.writeVInt(dictLength);
out.writeVInt(blockLength);
final int end = off + len;

View File

@@ -36,17 +36,17 @@ import org.apache.lucene.util.compress.LZ4;
*/
public final class LZ4WithPresetDictCompressionMode extends CompressionMode {
private final int dictLength, subBlockLength;
// Shoot for 10 sub blocks
private static final int NUM_SUB_BLOCKS = 10;
// And a dictionary whose size is about 16x smaller than sub blocks
private static final int DICT_SIZE_FACTOR = 16;
/** Sole constructor. */
public LZ4WithPresetDictCompressionMode(int dictLength, int subBlockLength) {
this.dictLength = dictLength;
this.subBlockLength = subBlockLength;
}
public LZ4WithPresetDictCompressionMode() {}
@Override
public Compressor newCompressor() {
return new LZ4WithPresetDictCompressor(dictLength, subBlockLength);
return new LZ4WithPresetDictCompressor();
}
@Override
@@ -147,18 +147,14 @@ public final class LZ4WithPresetDictCompressionMode extends CompressionMode {
private static class LZ4WithPresetDictCompressor extends Compressor {
final int dictLength;
final int blockLength;
final ByteBuffersDataOutput compressed;
final LZ4.FastCompressionHashTable hashTable;
final byte[] buffer;
byte[] buffer;
LZ4WithPresetDictCompressor(int dictLength, int blockLength) {
LZ4WithPresetDictCompressor() {
compressed = ByteBuffersDataOutput.newResettableInstance();
hashTable = new LZ4.FastCompressionHashTable();
this.dictLength = dictLength;
this.blockLength = blockLength;
buffer = new byte[dictLength + blockLength];
buffer = BytesRef.EMPTY_BYTES;
}
private void doCompress(byte[] bytes, int dictLen, int len, DataOutput out) throws IOException {
@@ -170,7 +166,9 @@ public final class LZ4WithPresetDictCompressionMode extends CompressionMode {
@Override
public void compress(byte[] bytes, int off, int len, DataOutput out) throws IOException {
final int dictLength = Math.min(this.dictLength, len);
final int dictLength = len / (NUM_SUB_BLOCKS * DICT_SIZE_FACTOR);
final int blockLength = (len - dictLength + NUM_SUB_BLOCKS - 1) / NUM_SUB_BLOCKS;
buffer = ArrayUtil.grow(buffer, dictLength + blockLength);
out.writeVInt(dictLength);
out.writeVInt(blockLength);
final int end = off + len;

View File

@@ -151,35 +151,16 @@ public class Lucene87StoredFieldsFormat extends StoredFieldsFormat {
}
}
// 8kB seems to be a good trade-off between higher compression rates by not
// having to fully bootstrap a dictionary, and indexing rate by not spending
// too much CPU initializing data-structures to find strings in this preset
// dictionary.
private static final int BEST_COMPRESSION_DICT_LENGTH = 8 * 1024;
// 48kB seems like a nice trade-off because it's small enough to keep
// retrieval fast, yet sub blocks can find strings in a window of 26kB of
// data on average (the window grows from 8kB to 32kB in the first 24kB, and
// then DEFLATE can use 32kB for the last 24kB) which is close enough to the
// maximum window length of DEFLATE of 32kB.
private static final int BEST_COMPRESSION_SUB_BLOCK_LENGTH = 48 * 1024;
// We shoot for 10 sub blocks per block, which should hopefully amortize the
// space overhead of having the first 8kB compressed without any preset dict,
// and then remove 8kB in order to avoid creating a tiny 11th sub block if
// documents are small.
private static final int BEST_COMPRESSION_BLOCK_LENGTH = BEST_COMPRESSION_DICT_LENGTH + 10 * BEST_COMPRESSION_SUB_BLOCK_LENGTH - 8 * 1024;
// Shoot for 10 sub blocks of 48kB each.
private static final int BEST_COMPRESSION_BLOCK_LENGTH = 10 * 48 * 1024;
/** Compression mode for {@link Mode#BEST_COMPRESSION} */
public static final CompressionMode BEST_COMPRESSION_MODE = new DeflateWithPresetDictCompressionMode(BEST_COMPRESSION_DICT_LENGTH, BEST_COMPRESSION_SUB_BLOCK_LENGTH);
public static final CompressionMode BEST_COMPRESSION_MODE = new DeflateWithPresetDictCompressionMode();
// We need to re-initialize the hash table for every sub block with the
// content of the dictionary, so we keep it small to not hurt indexing.
private static final int BEST_SPEED_DICT_LENGTH = 4 * 1024;
// 60kB so that dict_length + block_length == max window size
private static final int BEST_SPEED_SUB_BLOCK_LENGTH = 60 * 1024;
// shoot for 10 sub blocks in addition to the dictionary
private static final int BEST_SPEED_BLOCK_LENGTH = BEST_SPEED_DICT_LENGTH + 10 * BEST_SPEED_SUB_BLOCK_LENGTH - 8 * 1024;
// Shoot for 10 sub blocks of 60kB each.
private static final int BEST_SPEED_BLOCK_LENGTH = 10 * 60 * 1024;
/** Compression mode for {@link Mode#BEST_SPEED} */
public static final CompressionMode BEST_SPEED_MODE = new LZ4WithPresetDictCompressionMode(BEST_SPEED_DICT_LENGTH, BEST_SPEED_SUB_BLOCK_LENGTH);
public static final CompressionMode BEST_SPEED_MODE = new LZ4WithPresetDictCompressionMode();
}

View File

@@ -25,7 +25,7 @@ public class DeflateWithPresetCompressingCodec extends CompressingCodec {
public DeflateWithPresetCompressingCodec(int chunkSize, int maxDocsPerChunk, boolean withSegmentSuffix, int blockSize) {
super("DeflateWithPresetCompressingStoredFieldsData",
withSegmentSuffix ? "DeflateWithPresetCompressingStoredFields" : "",
new DeflateWithPresetDictCompressionMode(chunkSize/10, chunkSize/3+1), chunkSize, maxDocsPerChunk, blockSize);
new DeflateWithPresetDictCompressionMode(), chunkSize, maxDocsPerChunk, blockSize);
}
/** No-arg constructor. */

View File

@@ -25,7 +25,7 @@ public class LZ4WithPresetCompressingCodec extends CompressingCodec {
public LZ4WithPresetCompressingCodec(int chunkSize, int maxDocsPerChunk, boolean withSegmentSuffix, int blockSize) {
super("LZ4WithPresetCompressingStoredFieldsData",
withSegmentSuffix ? "DeflateWithPresetCompressingStoredFields" : "",
new LZ4WithPresetDictCompressionMode(chunkSize/10, chunkSize/3+1), chunkSize, maxDocsPerChunk, blockSize);
new LZ4WithPresetDictCompressionMode(), chunkSize, maxDocsPerChunk, blockSize);
}
/** No-arg constructor. */