mirror of https://github.com/apache/lucene.git
LUCENE-9525: Better handle small documents with Lucene87StoredFieldsFormat. (#1876)
Instead of being configured with a fixed dictionary size and sub-block size, the format now aims for 10 sub blocks per block and derives both the dictionary size and the sub-block size from the overall length of the block being compressed.
This commit is contained in:
parent
93094ef7e4
commit
ad71bee016
|
@ -21,6 +21,7 @@ import java.util.zip.DataFormatException;
|
|||
import java.util.zip.Deflater;
|
||||
import java.util.zip.Inflater;
|
||||
|
||||
import org.apache.lucene.util.BytesRefBuilder;
|
||||
import org.apache.lucene.util.SuppressForbidden;
|
||||
|
||||
/**
|
||||
|
@ -39,16 +40,14 @@ interface BugfixDeflater_JDK8252739 {
|
|||
* on a {@code Deflater}.
|
||||
* */
|
||||
@SuppressForbidden(reason = "Works around bug, so it must call forbidden method")
|
||||
public static BugfixDeflater_JDK8252739 createBugfix(Deflater deflater, int dictLength) {
|
||||
if (dictLength < 0) {
|
||||
throw new IllegalArgumentException("dictLength must be >= 0");
|
||||
}
|
||||
public static BugfixDeflater_JDK8252739 createBugfix(Deflater deflater) {
|
||||
if (IS_BUGGY_JDK) {
|
||||
final byte[] dictBytesScratch = new byte[dictLength];
|
||||
final BytesRefBuilder dictBytesScratch = new BytesRefBuilder();
|
||||
return (dictBytes, off, len) -> {
|
||||
if (off > 0) {
|
||||
System.arraycopy(dictBytes, off, dictBytesScratch, 0, len);
|
||||
deflater.setDictionary(dictBytesScratch, 0, len);
|
||||
dictBytesScratch.grow(len);
|
||||
System.arraycopy(dictBytes, off, dictBytesScratch.bytes(), 0, len);
|
||||
deflater.setDictionary(dictBytesScratch.bytes(), 0, len);
|
||||
} else {
|
||||
deflater.setDictionary(dictBytes, off, len);
|
||||
}
|
||||
|
|
|
@ -39,20 +39,20 @@ import org.apache.lucene.util.BytesRef;
|
|||
*/
|
||||
public final class DeflateWithPresetDictCompressionMode extends CompressionMode {
|
||||
|
||||
private final int dictLength, subBlockLength;
|
||||
// Shoot for 10 sub blocks
|
||||
private static final int NUM_SUB_BLOCKS = 10;
|
||||
// And a dictionary that is about 1/6 the size of a sub block
|
||||
private static final int DICT_SIZE_FACTOR = 6;
|
||||
|
||||
/** Sole constructor. */
|
||||
public DeflateWithPresetDictCompressionMode(int dictLength, int subBlockLength) {
|
||||
this.dictLength = dictLength;
|
||||
this.subBlockLength = subBlockLength;
|
||||
}
|
||||
public DeflateWithPresetDictCompressionMode() {}
|
||||
|
||||
@Override
|
||||
public Compressor newCompressor() {
|
||||
// notes:
|
||||
// 3 is the highest level that doesn't have lazy match evaluation
|
||||
// 6 is the default, higher than that is just a waste of cpu
|
||||
return new DeflateWithPresetDictCompressor(6, dictLength, subBlockLength);
|
||||
return new DeflateWithPresetDictCompressor(6);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -155,18 +155,15 @@ public final class DeflateWithPresetDictCompressionMode extends CompressionMode
|
|||
|
||||
private static class DeflateWithPresetDictCompressor extends Compressor {
|
||||
|
||||
private final int dictLength, blockLength;
|
||||
final Deflater compressor;
|
||||
final BugfixDeflater_JDK8252739 deflaterBugfix;
|
||||
byte[] compressed;
|
||||
boolean closed;
|
||||
|
||||
DeflateWithPresetDictCompressor(int level, int dictLength, int blockLength) {
|
||||
DeflateWithPresetDictCompressor(int level) {
|
||||
compressor = new Deflater(level, true);
|
||||
deflaterBugfix = BugfixDeflater_JDK8252739.createBugfix(compressor, dictLength);
|
||||
deflaterBugfix = BugfixDeflater_JDK8252739.createBugfix(compressor);
|
||||
compressed = new byte[64];
|
||||
this.dictLength = dictLength;
|
||||
this.blockLength = blockLength;
|
||||
}
|
||||
|
||||
private void doCompress(byte[] bytes, int off, int len, DataOutput out) throws IOException {
|
||||
|
@ -198,7 +195,8 @@ public final class DeflateWithPresetDictCompressionMode extends CompressionMode
|
|||
|
||||
@Override
|
||||
public void compress(byte[] bytes, int off, int len, DataOutput out) throws IOException {
|
||||
final int dictLength = Math.min(this.dictLength, len);
|
||||
final int dictLength = len / (NUM_SUB_BLOCKS * DICT_SIZE_FACTOR);
|
||||
final int blockLength = (len - dictLength + NUM_SUB_BLOCKS - 1) / NUM_SUB_BLOCKS;
|
||||
out.writeVInt(dictLength);
|
||||
out.writeVInt(blockLength);
|
||||
final int end = off + len;
|
||||
|
|
|
@ -36,17 +36,17 @@ import org.apache.lucene.util.compress.LZ4;
|
|||
*/
|
||||
public final class LZ4WithPresetDictCompressionMode extends CompressionMode {
|
||||
|
||||
private final int dictLength, subBlockLength;
|
||||
// Shoot for 10 sub blocks
|
||||
private static final int NUM_SUB_BLOCKS = 10;
|
||||
// And a dictionary that is about 1/16 the size of a sub block
|
||||
private static final int DICT_SIZE_FACTOR = 16;
|
||||
|
||||
/** Sole constructor. */
|
||||
public LZ4WithPresetDictCompressionMode(int dictLength, int subBlockLength) {
|
||||
this.dictLength = dictLength;
|
||||
this.subBlockLength = subBlockLength;
|
||||
}
|
||||
public LZ4WithPresetDictCompressionMode() {}
|
||||
|
||||
@Override
|
||||
public Compressor newCompressor() {
|
||||
return new LZ4WithPresetDictCompressor(dictLength, subBlockLength);
|
||||
return new LZ4WithPresetDictCompressor();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -147,18 +147,14 @@ public final class LZ4WithPresetDictCompressionMode extends CompressionMode {
|
|||
|
||||
private static class LZ4WithPresetDictCompressor extends Compressor {
|
||||
|
||||
final int dictLength;
|
||||
final int blockLength;
|
||||
final ByteBuffersDataOutput compressed;
|
||||
final LZ4.FastCompressionHashTable hashTable;
|
||||
final byte[] buffer;
|
||||
byte[] buffer;
|
||||
|
||||
LZ4WithPresetDictCompressor(int dictLength, int blockLength) {
|
||||
LZ4WithPresetDictCompressor() {
|
||||
compressed = ByteBuffersDataOutput.newResettableInstance();
|
||||
hashTable = new LZ4.FastCompressionHashTable();
|
||||
this.dictLength = dictLength;
|
||||
this.blockLength = blockLength;
|
||||
buffer = new byte[dictLength + blockLength];
|
||||
buffer = BytesRef.EMPTY_BYTES;
|
||||
}
|
||||
|
||||
private void doCompress(byte[] bytes, int dictLen, int len, DataOutput out) throws IOException {
|
||||
|
@ -170,7 +166,9 @@ public final class LZ4WithPresetDictCompressionMode extends CompressionMode {
|
|||
|
||||
@Override
|
||||
public void compress(byte[] bytes, int off, int len, DataOutput out) throws IOException {
|
||||
final int dictLength = Math.min(this.dictLength, len);
|
||||
final int dictLength = len / (NUM_SUB_BLOCKS * DICT_SIZE_FACTOR);
|
||||
final int blockLength = (len - dictLength + NUM_SUB_BLOCKS - 1) / NUM_SUB_BLOCKS;
|
||||
buffer = ArrayUtil.grow(buffer, dictLength + blockLength);
|
||||
out.writeVInt(dictLength);
|
||||
out.writeVInt(blockLength);
|
||||
final int end = off + len;
|
||||
|
|
|
@ -151,35 +151,16 @@ public class Lucene87StoredFieldsFormat extends StoredFieldsFormat {
|
|||
}
|
||||
}
|
||||
|
||||
// 8kB seems to be a good trade-off between higher compression rates by not
|
||||
// having to fully bootstrap a dictionary, and indexing rate by not spending
|
||||
// too much CPU initializing data-structures to find strings in this preset
|
||||
// dictionary.
|
||||
private static final int BEST_COMPRESSION_DICT_LENGTH = 8 * 1024;
|
||||
// 48kB seems like a nice trade-off because it's small enough to keep
|
||||
// retrieval fast, yet sub blocks can find strings in a window of 26kB of
|
||||
// data on average (the window grows from 8kB to 32kB in the first 24kB, and
|
||||
// then DEFLATE can use 32kB for the last 24kB) which is close enough to the
|
||||
// maximum window length of DEFLATE of 32kB.
|
||||
private static final int BEST_COMPRESSION_SUB_BLOCK_LENGTH = 48 * 1024;
|
||||
// We shoot for 10 sub blocks per block, which should hopefully amortize the
|
||||
// space overhead of having the first 8kB compressed without any preset dict,
|
||||
// and then remove 8kB in order to avoid creating a tiny 11th sub block if
|
||||
// documents are small.
|
||||
private static final int BEST_COMPRESSION_BLOCK_LENGTH = BEST_COMPRESSION_DICT_LENGTH + 10 * BEST_COMPRESSION_SUB_BLOCK_LENGTH - 8 * 1024;
|
||||
// Shoot for 10 sub blocks of 48kB each.
|
||||
private static final int BEST_COMPRESSION_BLOCK_LENGTH = 10 * 48 * 1024;
|
||||
|
||||
/** Compression mode for {@link Mode#BEST_COMPRESSION} */
|
||||
public static final CompressionMode BEST_COMPRESSION_MODE = new DeflateWithPresetDictCompressionMode(BEST_COMPRESSION_DICT_LENGTH, BEST_COMPRESSION_SUB_BLOCK_LENGTH);
|
||||
public static final CompressionMode BEST_COMPRESSION_MODE = new DeflateWithPresetDictCompressionMode();
|
||||
|
||||
// We need to re-initialize the hash table for every sub block with the
|
||||
// content of the dictionary, so we keep it small to not hurt indexing.
|
||||
private static final int BEST_SPEED_DICT_LENGTH = 4 * 1024;
|
||||
// 60kB so that dict_length + block_length == max window size
|
||||
private static final int BEST_SPEED_SUB_BLOCK_LENGTH = 60 * 1024;
|
||||
// shoot for 10 sub blocks in addition to the dictionary
|
||||
private static final int BEST_SPEED_BLOCK_LENGTH = BEST_SPEED_DICT_LENGTH + 10 * BEST_SPEED_SUB_BLOCK_LENGTH - 8 * 1024;
|
||||
// Shoot for 10 sub blocks of 60kB each.
|
||||
private static final int BEST_SPEED_BLOCK_LENGTH = 10 * 60 * 1024;
|
||||
|
||||
/** Compression mode for {@link Mode#BEST_SPEED} */
|
||||
public static final CompressionMode BEST_SPEED_MODE = new LZ4WithPresetDictCompressionMode(BEST_SPEED_DICT_LENGTH, BEST_SPEED_SUB_BLOCK_LENGTH);
|
||||
public static final CompressionMode BEST_SPEED_MODE = new LZ4WithPresetDictCompressionMode();
|
||||
|
||||
}
|
||||
|
|
|
@ -25,7 +25,7 @@ public class DeflateWithPresetCompressingCodec extends CompressingCodec {
|
|||
public DeflateWithPresetCompressingCodec(int chunkSize, int maxDocsPerChunk, boolean withSegmentSuffix, int blockSize) {
|
||||
super("DeflateWithPresetCompressingStoredFieldsData",
|
||||
withSegmentSuffix ? "DeflateWithPresetCompressingStoredFields" : "",
|
||||
new DeflateWithPresetDictCompressionMode(chunkSize/10, chunkSize/3+1), chunkSize, maxDocsPerChunk, blockSize);
|
||||
new DeflateWithPresetDictCompressionMode(), chunkSize, maxDocsPerChunk, blockSize);
|
||||
}
|
||||
|
||||
/** No-arg constructor. */
|
||||
|
|
|
@ -25,7 +25,7 @@ public class LZ4WithPresetCompressingCodec extends CompressingCodec {
|
|||
public LZ4WithPresetCompressingCodec(int chunkSize, int maxDocsPerChunk, boolean withSegmentSuffix, int blockSize) {
|
||||
super("LZ4WithPresetCompressingStoredFieldsData",
|
||||
withSegmentSuffix ? "DeflateWithPresetCompressingStoredFields" : "",
|
||||
new LZ4WithPresetDictCompressionMode(chunkSize/10, chunkSize/3+1), chunkSize, maxDocsPerChunk, blockSize);
|
||||
new LZ4WithPresetDictCompressionMode(), chunkSize, maxDocsPerChunk, blockSize);
|
||||
}
|
||||
|
||||
/** No-arg constructor. */
|
||||
|
|
Loading…
Reference in New Issue