mirror of https://github.com/apache/lucene.git
LUCENE-6089: Tune CompressionMode.HIGH_COMPRESSION
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1642981 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
34bee4999d
commit
834c63b932
|
@ -48,15 +48,16 @@ public class CompressingStoredFieldsFormat extends StoredFieldsFormat {
|
||||||
private final String segmentSuffix;
|
private final String segmentSuffix;
|
||||||
private final CompressionMode compressionMode;
|
private final CompressionMode compressionMode;
|
||||||
private final int chunkSize;
|
private final int chunkSize;
|
||||||
|
private final int maxDocsPerChunk;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create a new {@link CompressingStoredFieldsFormat} with an empty segment
|
* Create a new {@link CompressingStoredFieldsFormat} with an empty segment
|
||||||
* suffix.
|
* suffix.
|
||||||
*
|
*
|
||||||
* @see CompressingStoredFieldsFormat#CompressingStoredFieldsFormat(String, String, CompressionMode, int)
|
* @see CompressingStoredFieldsFormat#CompressingStoredFieldsFormat(String, String, CompressionMode, int, int)
|
||||||
*/
|
*/
|
||||||
public CompressingStoredFieldsFormat(String formatName, CompressionMode compressionMode, int chunkSize) {
|
public CompressingStoredFieldsFormat(String formatName, CompressionMode compressionMode, int chunkSize, int maxDocsPerChunk) {
|
||||||
this(formatName, "", compressionMode, chunkSize);
|
this(formatName, "", compressionMode, chunkSize, maxDocsPerChunk);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -79,6 +80,8 @@ public class CompressingStoredFieldsFormat extends StoredFieldsFormat {
|
||||||
* <code>chunkSize</code> is the minimum byte size of a chunk of documents.
|
* <code>chunkSize</code> is the minimum byte size of a chunk of documents.
|
||||||
* A value of <code>1</code> can make sense if there is redundancy across
|
* A value of <code>1</code> can make sense if there is redundancy across
|
||||||
* fields.
|
* fields.
|
||||||
|
* <code>maxDocsPerChunk</code> is an upper bound on how many docs may be stored
|
||||||
|
* in a single chunk. This is to bound the CPU costs for highly compressible data.
|
||||||
* <p>
|
* <p>
|
||||||
* Higher values of <code>chunkSize</code> should improve the compression
|
* Higher values of <code>chunkSize</code> should improve the compression
|
||||||
* ratio but will require more memory at indexing time and might make document
|
* ratio but will require more memory at indexing time and might make document
|
||||||
|
@ -88,10 +91,11 @@ public class CompressingStoredFieldsFormat extends StoredFieldsFormat {
|
||||||
* @param formatName the name of the {@link StoredFieldsFormat}
|
* @param formatName the name of the {@link StoredFieldsFormat}
|
||||||
* @param compressionMode the {@link CompressionMode} to use
|
* @param compressionMode the {@link CompressionMode} to use
|
||||||
* @param chunkSize the minimum number of bytes of a single chunk of stored documents
|
* @param chunkSize the minimum number of bytes of a single chunk of stored documents
|
||||||
|
* @param maxDocsPerChunk the maximum number of documents in a single chunk
|
||||||
* @see CompressionMode
|
* @see CompressionMode
|
||||||
*/
|
*/
|
||||||
public CompressingStoredFieldsFormat(String formatName, String segmentSuffix,
|
public CompressingStoredFieldsFormat(String formatName, String segmentSuffix,
|
||||||
CompressionMode compressionMode, int chunkSize) {
|
CompressionMode compressionMode, int chunkSize, int maxDocsPerChunk) {
|
||||||
this.formatName = formatName;
|
this.formatName = formatName;
|
||||||
this.segmentSuffix = segmentSuffix;
|
this.segmentSuffix = segmentSuffix;
|
||||||
this.compressionMode = compressionMode;
|
this.compressionMode = compressionMode;
|
||||||
|
@ -99,7 +103,10 @@ public class CompressingStoredFieldsFormat extends StoredFieldsFormat {
|
||||||
throw new IllegalArgumentException("chunkSize must be >= 1");
|
throw new IllegalArgumentException("chunkSize must be >= 1");
|
||||||
}
|
}
|
||||||
this.chunkSize = chunkSize;
|
this.chunkSize = chunkSize;
|
||||||
|
if (maxDocsPerChunk < 1) {
|
||||||
|
throw new IllegalArgumentException("maxDocsPerChunk must be >= 1");
|
||||||
|
}
|
||||||
|
this.maxDocsPerChunk = maxDocsPerChunk;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -113,13 +120,13 @@ public class CompressingStoredFieldsFormat extends StoredFieldsFormat {
|
||||||
public StoredFieldsWriter fieldsWriter(Directory directory, SegmentInfo si,
|
public StoredFieldsWriter fieldsWriter(Directory directory, SegmentInfo si,
|
||||||
IOContext context) throws IOException {
|
IOContext context) throws IOException {
|
||||||
return new CompressingStoredFieldsWriter(directory, si, segmentSuffix, context,
|
return new CompressingStoredFieldsWriter(directory, si, segmentSuffix, context,
|
||||||
formatName, compressionMode, chunkSize);
|
formatName, compressionMode, chunkSize, maxDocsPerChunk);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return getClass().getSimpleName() + "(compressionMode=" + compressionMode
|
return getClass().getSimpleName() + "(compressionMode=" + compressionMode
|
||||||
+ ", chunkSize=" + chunkSize + ")";
|
+ ", chunkSize=" + chunkSize + ", maxDocsPerChunk=" + maxDocsPerChunk + ")";
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -55,9 +55,6 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
|
||||||
/** Extension of stored fields index file */
|
/** Extension of stored fields index file */
|
||||||
public static final String FIELDS_INDEX_EXTENSION = "fdx";
|
public static final String FIELDS_INDEX_EXTENSION = "fdx";
|
||||||
|
|
||||||
// hard limit on the maximum number of documents per chunk
|
|
||||||
static final int MAX_DOCUMENTS_PER_CHUNK = 128;
|
|
||||||
|
|
||||||
static final int STRING = 0x00;
|
static final int STRING = 0x00;
|
||||||
static final int BYTE_ARR = 0x01;
|
static final int BYTE_ARR = 0x01;
|
||||||
static final int NUMERIC_INT = 0x02;
|
static final int NUMERIC_INT = 0x02;
|
||||||
|
@ -82,6 +79,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
|
||||||
private final CompressionMode compressionMode;
|
private final CompressionMode compressionMode;
|
||||||
private final Compressor compressor;
|
private final Compressor compressor;
|
||||||
private final int chunkSize;
|
private final int chunkSize;
|
||||||
|
private final int maxDocsPerChunk;
|
||||||
|
|
||||||
private final GrowableByteArrayDataOutput bufferedDocs;
|
private final GrowableByteArrayDataOutput bufferedDocs;
|
||||||
private int[] numStoredFields; // number of stored fields
|
private int[] numStoredFields; // number of stored fields
|
||||||
|
@ -91,7 +89,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
|
||||||
|
|
||||||
/** Sole constructor. */
|
/** Sole constructor. */
|
||||||
public CompressingStoredFieldsWriter(Directory directory, SegmentInfo si, String segmentSuffix, IOContext context,
|
public CompressingStoredFieldsWriter(Directory directory, SegmentInfo si, String segmentSuffix, IOContext context,
|
||||||
String formatName, CompressionMode compressionMode, int chunkSize) throws IOException {
|
String formatName, CompressionMode compressionMode, int chunkSize, int maxDocsPerChunk) throws IOException {
|
||||||
assert directory != null;
|
assert directory != null;
|
||||||
this.directory = directory;
|
this.directory = directory;
|
||||||
this.segment = si.name;
|
this.segment = si.name;
|
||||||
|
@ -99,6 +97,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
|
||||||
this.compressionMode = compressionMode;
|
this.compressionMode = compressionMode;
|
||||||
this.compressor = compressionMode.newCompressor();
|
this.compressor = compressionMode.newCompressor();
|
||||||
this.chunkSize = chunkSize;
|
this.chunkSize = chunkSize;
|
||||||
|
this.maxDocsPerChunk = maxDocsPerChunk;
|
||||||
this.docBase = 0;
|
this.docBase = 0;
|
||||||
this.bufferedDocs = new GrowableByteArrayDataOutput(chunkSize);
|
this.bufferedDocs = new GrowableByteArrayDataOutput(chunkSize);
|
||||||
this.numStoredFields = new int[16];
|
this.numStoredFields = new int[16];
|
||||||
|
@ -210,7 +209,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
|
||||||
|
|
||||||
private boolean triggerFlush() {
|
private boolean triggerFlush() {
|
||||||
return bufferedDocs.length >= chunkSize || // chunks of at least chunkSize bytes
|
return bufferedDocs.length >= chunkSize || // chunks of at least chunkSize bytes
|
||||||
numBufferedDocs >= MAX_DOCUMENTS_PER_CHUNK;
|
numBufferedDocs >= maxDocsPerChunk;
|
||||||
}
|
}
|
||||||
|
|
||||||
private void flush() throws IOException {
|
private void flush() throws IOException {
|
||||||
|
|
|
@ -70,7 +70,8 @@ public abstract class CompressionMode {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Compressor newCompressor() {
|
public Compressor newCompressor() {
|
||||||
return new DeflateCompressor(Deflater.BEST_COMPRESSION);
|
// 3 is the highest level that doesn't have lazy match evaluation
|
||||||
|
return new DeflateCompressor(3);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -118,7 +118,7 @@ public final class Lucene50StoredFieldsFormat extends CompressingStoredFieldsFor
|
||||||
|
|
||||||
/** Sole constructor. */
|
/** Sole constructor. */
|
||||||
public Lucene50StoredFieldsFormat() {
|
public Lucene50StoredFieldsFormat() {
|
||||||
super("Lucene50StoredFields", CompressionMode.FAST, 1 << 14);
|
super("Lucene50StoredFields", CompressionMode.FAST, 1 << 14, 128);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -36,16 +36,16 @@ public abstract class CompressingCodec extends FilterCodec {
|
||||||
/**
|
/**
|
||||||
* Create a random instance.
|
* Create a random instance.
|
||||||
*/
|
*/
|
||||||
public static CompressingCodec randomInstance(Random random, int chunkSize, boolean withSegmentSuffix) {
|
public static CompressingCodec randomInstance(Random random, int chunkSize, int maxDocsPerChunk, boolean withSegmentSuffix) {
|
||||||
switch (random.nextInt(4)) {
|
switch (random.nextInt(4)) {
|
||||||
case 0:
|
case 0:
|
||||||
return new FastCompressingCodec(chunkSize, withSegmentSuffix);
|
return new FastCompressingCodec(chunkSize, maxDocsPerChunk, withSegmentSuffix);
|
||||||
case 1:
|
case 1:
|
||||||
return new FastDecompressionCompressingCodec(chunkSize, withSegmentSuffix);
|
return new FastDecompressionCompressingCodec(chunkSize, maxDocsPerChunk, withSegmentSuffix);
|
||||||
case 2:
|
case 2:
|
||||||
return new HighCompressionCompressingCodec(chunkSize, withSegmentSuffix);
|
return new HighCompressionCompressingCodec(chunkSize, maxDocsPerChunk, withSegmentSuffix);
|
||||||
case 3:
|
case 3:
|
||||||
return new DummyCompressingCodec(chunkSize, withSegmentSuffix);
|
return new DummyCompressingCodec(chunkSize, maxDocsPerChunk, withSegmentSuffix);
|
||||||
default:
|
default:
|
||||||
throw new AssertionError();
|
throw new AssertionError();
|
||||||
}
|
}
|
||||||
|
@ -56,14 +56,14 @@ public abstract class CompressingCodec extends FilterCodec {
|
||||||
* suffix
|
* suffix
|
||||||
*/
|
*/
|
||||||
public static CompressingCodec randomInstance(Random random) {
|
public static CompressingCodec randomInstance(Random random) {
|
||||||
return randomInstance(random, RandomInts.randomIntBetween(random, 1, 500), false);
|
return randomInstance(random, RandomInts.randomIntBetween(random, 1, 1 << 15), RandomInts.randomIntBetween(random, 64, 1024), false);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates a random {@link CompressingCodec} that is using a segment suffix
|
* Creates a random {@link CompressingCodec} that is using a segment suffix
|
||||||
*/
|
*/
|
||||||
public static CompressingCodec randomInstance(Random random, boolean withSegmentSuffix) {
|
public static CompressingCodec randomInstance(Random random, boolean withSegmentSuffix) {
|
||||||
return randomInstance(random, RandomInts.randomIntBetween(random, 1, 500), withSegmentSuffix);
|
return randomInstance(random, RandomInts.randomIntBetween(random, 1, 1 << 15), RandomInts.randomIntBetween(random, 64, 1024), withSegmentSuffix);
|
||||||
}
|
}
|
||||||
|
|
||||||
private final CompressingStoredFieldsFormat storedFieldsFormat;
|
private final CompressingStoredFieldsFormat storedFieldsFormat;
|
||||||
|
@ -72,17 +72,17 @@ public abstract class CompressingCodec extends FilterCodec {
|
||||||
/**
|
/**
|
||||||
* Creates a compressing codec with a given segment suffix
|
* Creates a compressing codec with a given segment suffix
|
||||||
*/
|
*/
|
||||||
public CompressingCodec(String name, String segmentSuffix, CompressionMode compressionMode, int chunkSize) {
|
public CompressingCodec(String name, String segmentSuffix, CompressionMode compressionMode, int chunkSize, int maxDocsPerChunk) {
|
||||||
super(name, TestUtil.getDefaultCodec());
|
super(name, TestUtil.getDefaultCodec());
|
||||||
this.storedFieldsFormat = new CompressingStoredFieldsFormat(name, segmentSuffix, compressionMode, chunkSize);
|
this.storedFieldsFormat = new CompressingStoredFieldsFormat(name, segmentSuffix, compressionMode, chunkSize, maxDocsPerChunk);
|
||||||
this.termVectorsFormat = new CompressingTermVectorsFormat(name, segmentSuffix, compressionMode, chunkSize);
|
this.termVectorsFormat = new CompressingTermVectorsFormat(name, segmentSuffix, compressionMode, chunkSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates a compressing codec with an empty segment suffix
|
* Creates a compressing codec with an empty segment suffix
|
||||||
*/
|
*/
|
||||||
public CompressingCodec(String name, CompressionMode compressionMode, int chunkSize) {
|
public CompressingCodec(String name, CompressionMode compressionMode, int chunkSize, int maxDocsPerChunk) {
|
||||||
this(name, "", compressionMode, chunkSize);
|
this(name, "", compressionMode, chunkSize, maxDocsPerChunk);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -21,14 +21,14 @@ package org.apache.lucene.codecs.compressing;
|
||||||
public class FastCompressingCodec extends CompressingCodec {
|
public class FastCompressingCodec extends CompressingCodec {
|
||||||
|
|
||||||
/** Constructor that allows to configure the chunk size. */
|
/** Constructor that allows to configure the chunk size. */
|
||||||
public FastCompressingCodec(int chunkSize, boolean withSegmentSuffix) {
|
public FastCompressingCodec(int chunkSize, int maxDocsPerChunk, boolean withSegmentSuffix) {
|
||||||
super("FastCompressingStoredFields",
|
super("FastCompressingStoredFields",
|
||||||
withSegmentSuffix ? "FastCompressingStoredFields" : "",
|
withSegmentSuffix ? "FastCompressingStoredFields" : "",
|
||||||
CompressionMode.FAST, chunkSize);
|
CompressionMode.FAST, chunkSize, maxDocsPerChunk);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Default constructor. */
|
/** Default constructor. */
|
||||||
public FastCompressingCodec() {
|
public FastCompressingCodec() {
|
||||||
this(1 << 14, false);
|
this(1 << 14, 128, false);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -21,14 +21,14 @@ package org.apache.lucene.codecs.compressing;
|
||||||
public class FastDecompressionCompressingCodec extends CompressingCodec {
|
public class FastDecompressionCompressingCodec extends CompressingCodec {
|
||||||
|
|
||||||
/** Constructor that allows to configure the chunk size. */
|
/** Constructor that allows to configure the chunk size. */
|
||||||
public FastDecompressionCompressingCodec(int chunkSize, boolean withSegmentSuffix) {
|
public FastDecompressionCompressingCodec(int chunkSize, int maxDocsPerChunk, boolean withSegmentSuffix) {
|
||||||
super("FastDecompressionCompressingStoredFields",
|
super("FastDecompressionCompressingStoredFields",
|
||||||
withSegmentSuffix ? "FastDecompressionCompressingStoredFields" : "",
|
withSegmentSuffix ? "FastDecompressionCompressingStoredFields" : "",
|
||||||
CompressionMode.FAST_DECOMPRESSION, chunkSize);
|
CompressionMode.FAST_DECOMPRESSION, chunkSize, maxDocsPerChunk);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Default constructor. */
|
/** Default constructor. */
|
||||||
public FastDecompressionCompressingCodec() {
|
public FastDecompressionCompressingCodec() {
|
||||||
this(1 << 14, false);
|
this(1 << 14, 256, false);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -21,14 +21,16 @@ package org.apache.lucene.codecs.compressing;
|
||||||
public class HighCompressionCompressingCodec extends CompressingCodec {
|
public class HighCompressionCompressingCodec extends CompressingCodec {
|
||||||
|
|
||||||
/** Constructor that allows to configure the chunk size. */
|
/** Constructor that allows to configure the chunk size. */
|
||||||
public HighCompressionCompressingCodec(int chunkSize, boolean withSegmentSuffix) {
|
public HighCompressionCompressingCodec(int chunkSize, int maxDocsPerChunk, boolean withSegmentSuffix) {
|
||||||
super("HighCompressionCompressingStoredFields",
|
super("HighCompressionCompressingStoredFields",
|
||||||
withSegmentSuffix ? "HighCompressionCompressingStoredFields" : "",
|
withSegmentSuffix ? "HighCompressionCompressingStoredFields" : "",
|
||||||
CompressionMode.HIGH_COMPRESSION, chunkSize);
|
CompressionMode.HIGH_COMPRESSION, chunkSize, maxDocsPerChunk);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Default constructor. */
|
/** Default constructor. */
|
||||||
public HighCompressionCompressingCodec() {
|
public HighCompressionCompressingCodec() {
|
||||||
this(1 << 14, false);
|
// no need to have a higher block length than 32KB since deflate splits
|
||||||
|
// into blocks of 32KB anyway, and this is a lower bound (try to avoid > 32KB)
|
||||||
|
this(24576, 512, false);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -83,15 +83,15 @@ public class DummyCompressingCodec extends CompressingCodec {
|
||||||
};
|
};
|
||||||
|
|
||||||
/** Constructor that allows to configure the chunk size. */
|
/** Constructor that allows to configure the chunk size. */
|
||||||
public DummyCompressingCodec(int chunkSize, boolean withSegmentSuffix) {
|
public DummyCompressingCodec(int chunkSize, int maxDocsPerChunk, boolean withSegmentSuffix) {
|
||||||
super("DummyCompressingStoredFields",
|
super("DummyCompressingStoredFields",
|
||||||
withSegmentSuffix ? "DummyCompressingStoredFields" : "",
|
withSegmentSuffix ? "DummyCompressingStoredFields" : "",
|
||||||
DUMMY, chunkSize);
|
DUMMY, chunkSize, maxDocsPerChunk);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Default constructor. */
|
/** Default constructor. */
|
||||||
public DummyCompressingCodec() {
|
public DummyCompressingCodec() {
|
||||||
this(1 << 14, false);
|
this(1 << 14, 128, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue