LUCENE-6089: Tune CompressionMode.HIGH_COMPRESSION

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1642981 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2014-12-02 18:20:03 +00:00
parent 34bee4999d
commit 834c63b932
9 changed files with 46 additions and 37 deletions

View File

@@ -48,15 +48,16 @@ public class CompressingStoredFieldsFormat extends StoredFieldsFormat {
private final String segmentSuffix; private final String segmentSuffix;
private final CompressionMode compressionMode; private final CompressionMode compressionMode;
private final int chunkSize; private final int chunkSize;
private final int maxDocsPerChunk;
/** /**
* Create a new {@link CompressingStoredFieldsFormat} with an empty segment * Create a new {@link CompressingStoredFieldsFormat} with an empty segment
* suffix. * suffix.
* *
* @see CompressingStoredFieldsFormat#CompressingStoredFieldsFormat(String, String, CompressionMode, int) * @see CompressingStoredFieldsFormat#CompressingStoredFieldsFormat(String, String, CompressionMode, int, int)
*/ */
public CompressingStoredFieldsFormat(String formatName, CompressionMode compressionMode, int chunkSize) { public CompressingStoredFieldsFormat(String formatName, CompressionMode compressionMode, int chunkSize, int maxDocsPerChunk) {
this(formatName, "", compressionMode, chunkSize); this(formatName, "", compressionMode, chunkSize, maxDocsPerChunk);
} }
/** /**
@@ -79,6 +80,8 @@ public class CompressingStoredFieldsFormat extends StoredFieldsFormat {
* <code>chunkSize</code> is the minimum byte size of a chunk of documents. * <code>chunkSize</code> is the minimum byte size of a chunk of documents.
* A value of <code>1</code> can make sense if there is redundancy across * A value of <code>1</code> can make sense if there is redundancy across
* fields. * fields.
* <code>maxDocsPerChunk</code> is an upperbound on how many docs may be stored
* in a single chunk. This is to bound the cpu costs for highly compressible data.
* <p> * <p>
* Higher values of <code>chunkSize</code> should improve the compression * Higher values of <code>chunkSize</code> should improve the compression
* ratio but will require more memory at indexing time and might make document * ratio but will require more memory at indexing time and might make document
@@ -88,10 +91,11 @@ public class CompressingStoredFieldsFormat extends StoredFieldsFormat {
* @param formatName the name of the {@link StoredFieldsFormat} * @param formatName the name of the {@link StoredFieldsFormat}
* @param compressionMode the {@link CompressionMode} to use * @param compressionMode the {@link CompressionMode} to use
* @param chunkSize the minimum number of bytes of a single chunk of stored documents * @param chunkSize the minimum number of bytes of a single chunk of stored documents
* @param maxDocsPerChunk the maximum number of documents in a single chunk
* @see CompressionMode * @see CompressionMode
*/ */
public CompressingStoredFieldsFormat(String formatName, String segmentSuffix, public CompressingStoredFieldsFormat(String formatName, String segmentSuffix,
CompressionMode compressionMode, int chunkSize) { CompressionMode compressionMode, int chunkSize, int maxDocsPerChunk) {
this.formatName = formatName; this.formatName = formatName;
this.segmentSuffix = segmentSuffix; this.segmentSuffix = segmentSuffix;
this.compressionMode = compressionMode; this.compressionMode = compressionMode;
@@ -99,7 +103,10 @@ public class CompressingStoredFieldsFormat extends StoredFieldsFormat {
throw new IllegalArgumentException("chunkSize must be >= 1"); throw new IllegalArgumentException("chunkSize must be >= 1");
} }
this.chunkSize = chunkSize; this.chunkSize = chunkSize;
if (maxDocsPerChunk < 1) {
throw new IllegalArgumentException("maxDocsPerChunk must be >= 1");
}
this.maxDocsPerChunk = maxDocsPerChunk;
} }
@Override @Override
@@ -113,13 +120,13 @@ public class CompressingStoredFieldsFormat extends StoredFieldsFormat {
public StoredFieldsWriter fieldsWriter(Directory directory, SegmentInfo si, public StoredFieldsWriter fieldsWriter(Directory directory, SegmentInfo si,
IOContext context) throws IOException { IOContext context) throws IOException {
return new CompressingStoredFieldsWriter(directory, si, segmentSuffix, context, return new CompressingStoredFieldsWriter(directory, si, segmentSuffix, context,
formatName, compressionMode, chunkSize); formatName, compressionMode, chunkSize, maxDocsPerChunk);
} }
@Override @Override
public String toString() { public String toString() {
return getClass().getSimpleName() + "(compressionMode=" + compressionMode return getClass().getSimpleName() + "(compressionMode=" + compressionMode
+ ", chunkSize=" + chunkSize + ")"; + ", chunkSize=" + chunkSize + ", maxDocsPerChunk=" + maxDocsPerChunk + ")";
} }
} }

View File

@@ -55,9 +55,6 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
/** Extension of stored fields index file */ /** Extension of stored fields index file */
public static final String FIELDS_INDEX_EXTENSION = "fdx"; public static final String FIELDS_INDEX_EXTENSION = "fdx";
// hard limit on the maximum number of documents per chunk
static final int MAX_DOCUMENTS_PER_CHUNK = 128;
static final int STRING = 0x00; static final int STRING = 0x00;
static final int BYTE_ARR = 0x01; static final int BYTE_ARR = 0x01;
static final int NUMERIC_INT = 0x02; static final int NUMERIC_INT = 0x02;
@@ -82,6 +79,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
private final CompressionMode compressionMode; private final CompressionMode compressionMode;
private final Compressor compressor; private final Compressor compressor;
private final int chunkSize; private final int chunkSize;
private final int maxDocsPerChunk;
private final GrowableByteArrayDataOutput bufferedDocs; private final GrowableByteArrayDataOutput bufferedDocs;
private int[] numStoredFields; // number of stored fields private int[] numStoredFields; // number of stored fields
@@ -91,7 +89,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
/** Sole constructor. */ /** Sole constructor. */
public CompressingStoredFieldsWriter(Directory directory, SegmentInfo si, String segmentSuffix, IOContext context, public CompressingStoredFieldsWriter(Directory directory, SegmentInfo si, String segmentSuffix, IOContext context,
String formatName, CompressionMode compressionMode, int chunkSize) throws IOException { String formatName, CompressionMode compressionMode, int chunkSize, int maxDocsPerChunk) throws IOException {
assert directory != null; assert directory != null;
this.directory = directory; this.directory = directory;
this.segment = si.name; this.segment = si.name;
@@ -99,6 +97,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
this.compressionMode = compressionMode; this.compressionMode = compressionMode;
this.compressor = compressionMode.newCompressor(); this.compressor = compressionMode.newCompressor();
this.chunkSize = chunkSize; this.chunkSize = chunkSize;
this.maxDocsPerChunk = maxDocsPerChunk;
this.docBase = 0; this.docBase = 0;
this.bufferedDocs = new GrowableByteArrayDataOutput(chunkSize); this.bufferedDocs = new GrowableByteArrayDataOutput(chunkSize);
this.numStoredFields = new int[16]; this.numStoredFields = new int[16];
@@ -210,7 +209,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
private boolean triggerFlush() { private boolean triggerFlush() {
return bufferedDocs.length >= chunkSize || // chunks of at least chunkSize bytes return bufferedDocs.length >= chunkSize || // chunks of at least chunkSize bytes
numBufferedDocs >= MAX_DOCUMENTS_PER_CHUNK; numBufferedDocs >= maxDocsPerChunk;
} }
private void flush() throws IOException { private void flush() throws IOException {

View File

@@ -70,7 +70,8 @@ public abstract class CompressionMode {
@Override @Override
public Compressor newCompressor() { public Compressor newCompressor() {
return new DeflateCompressor(Deflater.BEST_COMPRESSION); // 3 is the highest level that doesn't have lazy match evaluation
return new DeflateCompressor(3);
} }
@Override @Override

View File

@@ -118,7 +118,7 @@ public final class Lucene50StoredFieldsFormat extends CompressingStoredFieldsFormat {
/** Sole constructor. */ /** Sole constructor. */
public Lucene50StoredFieldsFormat() { public Lucene50StoredFieldsFormat() {
super("Lucene50StoredFields", CompressionMode.FAST, 1 << 14); super("Lucene50StoredFields", CompressionMode.FAST, 1 << 14, 128);
} }
} }

View File

@@ -36,16 +36,16 @@ public abstract class CompressingCodec extends FilterCodec {
/** /**
* Create a random instance. * Create a random instance.
*/ */
public static CompressingCodec randomInstance(Random random, int chunkSize, boolean withSegmentSuffix) { public static CompressingCodec randomInstance(Random random, int chunkSize, int maxDocsPerChunk, boolean withSegmentSuffix) {
switch (random.nextInt(4)) { switch (random.nextInt(4)) {
case 0: case 0:
return new FastCompressingCodec(chunkSize, withSegmentSuffix); return new FastCompressingCodec(chunkSize, maxDocsPerChunk, withSegmentSuffix);
case 1: case 1:
return new FastDecompressionCompressingCodec(chunkSize, withSegmentSuffix); return new FastDecompressionCompressingCodec(chunkSize, maxDocsPerChunk, withSegmentSuffix);
case 2: case 2:
return new HighCompressionCompressingCodec(chunkSize, withSegmentSuffix); return new HighCompressionCompressingCodec(chunkSize, maxDocsPerChunk, withSegmentSuffix);
case 3: case 3:
return new DummyCompressingCodec(chunkSize, withSegmentSuffix); return new DummyCompressingCodec(chunkSize, maxDocsPerChunk, withSegmentSuffix);
default: default:
throw new AssertionError(); throw new AssertionError();
} }
@@ -56,14 +56,14 @@ public abstract class CompressingCodec extends FilterCodec {
* suffix * suffix
*/ */
public static CompressingCodec randomInstance(Random random) { public static CompressingCodec randomInstance(Random random) {
return randomInstance(random, RandomInts.randomIntBetween(random, 1, 500), false); return randomInstance(random, RandomInts.randomIntBetween(random, 1, 1 << 15), RandomInts.randomIntBetween(random, 64, 1024), false);
} }
/** /**
* Creates a random {@link CompressingCodec} that is using a segment suffix * Creates a random {@link CompressingCodec} that is using a segment suffix
*/ */
public static CompressingCodec randomInstance(Random random, boolean withSegmentSuffix) { public static CompressingCodec randomInstance(Random random, boolean withSegmentSuffix) {
return randomInstance(random, RandomInts.randomIntBetween(random, 1, 500), withSegmentSuffix); return randomInstance(random, RandomInts.randomIntBetween(random, 1, 1 << 15), RandomInts.randomIntBetween(random, 64, 1024), withSegmentSuffix);
} }
private final CompressingStoredFieldsFormat storedFieldsFormat; private final CompressingStoredFieldsFormat storedFieldsFormat;
@@ -72,17 +72,17 @@ public abstract class CompressingCodec extends FilterCodec {
/** /**
* Creates a compressing codec with a given segment suffix * Creates a compressing codec with a given segment suffix
*/ */
public CompressingCodec(String name, String segmentSuffix, CompressionMode compressionMode, int chunkSize) { public CompressingCodec(String name, String segmentSuffix, CompressionMode compressionMode, int chunkSize, int maxDocsPerChunk) {
super(name, TestUtil.getDefaultCodec()); super(name, TestUtil.getDefaultCodec());
this.storedFieldsFormat = new CompressingStoredFieldsFormat(name, segmentSuffix, compressionMode, chunkSize); this.storedFieldsFormat = new CompressingStoredFieldsFormat(name, segmentSuffix, compressionMode, chunkSize, maxDocsPerChunk);
this.termVectorsFormat = new CompressingTermVectorsFormat(name, segmentSuffix, compressionMode, chunkSize); this.termVectorsFormat = new CompressingTermVectorsFormat(name, segmentSuffix, compressionMode, chunkSize);
} }
/** /**
* Creates a compressing codec with an empty segment suffix * Creates a compressing codec with an empty segment suffix
*/ */
public CompressingCodec(String name, CompressionMode compressionMode, int chunkSize) { public CompressingCodec(String name, CompressionMode compressionMode, int chunkSize, int maxDocsPerChunk) {
this(name, "", compressionMode, chunkSize); this(name, "", compressionMode, chunkSize, maxDocsPerChunk);
} }
@Override @Override

View File

@@ -21,14 +21,14 @@ package org.apache.lucene.codecs.compressing;
public class FastCompressingCodec extends CompressingCodec { public class FastCompressingCodec extends CompressingCodec {
/** Constructor that allows to configure the chunk size. */ /** Constructor that allows to configure the chunk size. */
public FastCompressingCodec(int chunkSize, boolean withSegmentSuffix) { public FastCompressingCodec(int chunkSize, int maxDocsPerChunk, boolean withSegmentSuffix) {
super("FastCompressingStoredFields", super("FastCompressingStoredFields",
withSegmentSuffix ? "FastCompressingStoredFields" : "", withSegmentSuffix ? "FastCompressingStoredFields" : "",
CompressionMode.FAST, chunkSize); CompressionMode.FAST, chunkSize, maxDocsPerChunk);
} }
/** Default constructor. */ /** Default constructor. */
public FastCompressingCodec() { public FastCompressingCodec() {
this(1 << 14, false); this(1 << 14, 128, false);
} }
} }

View File

@@ -21,14 +21,14 @@ package org.apache.lucene.codecs.compressing;
public class FastDecompressionCompressingCodec extends CompressingCodec { public class FastDecompressionCompressingCodec extends CompressingCodec {
/** Constructor that allows to configure the chunk size. */ /** Constructor that allows to configure the chunk size. */
public FastDecompressionCompressingCodec(int chunkSize, boolean withSegmentSuffix) { public FastDecompressionCompressingCodec(int chunkSize, int maxDocsPerChunk, boolean withSegmentSuffix) {
super("FastDecompressionCompressingStoredFields", super("FastDecompressionCompressingStoredFields",
withSegmentSuffix ? "FastDecompressionCompressingStoredFields" : "", withSegmentSuffix ? "FastDecompressionCompressingStoredFields" : "",
CompressionMode.FAST_DECOMPRESSION, chunkSize); CompressionMode.FAST_DECOMPRESSION, chunkSize, maxDocsPerChunk);
} }
/** Default constructor. */ /** Default constructor. */
public FastDecompressionCompressingCodec() { public FastDecompressionCompressingCodec() {
this(1 << 14, false); this(1 << 14, 256, false);
} }
} }

View File

@@ -21,14 +21,16 @@ package org.apache.lucene.codecs.compressing;
public class HighCompressionCompressingCodec extends CompressingCodec { public class HighCompressionCompressingCodec extends CompressingCodec {
/** Constructor that allows to configure the chunk size. */ /** Constructor that allows to configure the chunk size. */
public HighCompressionCompressingCodec(int chunkSize, boolean withSegmentSuffix) { public HighCompressionCompressingCodec(int chunkSize, int maxDocsPerChunk, boolean withSegmentSuffix) {
super("HighCompressionCompressingStoredFields", super("HighCompressionCompressingStoredFields",
withSegmentSuffix ? "HighCompressionCompressingStoredFields" : "", withSegmentSuffix ? "HighCompressionCompressingStoredFields" : "",
CompressionMode.HIGH_COMPRESSION, chunkSize); CompressionMode.HIGH_COMPRESSION, chunkSize, maxDocsPerChunk);
} }
/** Default constructor. */ /** Default constructor. */
public HighCompressionCompressingCodec() { public HighCompressionCompressingCodec() {
this(1 << 14, false); // no need to have a higher block length than 32KB since deflate splits
// into blocks of 32KB anyway, and this is a lower bound (try to avoid > 32KB)
this(24576, 512, false);
} }
} }

View File

@@ -83,15 +83,15 @@ public class DummyCompressingCodec extends CompressingCodec {
}; };
/** Constructor that allows to configure the chunk size. */ /** Constructor that allows to configure the chunk size. */
public DummyCompressingCodec(int chunkSize, boolean withSegmentSuffix) { public DummyCompressingCodec(int chunkSize, int maxDocsPerChunk, boolean withSegmentSuffix) {
super("DummyCompressingStoredFields", super("DummyCompressingStoredFields",
withSegmentSuffix ? "DummyCompressingStoredFields" : "", withSegmentSuffix ? "DummyCompressingStoredFields" : "",
DUMMY, chunkSize); DUMMY, chunkSize, maxDocsPerChunk);
} }
/** Default constructor. */ /** Default constructor. */
public DummyCompressingCodec() { public DummyCompressingCodec() {
this(1 << 14, false); this(1 << 14, 128, false);
} }
} }