From 834c63b9323e20da1b647edf22f323271df0a596 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Tue, 2 Dec 2014 18:20:03 +0000 Subject: [PATCH] LUCENE-6089: Tune CompressionMode.HIGH_COMPRESSION git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1642981 13f79535-47bb-0310-9956-ffa450edef68 --- .../CompressingStoredFieldsFormat.java | 21 ++++++++++++------ .../CompressingStoredFieldsWriter.java | 9 ++++---- .../codecs/compressing/CompressionMode.java | 3 ++- .../lucene50/Lucene50StoredFieldsFormat.java | 2 +- .../codecs/compressing/CompressingCodec.java | 22 +++++++++---------- .../compressing/FastCompressingCodec.java | 6 ++--- .../FastDecompressionCompressingCodec.java | 6 ++--- .../HighCompressionCompressingCodec.java | 8 ++++--- .../dummy/DummyCompressingCodec.java | 6 ++--- 9 files changed, 46 insertions(+), 37 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsFormat.java index 9aad0b77888..d9f14b028a9 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsFormat.java @@ -48,15 +48,16 @@ public class CompressingStoredFieldsFormat extends StoredFieldsFormat { private final String segmentSuffix; private final CompressionMode compressionMode; private final int chunkSize; + private final int maxDocsPerChunk; /** * Create a new {@link CompressingStoredFieldsFormat} with an empty segment * suffix. * - * @see CompressingStoredFieldsFormat#CompressingStoredFieldsFormat(String, String, CompressionMode, int) + * @see CompressingStoredFieldsFormat#CompressingStoredFieldsFormat(String, String, CompressionMode, int, int) */ - public CompressingStoredFieldsFormat(String formatName, CompressionMode compressionMode, int chunkSize) { - this(formatName, "", compressionMode, chunkSize); + public CompressingStoredFieldsFormat(String formatName, CompressionMode compressionMode, int chunkSize, int maxDocsPerChunk) { + this(formatName, "", compressionMode, chunkSize, maxDocsPerChunk); } /** @@ -79,6 +80,8 @@ public class CompressingStoredFieldsFormat extends StoredFieldsFormat { * chunkSize is the minimum byte size of a chunk of documents. * A value of 1 can make sense if there is redundancy across * fields. + * maxDocsPerChunk is an upperbound on how many docs may be stored + * in a single chunk. This is to bound the cpu costs for highly compressible data. *

* Higher values of chunkSize should improve the compression * ratio but will require more memory at indexing time and might make document @@ -88,10 +91,11 @@ public class CompressingStoredFieldsFormat extends StoredFieldsFormat { * @param formatName the name of the {@link StoredFieldsFormat} * @param compressionMode the {@link CompressionMode} to use * @param chunkSize the minimum number of bytes of a single chunk of stored documents + * @param maxDocsPerChunk the maximum number of documents in a single chunk * @see CompressionMode */ public CompressingStoredFieldsFormat(String formatName, String segmentSuffix, - CompressionMode compressionMode, int chunkSize) { + CompressionMode compressionMode, int chunkSize, int maxDocsPerChunk) { this.formatName = formatName; this.segmentSuffix = segmentSuffix; this.compressionMode = compressionMode; @@ -99,7 +103,10 @@ public class CompressingStoredFieldsFormat extends StoredFieldsFormat { throw new IllegalArgumentException("chunkSize must be >= 1"); } this.chunkSize = chunkSize; - + if (maxDocsPerChunk < 1) { + throw new IllegalArgumentException("maxDocsPerChunk must be >= 1"); + } + this.maxDocsPerChunk = maxDocsPerChunk; } @Override @@ -113,13 +120,13 @@ public class CompressingStoredFieldsFormat extends StoredFieldsFormat { public StoredFieldsWriter fieldsWriter(Directory directory, SegmentInfo si, IOContext context) throws IOException { return new CompressingStoredFieldsWriter(directory, si, segmentSuffix, context, - formatName, compressionMode, chunkSize); + formatName, compressionMode, chunkSize, maxDocsPerChunk); } @Override public String toString() { return getClass().getSimpleName() + "(compressionMode=" + compressionMode - + ", chunkSize=" + chunkSize + ")"; + + ", chunkSize=" + chunkSize + ", maxDocsPerChunk=" + maxDocsPerChunk + ")"; } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java index d193e332940..2aa507ef171 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java @@ -54,9 +54,6 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter { /** Extension of stored fields index file */ public static final String FIELDS_INDEX_EXTENSION = "fdx"; - - // hard limit on the maximum number of documents per chunk - static final int MAX_DOCUMENTS_PER_CHUNK = 128; static final int STRING = 0x00; static final int BYTE_ARR = 0x01; @@ -82,6 +79,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter { private final CompressionMode compressionMode; private final Compressor compressor; private final int chunkSize; + private final int maxDocsPerChunk; private final GrowableByteArrayDataOutput bufferedDocs; private int[] numStoredFields; // number of stored fields @@ -91,7 +89,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter { /** Sole constructor. */ public CompressingStoredFieldsWriter(Directory directory, SegmentInfo si, String segmentSuffix, IOContext context, - String formatName, CompressionMode compressionMode, int chunkSize) throws IOException { + String formatName, CompressionMode compressionMode, int chunkSize, int maxDocsPerChunk) throws IOException { assert directory != null; this.directory = directory; this.segment = si.name; @@ -99,6 +97,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter { this.compressionMode = compressionMode; this.compressor = compressionMode.newCompressor(); this.chunkSize = chunkSize; + this.maxDocsPerChunk = maxDocsPerChunk; this.docBase = 0; this.bufferedDocs = new GrowableByteArrayDataOutput(chunkSize); this.numStoredFields = new int[16]; @@ -210,7 +209,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter { private boolean triggerFlush() { return bufferedDocs.length >= chunkSize || // chunks of at least chunkSize bytes - numBufferedDocs >= MAX_DOCUMENTS_PER_CHUNK; + numBufferedDocs >= maxDocsPerChunk; } private void flush() throws IOException { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressionMode.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressionMode.java index baa31e1a6ea..600fd3c1ec6 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressionMode.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressionMode.java @@ -70,7 +70,8 @@ public abstract class CompressionMode { @Override public Compressor newCompressor() { - return new DeflateCompressor(Deflater.BEST_COMPRESSION); + // 3 is the highest level that doesn't have lazy match evaluation + return new DeflateCompressor(3); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50StoredFieldsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50StoredFieldsFormat.java index de6382ebd43..04663c49956 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50StoredFieldsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50StoredFieldsFormat.java @@ -118,7 +118,7 @@ public final class Lucene50StoredFieldsFormat extends CompressingStoredFieldsFor /** Sole constructor. */ public Lucene50StoredFieldsFormat() { - super("Lucene50StoredFields", CompressionMode.FAST, 1 << 14); + super("Lucene50StoredFields", CompressionMode.FAST, 1 << 14, 128); } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/CompressingCodec.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/CompressingCodec.java index dd3c9335df7..2eba65e6902 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/CompressingCodec.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/CompressingCodec.java @@ -36,16 +36,16 @@ public abstract class CompressingCodec extends FilterCodec { /** * Create a random instance. */ - public static CompressingCodec randomInstance(Random random, int chunkSize, boolean withSegmentSuffix) { + public static CompressingCodec randomInstance(Random random, int chunkSize, int maxDocsPerChunk, boolean withSegmentSuffix) { switch (random.nextInt(4)) { case 0: - return new FastCompressingCodec(chunkSize, withSegmentSuffix); + return new FastCompressingCodec(chunkSize, maxDocsPerChunk, withSegmentSuffix); case 1: - return new FastDecompressionCompressingCodec(chunkSize, withSegmentSuffix); + return new FastDecompressionCompressingCodec(chunkSize, maxDocsPerChunk, withSegmentSuffix); case 2: - return new HighCompressionCompressingCodec(chunkSize, withSegmentSuffix); + return new HighCompressionCompressingCodec(chunkSize, maxDocsPerChunk, withSegmentSuffix); case 3: - return new DummyCompressingCodec(chunkSize, withSegmentSuffix); + return new DummyCompressingCodec(chunkSize, maxDocsPerChunk, withSegmentSuffix); default: throw new AssertionError(); } @@ -56,14 +56,14 @@ public abstract class CompressingCodec extends FilterCodec { * suffix */ public static CompressingCodec randomInstance(Random random) { - return randomInstance(random, RandomInts.randomIntBetween(random, 1, 500), false); + return randomInstance(random, RandomInts.randomIntBetween(random, 1, 1 << 15), RandomInts.randomIntBetween(random, 64, 1024), false); } /** * Creates a random {@link CompressingCodec} that is using a segment suffix */ public static CompressingCodec randomInstance(Random random, boolean withSegmentSuffix) { - return randomInstance(random, RandomInts.randomIntBetween(random, 1, 500), withSegmentSuffix); + return randomInstance(random, RandomInts.randomIntBetween(random, 1, 1 << 15), RandomInts.randomIntBetween(random, 64, 1024), withSegmentSuffix); } private final CompressingStoredFieldsFormat storedFieldsFormat; @@ -72,17 +72,17 @@ public abstract class CompressingCodec extends FilterCodec { /** * Creates a compressing codec with a given segment suffix */ - public CompressingCodec(String name, String segmentSuffix, CompressionMode compressionMode, int chunkSize) { + public CompressingCodec(String name, String segmentSuffix, CompressionMode compressionMode, int chunkSize, int maxDocsPerChunk) { super(name, TestUtil.getDefaultCodec()); - this.storedFieldsFormat = new CompressingStoredFieldsFormat(name, segmentSuffix, compressionMode, chunkSize); + this.storedFieldsFormat = new CompressingStoredFieldsFormat(name, segmentSuffix, compressionMode, chunkSize, maxDocsPerChunk); this.termVectorsFormat = new CompressingTermVectorsFormat(name, segmentSuffix, compressionMode, chunkSize); } /** * Creates a compressing codec with an empty segment suffix */ - public CompressingCodec(String name, CompressionMode compressionMode, int chunkSize) { - this(name, "", compressionMode, chunkSize); + public CompressingCodec(String name, CompressionMode compressionMode, int chunkSize, int maxDocsPerChunk) { + this(name, "", compressionMode, chunkSize, maxDocsPerChunk); } @Override diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/FastCompressingCodec.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/FastCompressingCodec.java index c8e2d06c991..80c46f5d203 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/FastCompressingCodec.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/FastCompressingCodec.java @@ -21,14 +21,14 @@ package org.apache.lucene.codecs.compressing; public class FastCompressingCodec extends CompressingCodec { /** Constructor that allows to configure the chunk size. */ - public FastCompressingCodec(int chunkSize, boolean withSegmentSuffix) { + public FastCompressingCodec(int chunkSize, int maxDocsPerChunk, boolean withSegmentSuffix) { super("FastCompressingStoredFields", withSegmentSuffix ? "FastCompressingStoredFields" : "", - CompressionMode.FAST, chunkSize); + CompressionMode.FAST, chunkSize, maxDocsPerChunk); } /** Default constructor. */ public FastCompressingCodec() { - this(1 << 14, false); + this(1 << 14, 128, false); } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/FastDecompressionCompressingCodec.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/FastDecompressionCompressingCodec.java index 6b9ae054f2c..403eea317a7 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/FastDecompressionCompressingCodec.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/FastDecompressionCompressingCodec.java @@ -21,14 +21,14 @@ package org.apache.lucene.codecs.compressing; public class FastDecompressionCompressingCodec extends CompressingCodec { /** Constructor that allows to configure the chunk size. */ - public FastDecompressionCompressingCodec(int chunkSize, boolean withSegmentSuffix) { + public FastDecompressionCompressingCodec(int chunkSize, int maxDocsPerChunk, boolean withSegmentSuffix) { super("FastDecompressionCompressingStoredFields", withSegmentSuffix ? "FastDecompressionCompressingStoredFields" : "", - CompressionMode.FAST_DECOMPRESSION, chunkSize); + CompressionMode.FAST_DECOMPRESSION, chunkSize, maxDocsPerChunk); } /** Default constructor. */ public FastDecompressionCompressingCodec() { - this(1 << 14, false); + this(1 << 14, 256, false); } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/HighCompressionCompressingCodec.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/HighCompressionCompressingCodec.java index 532122c30e0..5f369bff5c2 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/HighCompressionCompressingCodec.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/HighCompressionCompressingCodec.java @@ -21,14 +21,16 @@ package org.apache.lucene.codecs.compressing; public class HighCompressionCompressingCodec extends CompressingCodec { /** Constructor that allows to configure the chunk size. */ - public HighCompressionCompressingCodec(int chunkSize, boolean withSegmentSuffix) { + public HighCompressionCompressingCodec(int chunkSize, int maxDocsPerChunk, boolean withSegmentSuffix) { super("HighCompressionCompressingStoredFields", withSegmentSuffix ? "HighCompressionCompressingStoredFields" : "", - CompressionMode.HIGH_COMPRESSION, chunkSize); + CompressionMode.HIGH_COMPRESSION, chunkSize, maxDocsPerChunk); } /** Default constructor. */ public HighCompressionCompressingCodec() { - this(1 << 14, false); + // no need to have a higher block length than 32KB since deflate splits + // into blocks of 32KB anyway, and this is a lower bound (try to avoid > 32KB) + this(24576, 512, false); } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/dummy/DummyCompressingCodec.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/dummy/DummyCompressingCodec.java index 9e1649dc79e..9a887940d1f 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/dummy/DummyCompressingCodec.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/dummy/DummyCompressingCodec.java @@ -83,15 +83,15 @@ public class DummyCompressingCodec extends CompressingCodec { }; /** Constructor that allows to configure the chunk size. */ - public DummyCompressingCodec(int chunkSize, boolean withSegmentSuffix) { + public DummyCompressingCodec(int chunkSize, int maxDocsPerChunk, boolean withSegmentSuffix) { super("DummyCompressingStoredFields", withSegmentSuffix ? "DummyCompressingStoredFields" : "", - DUMMY, chunkSize); + DUMMY, chunkSize, maxDocsPerChunk); } /** Default constructor. */ public DummyCompressingCodec() { - this(1 << 14, false); + this(1 << 14, 128, false); } }