LUCENE-5914: More options for stored fields compression

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1643490 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2014-12-06 03:30:02 +00:00
parent de4c6513bc
commit 66745d3ce7
5 changed files with 197 additions and 14 deletions

View File

@ -123,6 +123,9 @@ New Features
queries provided that term vectors with positions, offsets, and payloads are present. This is the
only highlighter that can highlight such queries accurately. (David Smiley)
* LUCENE-5914: Add an option to Lucene50Codec to support either BEST_SPEED
or BEST_COMPRESSION for stored fields. (Adrien Grand, Robert Muir)
Optimizations
* LUCENE-5960: Use a more efficient bitset, not a Set<Integer>, to

View File

@ -17,6 +17,8 @@ package org.apache.lucene.codecs.lucene50;
* limitations under the License.
*/
import java.util.Objects;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.CompoundFormat;
import org.apache.lucene.codecs.DocValuesFormat;
@ -28,6 +30,7 @@ import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode;
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
@ -42,7 +45,6 @@ import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
* @lucene.experimental
*/
public class Lucene50Codec extends Codec {
private final StoredFieldsFormat fieldsFormat = new Lucene50StoredFieldsFormat();
private final TermVectorsFormat vectorsFormat = new Lucene50TermVectorsFormat();
private final FieldInfosFormat fieldInfosFormat = new Lucene50FieldInfosFormat();
private final SegmentInfoFormat segmentInfosFormat = new Lucene50SegmentInfoFormat();
@ -63,14 +65,29 @@ public class Lucene50Codec extends Codec {
}
};
/** Sole constructor. */
private final StoredFieldsFormat storedFieldsFormat;
/**
* Instantiates a new codec.
*/
public Lucene50Codec() {
this(Mode.BEST_SPEED);
}
/**
* Instantiates a new codec, specifying the stored fields compression
* mode to use.
* @param mode stored fields compression mode to use for newly
* flushed/merged segments.
*/
public Lucene50Codec(Mode mode) {
super("Lucene50");
this.storedFieldsFormat = new Lucene50StoredFieldsFormat(Objects.requireNonNull(mode));
}
@Override
public final StoredFieldsFormat storedFieldsFormat() {
return fieldsFormat;
return storedFieldsFormat;
}
@Override
@ -106,7 +123,11 @@ public class Lucene50Codec extends Codec {
/** Returns the postings format that should be used for writing
* new segments of <code>field</code>.
*
* The default implementation always returns "Lucene50"
* The default implementation always returns "Lucene50".
* <p>
* <b>WARNING:</b> if you subclass, you are responsible for index
* backwards compatibility: future version of Lucene are only
* guaranteed to be able to read the default implementation.
*/
public PostingsFormat getPostingsFormatForField(String field) {
return defaultFormat;
@ -115,7 +136,11 @@ public class Lucene50Codec extends Codec {
/** Returns the docvalues format that should be used for writing
* new segments of <code>field</code>.
*
* The default implementation always returns "Lucene50"
* The default implementation always returns "Lucene50".
* <p>
* <b>WARNING:</b> if you subclass, you are responsible for index
* backwards compatibility: future version of Lucene are only
* guaranteed to be able to read the default implementation.
*/
public DocValuesFormat getDocValuesFormatForField(String field) {
return defaultDVFormat;

View File

@ -17,26 +17,46 @@ package org.apache.lucene.codecs.lucene50;
* limitations under the License.
*/
import java.io.IOException;
import java.util.Objects;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.StoredFieldsReader;
import org.apache.lucene.codecs.StoredFieldsWriter;
import org.apache.lucene.codecs.compressing.CompressingStoredFieldsFormat;
import org.apache.lucene.codecs.compressing.CompressingStoredFieldsIndexWriter;
import org.apache.lucene.codecs.compressing.CompressionMode;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.util.packed.PackedInts;
/**
* Lucene 5.0 stored fields format.
*
* <p><b>Principle</b></p>
* <p>This {@link StoredFieldsFormat} compresses blocks of 16KB of documents in
* <p>This {@link StoredFieldsFormat} compresses blocks of documents in
* order to improve the compression ratio compared to document-level
* compression. It uses the <a href="http://code.google.com/p/lz4/">LZ4</a>
* compression algorithm, which is fast to compress and very fast to decompress
* data. Although the compression method that is used focuses more on speed
* than on compression ratio, it should provide interesting compression ratios
* for redundant inputs (such as log files, HTML or plain text).</p>
* compression algorithm by default in 16KB blocks, which is fast to compress
* and very fast to decompress data. Although the default compression method
* that is used ({@link Mode#BEST_SPEED BEST_SPEED}) focuses more on speed than on
* compression ratio, it should provide interesting compression ratios
* for redundant inputs (such as log files, HTML or plain text). For higher
* compression, you can choose ({@link Mode#BEST_COMPRESSION BEST_COMPRESSION}), which uses
* the <a href="http://en.wikipedia.org/wiki/DEFLATE">DEFLATE</a> algorithm with 24KB blocks
* for a better ratio at the expense of slower performance.
* These two options can be configured like this: </p>
* <pre class="prettyprint">
* // the default: for high performance
* indexWriterConfig.setCodec(new Lucene50Codec(Mode.BEST_SPEED));
* // instead for higher performance (but slower):
* // indexWriterConfig.setCodec(new Lucene50Codec(Mode.BEST_COMPRESSION));
* </pre>
* <p><b>File formats</b></p>
* <p>Stored fields are represented by two files:</p>
* <ol>
@ -114,11 +134,58 @@ import org.apache.lucene.util.packed.PackedInts;
* larger than (<tt>2<sup>31</sup> - 2<sup>14</sup></tt>) bytes.</p>
* @lucene.experimental
*/
public final class Lucene50StoredFieldsFormat extends CompressingStoredFieldsFormat {
public final class Lucene50StoredFieldsFormat extends StoredFieldsFormat {
/** Sole constructor. */
public Lucene50StoredFieldsFormat() {
super("Lucene50StoredFields", CompressionMode.FAST, 1 << 14, 128);
/** Configuration option for stored fields. */
public static enum Mode {
/** Trade compression ratio for retrieval speed. */
BEST_SPEED,
/** Trade retrieval speed for compression ratio. */
BEST_COMPRESSION
}
/** Attribute key for compression mode. */
public static final String MODE_KEY = Lucene50StoredFieldsFormat.class.getSimpleName() + ".mode";
final Mode mode;
/** Stored fields format with default options */
public Lucene50StoredFieldsFormat() {
this(Mode.BEST_SPEED);
}
/** Stored fields format with specified mode */
public Lucene50StoredFieldsFormat(Mode mode) {
this.mode = Objects.requireNonNull(mode);
}
@Override
public StoredFieldsReader fieldsReader(Directory directory, SegmentInfo si, FieldInfos fn, IOContext context) throws IOException {
String value = si.getAttribute(MODE_KEY);
if (value == null) {
throw new IllegalStateException("missing value for " + MODE_KEY + " for segment: " + si.name);
}
Mode mode = Mode.valueOf(value);
return impl(mode).fieldsReader(directory, si, fn, context);
}
@Override
public StoredFieldsWriter fieldsWriter(Directory directory, SegmentInfo si, IOContext context) throws IOException {
String previous = si.putAttribute(MODE_KEY, mode.name());
if (previous != null) {
throw new IllegalStateException("found existing value for " + MODE_KEY + " for segment: " + si.name +
"old=" + previous + ", new=" + mode.name());
}
return impl(mode).fieldsWriter(directory, si, context);
}
StoredFieldsFormat impl(Mode mode) {
switch (mode) {
case BEST_SPEED:
return new CompressingStoredFieldsFormat("Lucene50StoredFieldsFast", CompressionMode.FAST, 1 << 14, 128);
case BEST_COMPRESSION:
return new CompressingStoredFieldsFormat("Lucene50StoredFieldsHigh", CompressionMode.HIGH_COMPRESSION, 24576, 512);
default: throw new AssertionError();
}
}
}

View File

@ -0,0 +1,81 @@
package org.apache.lucene.codecs.lucene50;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.index.BaseStoredFieldsFormatTestCase;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
public class TestLucene50StoredFieldsFormatHighCompression extends BaseStoredFieldsFormatTestCase {
@Override
protected Codec getCodec() {
return new Lucene50Codec(Mode.BEST_COMPRESSION);
}
/**
* Change compression params (leaving it the same for old segments)
* and tests that nothing breaks.
*/
public void testMixedCompressions() throws Exception {
Directory dir = newDirectory();
for (int i = 0; i < 10; i++) {
IndexWriterConfig iwc = newIndexWriterConfig();
iwc.setCodec(new Lucene50Codec(RandomPicks.randomFrom(random(), Mode.values())));
IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig());
Document doc = new Document();
doc.add(new StoredField("field1", "value1"));
doc.add(new StoredField("field2", "value2"));
iw.addDocument(doc);
if (random().nextInt(4) == 0) {
iw.forceMerge(1);
}
iw.commit();
iw.close();
}
DirectoryReader ir = DirectoryReader.open(dir);
assertEquals(10, ir.numDocs());
ir.close();
// checkindex
dir.close();
}
public void testInvalidOptions() throws Exception {
try {
new Lucene50Codec(null);
fail("didn't hit exception");
} catch (NullPointerException expected) {
// expected
}
try {
new Lucene50StoredFieldsFormat(null);
fail("didn't hit exception");
} catch (NullPointerException expected) {
// expected
}
}
}

View File

@ -36,6 +36,9 @@ import org.apache.lucene.codecs.asserting.AssertingDocValuesFormat;
import org.apache.lucene.codecs.asserting.AssertingPostingsFormat;
import org.apache.lucene.codecs.cheapbastard.CheapBastardCodec;
import org.apache.lucene.codecs.compressing.CompressingCodec;
import org.apache.lucene.codecs.lucene50.Lucene50Codec;
import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode;
import org.apache.lucene.codecs.mockrandom.MockRandomPostingsFormat;
import org.apache.lucene.codecs.simpletext.SimpleTextCodec;
import org.apache.lucene.index.RandomCodec;
@ -44,7 +47,9 @@ import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
import org.junit.internal.AssumptionViolatedException;
import com.carrotsearch.randomizedtesting.RandomizedContext;
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
import static org.apache.lucene.util.LuceneTestCase.INFOSTREAM;
import static org.apache.lucene.util.LuceneTestCase.LiveIWCFlushMode;
@ -198,6 +203,8 @@ final class TestRuleSetupAndRestoreClassEnv extends AbstractBeforeAfterRule {
codec = new AssertingCodec();
} else if ("Compressing".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && randomVal == 6 && !shouldAvoidCodec("Compressing"))) {
codec = CompressingCodec.randomInstance(random);
} else if ("Lucene50".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && randomVal == 5 && !shouldAvoidCodec("Lucene50"))) {
codec = new Lucene50Codec(RandomPicks.randomFrom(random, Lucene50StoredFieldsFormat.Mode.values()));
} else if (!"random".equals(TEST_CODEC)) {
codec = Codec.forName(TEST_CODEC);
} else if ("random".equals(TEST_POSTINGSFORMAT)) {