mirror of https://github.com/apache/lucene.git
LUCENE-5914: More options for stored fields compression
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1643490 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
de4c6513bc
commit
66745d3ce7
|
@ -123,6 +123,9 @@ New Features
|
|||
queries provided that term vectors with positions, offsets, and payloads are present. This is the
|
||||
only highlighter that can highlight such queries accurately. (David Smiley)
|
||||
|
||||
* LUCENE-5914: Add an option to Lucene50Codec to support either BEST_SPEED
|
||||
or BEST_COMPRESSION for stored fields. (Adrien Grand, Robert Muir)
|
||||
|
||||
Optimizations
|
||||
|
||||
* LUCENE-5960: Use a more efficient bitset, not a Set<Integer>, to
|
||||
|
|
|
@ -17,6 +17,8 @@ package org.apache.lucene.codecs.lucene50;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.codecs.CompoundFormat;
|
||||
import org.apache.lucene.codecs.DocValuesFormat;
|
||||
|
@ -28,6 +30,7 @@ import org.apache.lucene.codecs.PostingsFormat;
|
|||
import org.apache.lucene.codecs.SegmentInfoFormat;
|
||||
import org.apache.lucene.codecs.StoredFieldsFormat;
|
||||
import org.apache.lucene.codecs.TermVectorsFormat;
|
||||
import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode;
|
||||
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
|
||||
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
|
||||
|
||||
|
@ -42,7 +45,6 @@ import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
|
|||
* @lucene.experimental
|
||||
*/
|
||||
public class Lucene50Codec extends Codec {
|
||||
private final StoredFieldsFormat fieldsFormat = new Lucene50StoredFieldsFormat();
|
||||
private final TermVectorsFormat vectorsFormat = new Lucene50TermVectorsFormat();
|
||||
private final FieldInfosFormat fieldInfosFormat = new Lucene50FieldInfosFormat();
|
||||
private final SegmentInfoFormat segmentInfosFormat = new Lucene50SegmentInfoFormat();
|
||||
|
@ -63,14 +65,29 @@ public class Lucene50Codec extends Codec {
|
|||
}
|
||||
};
|
||||
|
||||
/** Sole constructor. */
|
||||
private final StoredFieldsFormat storedFieldsFormat;
|
||||
|
||||
/**
|
||||
* Instantiates a new codec.
|
||||
*/
|
||||
public Lucene50Codec() {
|
||||
this(Mode.BEST_SPEED);
|
||||
}
|
||||
|
||||
/**
|
||||
* Instantiates a new codec, specifying the stored fields compression
|
||||
* mode to use.
|
||||
* @param mode stored fields compression mode to use for newly
|
||||
* flushed/merged segments.
|
||||
*/
|
||||
public Lucene50Codec(Mode mode) {
|
||||
super("Lucene50");
|
||||
this.storedFieldsFormat = new Lucene50StoredFieldsFormat(Objects.requireNonNull(mode));
|
||||
}
|
||||
|
||||
@Override
|
||||
public final StoredFieldsFormat storedFieldsFormat() {
|
||||
return fieldsFormat;
|
||||
return storedFieldsFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -106,7 +123,11 @@ public class Lucene50Codec extends Codec {
|
|||
/** Returns the postings format that should be used for writing
|
||||
* new segments of <code>field</code>.
|
||||
*
|
||||
* The default implementation always returns "Lucene50"
|
||||
* The default implementation always returns "Lucene50".
|
||||
* <p>
|
||||
* <b>WARNING:</b> if you subclass, you are responsible for index
|
||||
* backwards compatibility: future versions of Lucene are only
|
||||
* guaranteed to be able to read the default implementation.
|
||||
*/
|
||||
public PostingsFormat getPostingsFormatForField(String field) {
|
||||
return defaultFormat;
|
||||
|
@ -115,7 +136,11 @@ public class Lucene50Codec extends Codec {
|
|||
/** Returns the docvalues format that should be used for writing
|
||||
* new segments of <code>field</code>.
|
||||
*
|
||||
* The default implementation always returns "Lucene50"
|
||||
* The default implementation always returns "Lucene50".
|
||||
* <p>
|
||||
* <b>WARNING:</b> if you subclass, you are responsible for index
|
||||
* backwards compatibility: future versions of Lucene are only
|
||||
* guaranteed to be able to read the default implementation.
|
||||
*/
|
||||
public DocValuesFormat getDocValuesFormatForField(String field) {
|
||||
return defaultDVFormat;
|
||||
|
|
|
@ -17,26 +17,46 @@ package org.apache.lucene.codecs.lucene50;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Objects;
|
||||
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.StoredFieldsFormat;
|
||||
import org.apache.lucene.codecs.StoredFieldsReader;
|
||||
import org.apache.lucene.codecs.StoredFieldsWriter;
|
||||
import org.apache.lucene.codecs.compressing.CompressingStoredFieldsFormat;
|
||||
import org.apache.lucene.codecs.compressing.CompressingStoredFieldsIndexWriter;
|
||||
import org.apache.lucene.codecs.compressing.CompressionMode;
|
||||
import org.apache.lucene.index.FieldInfos;
|
||||
import org.apache.lucene.index.SegmentInfo;
|
||||
import org.apache.lucene.index.StoredFieldVisitor;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.IOContext;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
/**
|
||||
* Lucene 5.0 stored fields format.
|
||||
*
|
||||
* <p><b>Principle</b></p>
|
||||
* <p>This {@link StoredFieldsFormat} compresses blocks of 16KB of documents in
|
||||
* <p>This {@link StoredFieldsFormat} compresses blocks of documents in
|
||||
* order to improve the compression ratio compared to document-level
|
||||
* compression. It uses the <a href="http://code.google.com/p/lz4/">LZ4</a>
|
||||
* compression algorithm, which is fast to compress and very fast to decompress
|
||||
* data. Although the compression method that is used focuses more on speed
|
||||
* than on compression ratio, it should provide interesting compression ratios
|
||||
* for redundant inputs (such as log files, HTML or plain text).</p>
|
||||
* compression algorithm by default in 16KB blocks, which is fast to compress
|
||||
* and very fast to decompress data. Although the default compression method
|
||||
* that is used ({@link Mode#BEST_SPEED BEST_SPEED}) focuses more on speed than on
|
||||
* compression ratio, it should provide interesting compression ratios
|
||||
* for redundant inputs (such as log files, HTML or plain text). For higher
|
||||
* compression, you can choose ({@link Mode#BEST_COMPRESSION BEST_COMPRESSION}), which uses
|
||||
* the <a href="http://en.wikipedia.org/wiki/DEFLATE">DEFLATE</a> algorithm with 24KB blocks
|
||||
* for a better ratio at the expense of slower performance.
|
||||
* These two options can be configured like this: </p>
|
||||
* <pre class="prettyprint">
|
||||
* // the default: for high performance
|
||||
* indexWriterConfig.setCodec(new Lucene50Codec(Mode.BEST_SPEED));
|
||||
* // instead for higher compression (but slower):
|
||||
* // indexWriterConfig.setCodec(new Lucene50Codec(Mode.BEST_COMPRESSION));
|
||||
* </pre>
|
||||
* <p><b>File formats</b></p>
|
||||
* <p>Stored fields are represented by two files:</p>
|
||||
* <ol>
|
||||
|
@ -114,11 +134,58 @@ import org.apache.lucene.util.packed.PackedInts;
|
|||
* larger than (<tt>2<sup>31</sup> - 2<sup>14</sup></tt>) bytes.</p>
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public final class Lucene50StoredFieldsFormat extends CompressingStoredFieldsFormat {
|
||||
public final class Lucene50StoredFieldsFormat extends StoredFieldsFormat {
|
||||
|
||||
/** Sole constructor. */
|
||||
public Lucene50StoredFieldsFormat() {
|
||||
super("Lucene50StoredFields", CompressionMode.FAST, 1 << 14, 128);
|
||||
/** Configuration option for stored fields. */
|
||||
public static enum Mode {
|
||||
/** Trade compression ratio for retrieval speed. */
|
||||
BEST_SPEED,
|
||||
/** Trade retrieval speed for compression ratio. */
|
||||
BEST_COMPRESSION
|
||||
}
|
||||
|
||||
/** Attribute key for compression mode. */
|
||||
public static final String MODE_KEY = Lucene50StoredFieldsFormat.class.getSimpleName() + ".mode";
|
||||
|
||||
final Mode mode;
|
||||
|
||||
/** Stored fields format with default options */
|
||||
public Lucene50StoredFieldsFormat() {
|
||||
this(Mode.BEST_SPEED);
|
||||
}
|
||||
|
||||
/** Stored fields format with specified mode */
|
||||
public Lucene50StoredFieldsFormat(Mode mode) {
|
||||
this.mode = Objects.requireNonNull(mode);
|
||||
}
|
||||
|
||||
@Override
|
||||
public StoredFieldsReader fieldsReader(Directory directory, SegmentInfo si, FieldInfos fn, IOContext context) throws IOException {
|
||||
String value = si.getAttribute(MODE_KEY);
|
||||
if (value == null) {
|
||||
throw new IllegalStateException("missing value for " + MODE_KEY + " for segment: " + si.name);
|
||||
}
|
||||
Mode mode = Mode.valueOf(value);
|
||||
return impl(mode).fieldsReader(directory, si, fn, context);
|
||||
}
|
||||
|
||||
@Override
|
||||
public StoredFieldsWriter fieldsWriter(Directory directory, SegmentInfo si, IOContext context) throws IOException {
|
||||
String previous = si.putAttribute(MODE_KEY, mode.name());
|
||||
if (previous != null) {
|
||||
throw new IllegalStateException("found existing value for " + MODE_KEY + " for segment: " + si.name +
|
||||
"old=" + previous + ", new=" + mode.name());
|
||||
}
|
||||
return impl(mode).fieldsWriter(directory, si, context);
|
||||
}
|
||||
|
||||
StoredFieldsFormat impl(Mode mode) {
|
||||
switch (mode) {
|
||||
case BEST_SPEED:
|
||||
return new CompressingStoredFieldsFormat("Lucene50StoredFieldsFast", CompressionMode.FAST, 1 << 14, 128);
|
||||
case BEST_COMPRESSION:
|
||||
return new CompressingStoredFieldsFormat("Lucene50StoredFieldsHigh", CompressionMode.HIGH_COMPRESSION, 24576, 512);
|
||||
default: throw new AssertionError();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,81 @@
|
|||
package org.apache.lucene.codecs.lucene50;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.StoredField;
|
||||
import org.apache.lucene.index.BaseStoredFieldsFormatTestCase;
|
||||
import org.apache.lucene.index.DirectoryReader;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.IndexWriterConfig;
|
||||
import org.apache.lucene.store.Directory;
|
||||
|
||||
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
|
||||
|
||||
public class TestLucene50StoredFieldsFormatHighCompression extends BaseStoredFieldsFormatTestCase {
|
||||
@Override
|
||||
protected Codec getCodec() {
|
||||
return new Lucene50Codec(Mode.BEST_COMPRESSION);
|
||||
}
|
||||
|
||||
/**
|
||||
* Change compression params (leaving it the same for old segments)
|
||||
* and tests that nothing breaks.
|
||||
*/
|
||||
public void testMixedCompressions() throws Exception {
|
||||
Directory dir = newDirectory();
|
||||
for (int i = 0; i < 10; i++) {
|
||||
IndexWriterConfig iwc = newIndexWriterConfig();
|
||||
iwc.setCodec(new Lucene50Codec(RandomPicks.randomFrom(random(), Mode.values())));
|
||||
IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig());
|
||||
Document doc = new Document();
|
||||
doc.add(new StoredField("field1", "value1"));
|
||||
doc.add(new StoredField("field2", "value2"));
|
||||
iw.addDocument(doc);
|
||||
if (random().nextInt(4) == 0) {
|
||||
iw.forceMerge(1);
|
||||
}
|
||||
iw.commit();
|
||||
iw.close();
|
||||
}
|
||||
|
||||
DirectoryReader ir = DirectoryReader.open(dir);
|
||||
assertEquals(10, ir.numDocs());
|
||||
ir.close();
|
||||
// checkindex
|
||||
dir.close();
|
||||
}
|
||||
|
||||
public void testInvalidOptions() throws Exception {
|
||||
try {
|
||||
new Lucene50Codec(null);
|
||||
fail("didn't hit exception");
|
||||
} catch (NullPointerException expected) {
|
||||
// expected
|
||||
}
|
||||
|
||||
try {
|
||||
new Lucene50StoredFieldsFormat(null);
|
||||
fail("didn't hit exception");
|
||||
} catch (NullPointerException expected) {
|
||||
// expected
|
||||
}
|
||||
}
|
||||
}
|
|
@ -36,6 +36,9 @@ import org.apache.lucene.codecs.asserting.AssertingDocValuesFormat;
|
|||
import org.apache.lucene.codecs.asserting.AssertingPostingsFormat;
|
||||
import org.apache.lucene.codecs.cheapbastard.CheapBastardCodec;
|
||||
import org.apache.lucene.codecs.compressing.CompressingCodec;
|
||||
import org.apache.lucene.codecs.lucene50.Lucene50Codec;
|
||||
import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat;
|
||||
import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode;
|
||||
import org.apache.lucene.codecs.mockrandom.MockRandomPostingsFormat;
|
||||
import org.apache.lucene.codecs.simpletext.SimpleTextCodec;
|
||||
import org.apache.lucene.index.RandomCodec;
|
||||
|
@ -44,7 +47,9 @@ import org.apache.lucene.search.similarities.DefaultSimilarity;
|
|||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
|
||||
import org.junit.internal.AssumptionViolatedException;
|
||||
|
||||
import com.carrotsearch.randomizedtesting.RandomizedContext;
|
||||
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
|
||||
|
||||
import static org.apache.lucene.util.LuceneTestCase.INFOSTREAM;
|
||||
import static org.apache.lucene.util.LuceneTestCase.LiveIWCFlushMode;
|
||||
|
@ -198,6 +203,8 @@ final class TestRuleSetupAndRestoreClassEnv extends AbstractBeforeAfterRule {
|
|||
codec = new AssertingCodec();
|
||||
} else if ("Compressing".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && randomVal == 6 && !shouldAvoidCodec("Compressing"))) {
|
||||
codec = CompressingCodec.randomInstance(random);
|
||||
} else if ("Lucene50".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && randomVal == 5 && !shouldAvoidCodec("Lucene50"))) {
|
||||
codec = new Lucene50Codec(RandomPicks.randomFrom(random, Lucene50StoredFieldsFormat.Mode.values()));
|
||||
} else if (!"random".equals(TEST_CODEC)) {
|
||||
codec = Codec.forName(TEST_CODEC);
|
||||
} else if ("random".equals(TEST_POSTINGSFORMAT)) {
|
||||
|
|
Loading…
Reference in New Issue