From 66745d3ce7f4658155d31220c07f351d98e153df Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Sat, 6 Dec 2014 03:30:02 +0000 Subject: [PATCH] LUCENE-5914: More options for stored fields compression git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1643490 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 3 + .../lucene/codecs/lucene50/Lucene50Codec.java | 35 ++++++-- .../lucene50/Lucene50StoredFieldsFormat.java | 85 +++++++++++++++++-- ...ne50StoredFieldsFormatHighCompression.java | 81 ++++++++++++++++++ .../util/TestRuleSetupAndRestoreClassEnv.java | 7 ++ 5 files changed, 197 insertions(+), 14 deletions(-) create mode 100644 lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50StoredFieldsFormatHighCompression.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 3c116065a7f..6ab602f2a75 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -123,6 +123,9 @@ New Features queries provided that term vectors with positions, offsets, and payloads are present. This is the only highlighter that can highlight such queries accurately. (David Smiley) +* LUCENE-5914: Add an option to Lucene50Codec to support either BEST_SPEED + or BEST_COMPRESSION for stored fields. (Adrien Grand, Robert Muir) + Optimizations * LUCENE-5960: Use a more efficient bitset, not a Set, to diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50Codec.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50Codec.java index 80417ef707e..a9aa5448201 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50Codec.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50Codec.java @@ -17,6 +17,8 @@ package org.apache.lucene.codecs.lucene50; * limitations under the License. 
*/ +import java.util.Objects; + import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.CompoundFormat; import org.apache.lucene.codecs.DocValuesFormat; @@ -28,6 +30,7 @@ import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.SegmentInfoFormat; import org.apache.lucene.codecs.StoredFieldsFormat; import org.apache.lucene.codecs.TermVectorsFormat; +import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode; import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat; import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; @@ -42,7 +45,6 @@ import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; * @lucene.experimental */ public class Lucene50Codec extends Codec { - private final StoredFieldsFormat fieldsFormat = new Lucene50StoredFieldsFormat(); private final TermVectorsFormat vectorsFormat = new Lucene50TermVectorsFormat(); private final FieldInfosFormat fieldInfosFormat = new Lucene50FieldInfosFormat(); private final SegmentInfoFormat segmentInfosFormat = new Lucene50SegmentInfoFormat(); @@ -62,15 +64,30 @@ public class Lucene50Codec extends Codec { return Lucene50Codec.this.getDocValuesFormatForField(field); } }; + + private final StoredFieldsFormat storedFieldsFormat; - /** Sole constructor. */ + /** + * Instantiates a new codec. + */ public Lucene50Codec() { + this(Mode.BEST_SPEED); + } + + /** + * Instantiates a new codec, specifying the stored fields compression + * mode to use. + * @param mode stored fields compression mode to use for newly + * flushed/merged segments. 
+ */ + public Lucene50Codec(Mode mode) { super("Lucene50"); + this.storedFieldsFormat = new Lucene50StoredFieldsFormat(Objects.requireNonNull(mode)); } @Override public final StoredFieldsFormat storedFieldsFormat() { - return fieldsFormat; + return storedFieldsFormat; } @Override @@ -106,7 +123,11 @@ public class Lucene50Codec extends Codec { /** Returns the postings format that should be used for writing * new segments of field. * - * The default implementation always returns "Lucene50" + * The default implementation always returns "Lucene50". + *

+ * WARNING: if you subclass, you are responsible for index + * backwards compatibility: future versions of Lucene are only + * guaranteed to be able to read the default implementation. */ public PostingsFormat getPostingsFormatForField(String field) { return defaultFormat; @@ -115,7 +136,11 @@ public class Lucene50Codec extends Codec { /** Returns the docvalues format that should be used for writing * new segments of field. * - * The default implementation always returns "Lucene50" + * The default implementation always returns "Lucene50". + *

+ * WARNING: if you subclass, you are responsible for index + * backwards compatibility: future versions of Lucene are only + * guaranteed to be able to read the default implementation. */ public DocValuesFormat getDocValuesFormatForField(String field) { return defaultDVFormat; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50StoredFieldsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50StoredFieldsFormat.java index 04663c49956..8aceb194095 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50StoredFieldsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50StoredFieldsFormat.java @@ -17,26 +17,46 @@ package org.apache.lucene.codecs.lucene50; * limitations under the License. */ +import java.io.IOException; +import java.util.Objects; + import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.StoredFieldsFormat; +import org.apache.lucene.codecs.StoredFieldsReader; +import org.apache.lucene.codecs.StoredFieldsWriter; import org.apache.lucene.codecs.compressing.CompressingStoredFieldsFormat; import org.apache.lucene.codecs.compressing.CompressingStoredFieldsIndexWriter; import org.apache.lucene.codecs.compressing.CompressionMode; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.SegmentInfo; import org.apache.lucene.index.StoredFieldVisitor; import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; import org.apache.lucene.util.packed.PackedInts; /** * Lucene 5.0 stored fields format. * *

Principle

- *

This {@link StoredFieldsFormat} compresses blocks of 16KB of documents in + *

This {@link StoredFieldsFormat} compresses blocks of documents in * order to improve the compression ratio compared to document-level * compression. It uses the LZ4 - * compression algorithm, which is fast to compress and very fast to decompress - * data. Although the compression method that is used focuses more on speed - * than on compression ratio, it should provide interesting compression ratios - * for redundant inputs (such as log files, HTML or plain text).

+ * compression algorithm by default in 16KB blocks, which is fast to compress + * and very fast to decompress data. Although the default compression method + * that is used ({@link Mode#BEST_SPEED BEST_SPEED}) focuses more on speed than on + * compression ratio, it should provide interesting compression ratios + * for redundant inputs (such as log files, HTML or plain text). For higher + * compression, you can choose ({@link Mode#BEST_COMPRESSION BEST_COMPRESSION}), which uses + * the DEFLATE algorithm with 24KB blocks + * for a better ratio at the expense of slower performance. + * These two options can be configured like this:

+ *
+ *   // the default: for high performance
+ *   indexWriterConfig.setCodec(new Lucene50Codec(Mode.BEST_SPEED));
+ *   // instead for higher compression (but slower):
+ *   // indexWriterConfig.setCodec(new Lucene50Codec(Mode.BEST_COMPRESSION));
+ * 
*

File formats

*

Stored fields are represented by two files:

*
    @@ -114,11 +134,58 @@ import org.apache.lucene.util.packed.PackedInts; * larger than (231 - 214) bytes.

    * @lucene.experimental */ -public final class Lucene50StoredFieldsFormat extends CompressingStoredFieldsFormat { - - /** Sole constructor. */ +public final class Lucene50StoredFieldsFormat extends StoredFieldsFormat { + + /** Configuration option for stored fields. */ + public static enum Mode { + /** Trade compression ratio for retrieval speed. */ + BEST_SPEED, + /** Trade retrieval speed for compression ratio. */ + BEST_COMPRESSION + } + + /** Attribute key for compression mode. */ + public static final String MODE_KEY = Lucene50StoredFieldsFormat.class.getSimpleName() + ".mode"; + + final Mode mode; + + /** Stored fields format with default options */ public Lucene50StoredFieldsFormat() { - super("Lucene50StoredFields", CompressionMode.FAST, 1 << 14, 128); + this(Mode.BEST_SPEED); + } + + /** Stored fields format with specified mode */ + public Lucene50StoredFieldsFormat(Mode mode) { + this.mode = Objects.requireNonNull(mode); } + @Override + public StoredFieldsReader fieldsReader(Directory directory, SegmentInfo si, FieldInfos fn, IOContext context) throws IOException { + String value = si.getAttribute(MODE_KEY); + if (value == null) { + throw new IllegalStateException("missing value for " + MODE_KEY + " for segment: " + si.name); + } + Mode mode = Mode.valueOf(value); + return impl(mode).fieldsReader(directory, si, fn, context); + } + + @Override + public StoredFieldsWriter fieldsWriter(Directory directory, SegmentInfo si, IOContext context) throws IOException { + String previous = si.putAttribute(MODE_KEY, mode.name()); + if (previous != null) { + throw new IllegalStateException("found existing value for " + MODE_KEY + " for segment: " + si.name + + ", old=" + previous + ", new=" + mode.name()); + } + return impl(mode).fieldsWriter(directory, si, context); + } + + StoredFieldsFormat impl(Mode mode) { + switch (mode) { + case BEST_SPEED: + return new CompressingStoredFieldsFormat("Lucene50StoredFieldsFast", CompressionMode.FAST, 1 << 14, 128); + case
BEST_COMPRESSION: + return new CompressingStoredFieldsFormat("Lucene50StoredFieldsHigh", CompressionMode.HIGH_COMPRESSION, 24576, 512); + default: throw new AssertionError(); + } + } } diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50StoredFieldsFormatHighCompression.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50StoredFieldsFormatHighCompression.java new file mode 100644 index 00000000000..bd0ce7a02af --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50StoredFieldsFormatHighCompression.java @@ -0,0 +1,81 @@ +package org.apache.lucene.codecs.lucene50; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.StoredField; +import org.apache.lucene.index.BaseStoredFieldsFormatTestCase; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.store.Directory; + +import com.carrotsearch.randomizedtesting.generators.RandomPicks; + +public class TestLucene50StoredFieldsFormatHighCompression extends BaseStoredFieldsFormatTestCase { + @Override + protected Codec getCodec() { + return new Lucene50Codec(Mode.BEST_COMPRESSION); + } + + /** + * Change compression params (leaving it the same for old segments) + * and tests that nothing breaks. + */ + public void testMixedCompressions() throws Exception { + Directory dir = newDirectory(); + for (int i = 0; i < 10; i++) { + IndexWriterConfig iwc = newIndexWriterConfig(); + iwc.setCodec(new Lucene50Codec(RandomPicks.randomFrom(random(), Mode.values()))); + IndexWriter iw = new IndexWriter(dir, iwc); + Document doc = new Document(); + doc.add(new StoredField("field1", "value1")); + doc.add(new StoredField("field2", "value2")); + iw.addDocument(doc); + if (random().nextInt(4) == 0) { + iw.forceMerge(1); + } + iw.commit(); + iw.close(); + } + + DirectoryReader ir = DirectoryReader.open(dir); + assertEquals(10, ir.numDocs()); + ir.close(); + // checkindex + dir.close(); + } + + public void testInvalidOptions() throws Exception { + try { + new Lucene50Codec(null); + fail("didn't hit exception"); + } catch (NullPointerException expected) { + // expected + } + + try { + new Lucene50StoredFieldsFormat(null); + fail("didn't hit exception"); + } catch (NullPointerException expected) { + // expected + } + } +} diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java
b/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java index e68ecea7e1d..9e1ad64ed05 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java +++ b/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java @@ -36,6 +36,9 @@ import org.apache.lucene.codecs.asserting.AssertingDocValuesFormat; import org.apache.lucene.codecs.asserting.AssertingPostingsFormat; import org.apache.lucene.codecs.cheapbastard.CheapBastardCodec; import org.apache.lucene.codecs.compressing.CompressingCodec; +import org.apache.lucene.codecs.lucene50.Lucene50Codec; +import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat; +import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode; import org.apache.lucene.codecs.mockrandom.MockRandomPostingsFormat; import org.apache.lucene.codecs.simpletext.SimpleTextCodec; import org.apache.lucene.index.RandomCodec; @@ -44,7 +47,9 @@ import org.apache.lucene.search.similarities.DefaultSimilarity; import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.util.LuceneTestCase.SuppressCodecs; import org.junit.internal.AssumptionViolatedException; + import com.carrotsearch.randomizedtesting.RandomizedContext; +import com.carrotsearch.randomizedtesting.generators.RandomPicks; import static org.apache.lucene.util.LuceneTestCase.INFOSTREAM; import static org.apache.lucene.util.LuceneTestCase.LiveIWCFlushMode; @@ -198,6 +203,8 @@ final class TestRuleSetupAndRestoreClassEnv extends AbstractBeforeAfterRule { codec = new AssertingCodec(); } else if ("Compressing".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && randomVal == 6 && !shouldAvoidCodec("Compressing"))) { codec = CompressingCodec.randomInstance(random); + } else if ("Lucene50".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && randomVal == 5 && !shouldAvoidCodec("Lucene50"))) { + codec = new Lucene50Codec(RandomPicks.randomFrom(random, 
Lucene50StoredFieldsFormat.Mode.values())); } else if (!"random".equals(TEST_CODEC)) { codec = Codec.forName(TEST_CODEC); } else if ("random".equals(TEST_POSTINGSFORMAT)) {