From 66745d3ce7f4658155d31220c07f351d98e153df Mon Sep 17 00:00:00 2001
From: Robert Muir
Date: Sat, 6 Dec 2014 03:30:02 +0000
Subject: [PATCH] LUCENE-5914: More options for stored fields compression
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1643490 13f79535-47bb-0310-9956-ffa450edef68
---
lucene/CHANGES.txt | 3 +
.../lucene/codecs/lucene50/Lucene50Codec.java | 35 ++++++--
.../lucene50/Lucene50StoredFieldsFormat.java | 85 +++++++++++++++++--
...ne50StoredFieldsFormatHighCompression.java | 81 ++++++++++++++++++
.../util/TestRuleSetupAndRestoreClassEnv.java | 7 ++
5 files changed, 197 insertions(+), 14 deletions(-)
create mode 100644 lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50StoredFieldsFormatHighCompression.java
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 3c116065a7f..6ab602f2a75 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -123,6 +123,9 @@ New Features
queries provided that term vectors with positions, offsets, and payloads are present. This is the
only highlighter that can highlight such queries accurately. (David Smiley)
+* LUCENE-5914: Add an option to Lucene50Codec to support either BEST_SPEED
+ or BEST_COMPRESSION for stored fields. (Adrien Grand, Robert Muir)
+
Optimizations
* LUCENE-5960: Use a more efficient bitset, not a Set, to
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50Codec.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50Codec.java
index 80417ef707e..a9aa5448201 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50Codec.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50Codec.java
@@ -17,6 +17,8 @@ package org.apache.lucene.codecs.lucene50;
* limitations under the License.
*/
+import java.util.Objects;
+
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.CompoundFormat;
import org.apache.lucene.codecs.DocValuesFormat;
@@ -28,6 +30,7 @@ import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
+import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode;
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
@@ -42,7 +45,6 @@ import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
* @lucene.experimental
*/
public class Lucene50Codec extends Codec {
- private final StoredFieldsFormat fieldsFormat = new Lucene50StoredFieldsFormat();
private final TermVectorsFormat vectorsFormat = new Lucene50TermVectorsFormat();
private final FieldInfosFormat fieldInfosFormat = new Lucene50FieldInfosFormat();
private final SegmentInfoFormat segmentInfosFormat = new Lucene50SegmentInfoFormat();
@@ -62,15 +64,30 @@ public class Lucene50Codec extends Codec {
return Lucene50Codec.this.getDocValuesFormatForField(field);
}
};
+
+ private final StoredFieldsFormat storedFieldsFormat;
- /** Sole constructor. */
+ /**
+ * Instantiates a new codec.
+ */
public Lucene50Codec() {
+ this(Mode.BEST_SPEED);
+ }
+
+ /**
+ * Instantiates a new codec, specifying the stored fields compression
+ * mode to use.
+ * @param mode stored fields compression mode to use for newly
+ * flushed/merged segments.
+ */
+ public Lucene50Codec(Mode mode) {
super("Lucene50");
+ this.storedFieldsFormat = new Lucene50StoredFieldsFormat(Objects.requireNonNull(mode));
}
@Override
public final StoredFieldsFormat storedFieldsFormat() {
- return fieldsFormat;
+ return storedFieldsFormat;
}
@Override
@@ -106,7 +123,11 @@ public class Lucene50Codec extends Codec {
/** Returns the postings format that should be used for writing
* new segments of field
.
*
- * The default implementation always returns "Lucene50"
+ * The default implementation always returns "Lucene50".
+ *
+ * WARNING: if you subclass, you are responsible for index
+ * backwards compatibility: future version of Lucene are only
+ * guaranteed to be able to read the default implementation.
*/
public PostingsFormat getPostingsFormatForField(String field) {
return defaultFormat;
@@ -115,7 +136,11 @@ public class Lucene50Codec extends Codec {
/** Returns the docvalues format that should be used for writing
* new segments of field
.
*
- * The default implementation always returns "Lucene50"
+ * The default implementation always returns "Lucene50".
+ *
+ * WARNING: if you subclass, you are responsible for index
+ * backwards compatibility: future version of Lucene are only
+ * guaranteed to be able to read the default implementation.
*/
public DocValuesFormat getDocValuesFormatForField(String field) {
return defaultDVFormat;
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50StoredFieldsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50StoredFieldsFormat.java
index 04663c49956..8aceb194095 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50StoredFieldsFormat.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50StoredFieldsFormat.java
@@ -17,26 +17,46 @@ package org.apache.lucene.codecs.lucene50;
* limitations under the License.
*/
+import java.io.IOException;
+import java.util.Objects;
+
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.StoredFieldsFormat;
+import org.apache.lucene.codecs.StoredFieldsReader;
+import org.apache.lucene.codecs.StoredFieldsWriter;
import org.apache.lucene.codecs.compressing.CompressingStoredFieldsFormat;
import org.apache.lucene.codecs.compressing.CompressingStoredFieldsIndexWriter;
import org.apache.lucene.codecs.compressing.CompressionMode;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
import org.apache.lucene.util.packed.PackedInts;
/**
* Lucene 5.0 stored fields format.
*
*
Principle
- * This {@link StoredFieldsFormat} compresses blocks of 16KB of documents in
+ *
This {@link StoredFieldsFormat} compresses blocks of documents in
* order to improve the compression ratio compared to document-level
* compression. It uses the LZ4
- * compression algorithm, which is fast to compress and very fast to decompress
- * data. Although the compression method that is used focuses more on speed
- * than on compression ratio, it should provide interesting compression ratios
- * for redundant inputs (such as log files, HTML or plain text).
+ * compression algorithm by default in 16KB blocks, which is fast to compress
+ * and very fast to decompress data. Although the default compression method
+ * that is used ({@link Mode#BEST_SPEED BEST_SPEED}) focuses more on speed than on
+ * compression ratio, it should provide interesting compression ratios
+ * for redundant inputs (such as log files, HTML or plain text). For higher
+ * compression, you can choose ({@link Mode#BEST_COMPRESSION BEST_COMPRESSION}), which uses
+ * the DEFLATE algorithm with 24KB blocks
+ * for a better ratio at the expense of slower performance.
+ * These two options can be configured like this:
+ *
+ * // the default: for high performance
+ * indexWriterConfig.setCodec(new Lucene50Codec(Mode.BEST_SPEED));
+ * // instead for higher performance (but slower):
+ * // indexWriterConfig.setCodec(new Lucene50Codec(Mode.BEST_COMPRESSION));
+ *
* File formats
* Stored fields are represented by two files:
*
@@ -114,11 +134,58 @@ import org.apache.lucene.util.packed.PackedInts;
* larger than (231 - 214) bytes.
* @lucene.experimental
*/
-public final class Lucene50StoredFieldsFormat extends CompressingStoredFieldsFormat {
-
- /** Sole constructor. */
+public final class Lucene50StoredFieldsFormat extends StoredFieldsFormat {
+
+ /** Configuration option for stored fields. */
+ public static enum Mode {
+ /** Trade compression ratio for retrieval speed. */
+ BEST_SPEED,
+ /** Trade retrieval speed for compression ratio. */
+ BEST_COMPRESSION
+ }
+
+ /** Attribute key for compression mode. */
+ public static final String MODE_KEY = Lucene50StoredFieldsFormat.class.getSimpleName() + ".mode";
+
+ final Mode mode;
+
+ /** Stored fields format with default options */
public Lucene50StoredFieldsFormat() {
- super("Lucene50StoredFields", CompressionMode.FAST, 1 << 14, 128);
+ this(Mode.BEST_SPEED);
+ }
+
+ /** Stored fields format with specified mode */
+ public Lucene50StoredFieldsFormat(Mode mode) {
+ this.mode = Objects.requireNonNull(mode);
}
+ @Override
+ public StoredFieldsReader fieldsReader(Directory directory, SegmentInfo si, FieldInfos fn, IOContext context) throws IOException {
+ String value = si.getAttribute(MODE_KEY);
+ if (value == null) {
+ throw new IllegalStateException("missing value for " + MODE_KEY + " for segment: " + si.name);
+ }
+ Mode mode = Mode.valueOf(value);
+ return impl(mode).fieldsReader(directory, si, fn, context);
+ }
+
+ @Override
+ public StoredFieldsWriter fieldsWriter(Directory directory, SegmentInfo si, IOContext context) throws IOException {
+ String previous = si.putAttribute(MODE_KEY, mode.name());
+ if (previous != null) {
+ throw new IllegalStateException("found existing value for " + MODE_KEY + " for segment: " + si.name +
+ "old=" + previous + ", new=" + mode.name());
+ }
+ return impl(mode).fieldsWriter(directory, si, context);
+ }
+
+ StoredFieldsFormat impl(Mode mode) {
+ switch (mode) {
+ case BEST_SPEED:
+ return new CompressingStoredFieldsFormat("Lucene50StoredFieldsFast", CompressionMode.FAST, 1 << 14, 128);
+ case BEST_COMPRESSION:
+ return new CompressingStoredFieldsFormat("Lucene50StoredFieldsHigh", CompressionMode.HIGH_COMPRESSION, 24576, 512);
+ default: throw new AssertionError();
+ }
+ }
}
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50StoredFieldsFormatHighCompression.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50StoredFieldsFormatHighCompression.java
new file mode 100644
index 00000000000..bd0ce7a02af
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50StoredFieldsFormatHighCompression.java
@@ -0,0 +1,81 @@
+package org.apache.lucene.codecs.lucene50;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.StoredField;
+import org.apache.lucene.index.BaseStoredFieldsFormatTestCase;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.store.Directory;
+
+import com.carrotsearch.randomizedtesting.generators.RandomPicks;
+
+public class TestLucene50StoredFieldsFormatHighCompression extends BaseStoredFieldsFormatTestCase {
+ @Override
+ protected Codec getCodec() {
+ return new Lucene50Codec(Mode.BEST_COMPRESSION);
+ }
+
+ /**
+ * Change compression params (leaving it the same for old segments)
+ * and tests that nothing breaks.
+ */
+ public void testMixedCompressions() throws Exception {
+ Directory dir = newDirectory();
+ for (int i = 0; i < 10; i++) {
+ IndexWriterConfig iwc = newIndexWriterConfig();
+ iwc.setCodec(new Lucene50Codec(RandomPicks.randomFrom(random(), Mode.values())));
+ IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig());
+ Document doc = new Document();
+ doc.add(new StoredField("field1", "value1"));
+ doc.add(new StoredField("field2", "value2"));
+ iw.addDocument(doc);
+ if (random().nextInt(4) == 0) {
+ iw.forceMerge(1);
+ }
+ iw.commit();
+ iw.close();
+ }
+
+ DirectoryReader ir = DirectoryReader.open(dir);
+ assertEquals(10, ir.numDocs());
+ ir.close();
+ // checkindex
+ dir.close();
+ }
+
+ public void testInvalidOptions() throws Exception {
+ try {
+ new Lucene50Codec(null);
+ fail("didn't hit exception");
+ } catch (NullPointerException expected) {
+ // expected
+ }
+
+ try {
+ new Lucene50StoredFieldsFormat(null);
+ fail("didn't hit exception");
+ } catch (NullPointerException expected) {
+ // expected
+ }
+ }
+}
diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java b/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java
index e68ecea7e1d..9e1ad64ed05 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java
@@ -36,6 +36,9 @@ import org.apache.lucene.codecs.asserting.AssertingDocValuesFormat;
import org.apache.lucene.codecs.asserting.AssertingPostingsFormat;
import org.apache.lucene.codecs.cheapbastard.CheapBastardCodec;
import org.apache.lucene.codecs.compressing.CompressingCodec;
+import org.apache.lucene.codecs.lucene50.Lucene50Codec;
+import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat;
+import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode;
import org.apache.lucene.codecs.mockrandom.MockRandomPostingsFormat;
import org.apache.lucene.codecs.simpletext.SimpleTextCodec;
import org.apache.lucene.index.RandomCodec;
@@ -44,7 +47,9 @@ import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
import org.junit.internal.AssumptionViolatedException;
+
import com.carrotsearch.randomizedtesting.RandomizedContext;
+import com.carrotsearch.randomizedtesting.generators.RandomPicks;
import static org.apache.lucene.util.LuceneTestCase.INFOSTREAM;
import static org.apache.lucene.util.LuceneTestCase.LiveIWCFlushMode;
@@ -198,6 +203,8 @@ final class TestRuleSetupAndRestoreClassEnv extends AbstractBeforeAfterRule {
codec = new AssertingCodec();
} else if ("Compressing".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && randomVal == 6 && !shouldAvoidCodec("Compressing"))) {
codec = CompressingCodec.randomInstance(random);
+ } else if ("Lucene50".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && randomVal == 5 && !shouldAvoidCodec("Lucene50"))) {
+ codec = new Lucene50Codec(RandomPicks.randomFrom(random, Lucene50StoredFieldsFormat.Mode.values()));
} else if (!"random".equals(TEST_CODEC)) {
codec = Codec.forName(TEST_CODEC);
} else if ("random".equals(TEST_POSTINGSFORMAT)) {