From 144ef2a0c054b54ee533f5618f36651931825f7d Mon Sep 17 00:00:00 2001 From: Ignacio Vera Date: Tue, 9 Mar 2021 08:11:59 +0100 Subject: [PATCH] LUCENE-9705: Create Lucene90StoredFieldsFormat (#2444) --- .../lucene50/Lucene50StoredFieldsFormat.java | 6 +- .../lucene50/Lucene50TermVectorsFormat.java | 5 +- ...Lucene50CompressingStoredFieldsFormat.java | 162 ++++ ...ucene50CompressingStoredFieldsReader.java} | 76 +- .../lucene87/BugfixDeflater_JDK8252739.java | 109 +++ .../DeflateWithPresetDictCompressionMode.java | 230 ++++++ .../LZ4WithPresetDictCompressionMode.java | 198 +++++ .../lucene87/Lucene87Codec.java | 22 +- .../lucene87/Lucene87StoredFieldsFormat.java | 28 +- .../Lucene50RWStoredFieldsFormat.java | 17 + ...Lucene50CompressingStoredFieldsWriter.java | 736 +++++++++++++++++ ...cene50RWCompressingStoredFieldsFormat.java | 52 ++ .../lucene87/Lucene87RWCodec.java | 16 + .../Lucene87RWStoredFieldsFormat.java | 71 ++ .../TestLucene87StoredFieldsFormat.java | 5 +- ...ne87StoredFieldsFormatHighCompression.java | 11 +- ...cene87StoredFieldsFormatMergeInstance.java | 4 +- .../compressing/LegacyFieldsIndexReader.java | 238 ------ .../codecs/compressing/package-info.java | 2 +- .../BugfixDeflater_JDK8252739.java | 4 +- .../DeflateWithPresetDictCompressionMode.java | 2 +- .../LZ4WithPresetDictCompressionMode.java | 2 +- .../lucene/codecs/lucene90/Lucene90Codec.java | 11 +- .../lucene90/Lucene90StoredFieldsFormat.java | 186 +++++ .../lucene90/Lucene90TermVectorsFormat.java | 7 +- .../compressing/FieldsIndex.java | 2 +- .../compressing/FieldsIndexReader.java | 6 +- .../compressing/FieldsIndexWriter.java | 2 +- ...ucene90CompressingStoredFieldsFormat.java} | 24 +- ...Lucene90CompressingStoredFieldsReader.java | 756 ++++++++++++++++++ ...ucene90CompressingStoredFieldsWriter.java} | 41 +- .../Lucene90CompressingTermVectorsFormat.java | 3 +- .../Lucene90CompressingTermVectorsReader.java | 28 +- .../Lucene90CompressingTermVectorsWriter.java | 5 +- .../compressing}/package-info.java | 7 +- .../lucene/codecs/lucene90/package-info.java | 6 +- .../index/SortingStoredFieldsConsumer.java | 5 +- .../index/SortingTermVectorsConsumer.java | 2 +- .../TestLucene90StoredFieldsFormat.java | 28 + ...ne90StoredFieldsFormatHighCompression.java | 85 ++ ...cene90StoredFieldsFormatMergeInstance.java | 26 + .../lucene/store/TestFileSwitchDirectory.java | 4 +- .../codecs/compressing/CompressingCodec.java | 10 +- .../DeflateWithPresetCompressingCodec.java | 2 +- .../LZ4WithPresetCompressingCodec.java | 2 +- .../lucene/codecs/compressing/package.html | 2 +- .../TestCompressingStoredFieldsFormat.java | 45 +- .../TestCompressingTermVectorsFormat.java | 3 +- .../apache/solr/core/TestCodecSupport.java | 6 +- 49 files changed, 2878 insertions(+), 422 deletions(-) create mode 100644 lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/compressing/Lucene50CompressingStoredFieldsFormat.java rename lucene/{core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java => backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/compressing/Lucene50CompressingStoredFieldsReader.java} (90%) create mode 100644 lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene87/BugfixDeflater_JDK8252739.java create mode 100644 lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene87/DeflateWithPresetDictCompressionMode.java create mode 100644 lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene87/LZ4WithPresetDictCompressionMode.java 
rename lucene/{core/src/java/org/apache/lucene/codecs => backward-codecs/src/java/org/apache/lucene/backward_codecs}/lucene87/Lucene87StoredFieldsFormat.java (89%) create mode 100644 lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene50/compressing/Lucene50CompressingStoredFieldsWriter.java create mode 100644 lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene50/compressing/Lucene50RWCompressingStoredFieldsFormat.java create mode 100644 lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene87/Lucene87RWStoredFieldsFormat.java rename lucene/{core/src/test/org/apache/lucene/codecs => backward-codecs/src/test/org/apache/lucene/backward_codecs}/lucene87/TestLucene87StoredFieldsFormat.java (89%) rename lucene/{core/src/test/org/apache/lucene/codecs => backward-codecs/src/test/org/apache/lucene/backward_codecs}/lucene87/TestLucene87StoredFieldsFormatHighCompression.java (89%) rename lucene/{core/src/test/org/apache/lucene/codecs => backward-codecs/src/test/org/apache/lucene/backward_codecs}/lucene87/TestLucene87StoredFieldsFormatMergeInstance.java (89%) delete mode 100644 lucene/core/src/java/org/apache/lucene/codecs/compressing/LegacyFieldsIndexReader.java rename lucene/core/src/java/org/apache/lucene/codecs/{lucene87 => lucene90}/BugfixDeflater_JDK8252739.java (98%) rename lucene/core/src/java/org/apache/lucene/codecs/{lucene87 => lucene90}/DeflateWithPresetDictCompressionMode.java (99%) rename lucene/core/src/java/org/apache/lucene/codecs/{lucene87 => lucene90}/LZ4WithPresetDictCompressionMode.java (99%) create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90StoredFieldsFormat.java rename lucene/core/src/java/org/apache/lucene/codecs/{ => lucene90}/compressing/FieldsIndex.java (95%) rename lucene/core/src/java/org/apache/lucene/codecs/{ => lucene90}/compressing/FieldsIndexReader.java (95%) rename lucene/core/src/java/org/apache/lucene/codecs/{ => lucene90}/compressing/FieldsIndexWriter.java (99%) rename lucene/core/src/java/org/apache/lucene/codecs/{compressing/CompressingStoredFieldsFormat.java => lucene90/compressing/Lucene90CompressingStoredFieldsFormat.java} (87%) create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingStoredFieldsReader.java rename lucene/core/src/java/org/apache/lucene/codecs/{compressing/CompressingStoredFieldsWriter.java => lucene90/compressing/Lucene90CompressingStoredFieldsWriter.java} (94%) rename lucene/core/src/java/org/apache/lucene/codecs/{ => lucene90}/compressing/Lucene90CompressingTermVectorsFormat.java (97%) rename lucene/core/src/java/org/apache/lucene/codecs/{ => lucene90}/compressing/Lucene90CompressingTermVectorsReader.java (96%) rename lucene/core/src/java/org/apache/lucene/codecs/{ => lucene90}/compressing/Lucene90CompressingTermVectorsWriter.java (99%) rename lucene/core/src/java/org/apache/lucene/codecs/{lucene87 => lucene90/compressing}/package-info.java (80%) create mode 100644 lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90StoredFieldsFormat.java create mode 100644 lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90StoredFieldsFormatHighCompression.java create mode 100644 lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90StoredFieldsFormatMergeInstance.java rename lucene/test-framework/src/test/org/apache/lucene/codecs/{ => lucene90}/compressing/TestCompressingStoredFieldsFormat.java (86%) rename lucene/test-framework/src/test/org/apache/lucene/codecs/{ => 
lucene90}/compressing/TestCompressingTermVectorsFormat.java (97%) diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/Lucene50StoredFieldsFormat.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/Lucene50StoredFieldsFormat.java index f34e2f4d533..fec31425428 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/Lucene50StoredFieldsFormat.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/Lucene50StoredFieldsFormat.java @@ -18,10 +18,10 @@ package org.apache.lucene.backward_codecs.lucene50; import java.io.IOException; import java.util.Objects; +import org.apache.lucene.backward_codecs.lucene50.compressing.Lucene50CompressingStoredFieldsFormat; import org.apache.lucene.codecs.StoredFieldsFormat; import org.apache.lucene.codecs.StoredFieldsReader; import org.apache.lucene.codecs.StoredFieldsWriter; -import org.apache.lucene.codecs.compressing.CompressingStoredFieldsFormat; import org.apache.lucene.codecs.compressing.CompressionMode; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.SegmentInfo; @@ -147,10 +147,10 @@ public class Lucene50StoredFieldsFormat extends StoredFieldsFormat { StoredFieldsFormat impl(Mode mode) { switch (mode) { case BEST_SPEED: - return new CompressingStoredFieldsFormat( + return new Lucene50CompressingStoredFieldsFormat( "Lucene50StoredFieldsFastData", CompressionMode.FAST, 1 << 14, 128, 10); case BEST_COMPRESSION: - return new CompressingStoredFieldsFormat( + return new Lucene50CompressingStoredFieldsFormat( "Lucene50StoredFieldsHighData", CompressionMode.HIGH_COMPRESSION, 61440, 512, 10); default: throw new AssertionError(); diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/Lucene50TermVectorsFormat.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/Lucene50TermVectorsFormat.java index 1771e7fc89c..c9c676500de 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/Lucene50TermVectorsFormat.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/Lucene50TermVectorsFormat.java @@ -17,11 +17,10 @@ package org.apache.lucene.backward_codecs.lucene50; import org.apache.lucene.backward_codecs.lucene50.compressing.Lucene50CompressingTermVectorsFormat; +import org.apache.lucene.backward_codecs.lucene87.Lucene87StoredFieldsFormat; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.TermVectorsFormat; import org.apache.lucene.codecs.compressing.CompressionMode; -import org.apache.lucene.codecs.compressing.FieldsIndexWriter; -import org.apache.lucene.codecs.lucene87.Lucene87StoredFieldsFormat; import org.apache.lucene.store.DataOutput; import org.apache.lucene.util.packed.BlockPackedWriter; import org.apache.lucene.util.packed.PackedInts; @@ -144,7 +143,7 @@ import org.apache.lucene.util.packed.PackedInts; * * diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/compressing/Lucene50CompressingStoredFieldsFormat.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/compressing/Lucene50CompressingStoredFieldsFormat.java new file mode 100644 index 00000000000..982301b4a70 --- /dev/null +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/compressing/Lucene50CompressingStoredFieldsFormat.java @@ -0,0 +1,162 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or 
more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.backward_codecs.lucene50.compressing; + +import java.io.IOException; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.StoredFieldsFormat; +import org.apache.lucene.codecs.StoredFieldsReader; +import org.apache.lucene.codecs.StoredFieldsWriter; +import org.apache.lucene.codecs.compressing.CompressionMode; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.MergePolicy; +import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.util.packed.DirectMonotonicWriter; + +/** + * A {@link StoredFieldsFormat} that compresses documents in chunks in order to improve the + * compression ratio. + * + *
<p>For a chunk size of <code>chunkSize</code> bytes, this {@link StoredFieldsFormat} does not + * support documents larger than (2<sup>31</sup> - chunkSize) bytes. + + *
<p>
For optimal performance, you should use a {@link MergePolicy} that returns segments that have + * the biggest byte size first. + * + * @lucene.experimental + */ +public class Lucene50CompressingStoredFieldsFormat extends StoredFieldsFormat { + + /** format name */ + protected final String formatName; + /** segment suffix */ + protected final String segmentSuffix; + /** compression mode */ + protected final CompressionMode compressionMode; + /** chunk size */ + protected final int chunkSize; + /** max docs per chunk */ + protected final int maxDocsPerChunk; + /** block shift */ + protected final int blockShift; + + /** + * Create a new {@link Lucene50CompressingStoredFieldsFormat} with an empty segment suffix. + * + * @see Lucene50CompressingStoredFieldsFormat#Lucene50CompressingStoredFieldsFormat(String, + * String, CompressionMode, int, int, int) + */ + public Lucene50CompressingStoredFieldsFormat( + String formatName, + CompressionMode compressionMode, + int chunkSize, + int maxDocsPerChunk, + int blockShift) { + this(formatName, "", compressionMode, chunkSize, maxDocsPerChunk, blockShift); + } + + /** + * Create a new {@link Lucene50CompressingStoredFieldsFormat}. + * + *
<p><code>formatName</code> is the name of the format. This name will be used in the file + * formats to perform {@link CodecUtil#checkIndexHeader codec header checks}. + + *
<p><code>segmentSuffix</code> is the segment suffix. This suffix is added to the result file + * name only if it's not the empty string. + + *
<p>The <code>compressionMode</code> parameter allows you to choose between compression + * algorithms that have various compression and decompression speeds so that you can pick the one + * that best fits your indexing and searching throughput. You should never instantiate two {@link + * Lucene50CompressingStoredFieldsFormat}s that have the same name but different {@link + * CompressionMode}s. + + *
<p><code>chunkSize</code> is the minimum byte size of a chunk of documents. A value of 1 + * can make sense if there is redundancy across fields. <code>maxDocsPerChunk</code> is an + * upper bound on how many docs may be stored in a single chunk. This is to bound the CPU costs for + * highly compressible data. + + *
<p>
Higher values of chunkSize should improve the compression ratio but will + * require more memory at indexing time and might make document loading a little slower (depending + * on the size of your OS cache compared to the size of your index). + * + * @param formatName the name of the {@link StoredFieldsFormat} + * @param compressionMode the {@link CompressionMode} to use + * @param chunkSize the minimum number of bytes of a single chunk of stored documents + * @param maxDocsPerChunk the maximum number of documents in a single chunk + * @param blockShift the log in base 2 of number of chunks to store in an index block + * @see CompressionMode + */ + public Lucene50CompressingStoredFieldsFormat( + String formatName, + String segmentSuffix, + CompressionMode compressionMode, + int chunkSize, + int maxDocsPerChunk, + int blockShift) { + this.formatName = formatName; + this.segmentSuffix = segmentSuffix; + this.compressionMode = compressionMode; + if (chunkSize < 1) { + throw new IllegalArgumentException("chunkSize must be >= 1"); + } + this.chunkSize = chunkSize; + if (maxDocsPerChunk < 1) { + throw new IllegalArgumentException("maxDocsPerChunk must be >= 1"); + } + this.maxDocsPerChunk = maxDocsPerChunk; + if (blockShift < DirectMonotonicWriter.MIN_BLOCK_SHIFT + || blockShift > DirectMonotonicWriter.MAX_BLOCK_SHIFT) { + throw new IllegalArgumentException( + "blockSize must be in " + + DirectMonotonicWriter.MIN_BLOCK_SHIFT + + "-" + + DirectMonotonicWriter.MAX_BLOCK_SHIFT + + ", got " + + blockShift); + } + this.blockShift = blockShift; + } + + @Override + public StoredFieldsReader fieldsReader( + Directory directory, SegmentInfo si, FieldInfos fn, IOContext context) throws IOException { + return new Lucene50CompressingStoredFieldsReader( + directory, si, segmentSuffix, fn, context, formatName, compressionMode); + } + + @Override + public StoredFieldsWriter fieldsWriter(Directory directory, SegmentInfo si, IOContext context) + throws IOException { + throw new UnsupportedOperationException("Old formats can't be used for writing"); + } + + @Override + public String toString() { + return getClass().getSimpleName() + + "(compressionMode=" + + compressionMode + + ", chunkSize=" + + chunkSize + + ", maxDocsPerChunk=" + + maxDocsPerChunk + + ", blockShift=" + + blockShift + + ")"; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/compressing/Lucene50CompressingStoredFieldsReader.java similarity index 90% rename from lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java rename to lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/compressing/Lucene50CompressingStoredFieldsReader.java index 9b7fac438e1..a94dfb24f30 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/compressing/Lucene50CompressingStoredFieldsReader.java @@ -14,31 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
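To make the constructor contract above concrete, here is a minimal usage sketch built only from values that appear elsewhere in this patch (the BEST_SPEED wiring of Lucene50StoredFieldsFormat.impl shown earlier); the variable name "fast" is illustrative:

    import org.apache.lucene.backward_codecs.lucene50.compressing.Lucene50CompressingStoredFieldsFormat;
    import org.apache.lucene.codecs.StoredFieldsFormat;
    import org.apache.lucene.codecs.compressing.CompressionMode;

    // 1 << 14 = 16 KB minimum chunk size, at most 128 docs per chunk, blockShift of 10
    StoredFieldsFormat fast =
        new Lucene50CompressingStoredFieldsFormat(
            "Lucene50StoredFieldsFastData", CompressionMode.FAST, 1 << 14, 128, 10);
    // fieldsReader(...) works as before; fieldsWriter(...) now throws
    // UnsupportedOperationException, since the format is read-only in backward-codecs.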
*/ -package org.apache.lucene.codecs.compressing; - -import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.BYTE_ARR; -import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.DAY; -import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.DAY_ENCODING; -import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.FIELDS_EXTENSION; -import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.HOUR; -import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.HOUR_ENCODING; -import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.INDEX_CODEC_NAME; -import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.INDEX_EXTENSION; -import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.META_EXTENSION; -import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.META_VERSION_START; -import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.NUMERIC_DOUBLE; -import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.NUMERIC_FLOAT; -import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.NUMERIC_INT; -import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.NUMERIC_LONG; -import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.SECOND; -import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.SECOND_ENCODING; -import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.STRING; -import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.TYPE_BITS; -import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.TYPE_MASK; -import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.VERSION_CURRENT; -import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.VERSION_META; -import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.VERSION_OFFHEAP_INDEX; -import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.VERSION_START; +package org.apache.lucene.backward_codecs.lucene50.compressing; import java.io.EOFException; import java.io.IOException; @@ -47,6 +23,8 @@ import java.util.Collection; import java.util.Collections; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.StoredFieldsReader; +import org.apache.lucene.codecs.compressing.CompressionMode; +import org.apache.lucene.codecs.compressing.Decompressor; import org.apache.lucene.document.Document; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.FieldInfo; @@ -71,11 +49,46 @@ import org.apache.lucene.util.LongsRef; import org.apache.lucene.util.packed.PackedInts; /** - * {@link StoredFieldsReader} impl for {@link CompressingStoredFieldsFormat}. + * {@link StoredFieldsReader} impl for {@link Lucene50CompressingStoredFieldsFormat}. 
* * @lucene.experimental */ -public final class CompressingStoredFieldsReader extends StoredFieldsReader { +public final class Lucene50CompressingStoredFieldsReader extends StoredFieldsReader { + + /** Extension of stored fields file */ + public static final String FIELDS_EXTENSION = "fdt"; + /** Extension of stored fields index */ + public static final String INDEX_EXTENSION = "fdx"; + /** Extension of stored fields meta */ + public static final String META_EXTENSION = "fdm"; + /** Codec name for the index. */ + public static final String INDEX_CODEC_NAME = "Lucene85FieldsIndex"; + + static final int STRING = 0x00; + static final int BYTE_ARR = 0x01; + static final int NUMERIC_INT = 0x02; + static final int NUMERIC_FLOAT = 0x03; + static final int NUMERIC_LONG = 0x04; + static final int NUMERIC_DOUBLE = 0x05; + + static final int TYPE_BITS = PackedInts.bitsRequired(NUMERIC_DOUBLE); + static final int TYPE_MASK = (int) PackedInts.maxValue(TYPE_BITS); + + static final int VERSION_START = 1; + static final int VERSION_OFFHEAP_INDEX = 2; + /** Version where all metadata were moved to the meta file. */ + static final int VERSION_META = 3; + + static final int VERSION_CURRENT = VERSION_META; + static final int META_VERSION_START = 0; + + // for compression of timestamps + static final long SECOND = 1000L; + static final long HOUR = 60 * 60 * SECOND; + static final long DAY = 24 * HOUR; + static final int SECOND_ENCODING = 0x40; + static final int HOUR_ENCODING = 0x80; + static final int DAY_ENCODING = 0xC0; private final int version; private final FieldInfos fieldInfos; @@ -94,7 +107,8 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader { private boolean closed; // used by clone - private CompressingStoredFieldsReader(CompressingStoredFieldsReader reader, boolean merging) { + private Lucene50CompressingStoredFieldsReader( + Lucene50CompressingStoredFieldsReader reader, boolean merging) { this.version = reader.version; this.fieldInfos = reader.fieldInfos; this.fieldsStream = reader.fieldsStream.clone(); @@ -113,7 +127,7 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader { } /** Sole constructor. */ - public CompressingStoredFieldsReader( + public Lucene50CompressingStoredFieldsReader( Directory d, SegmentInfo si, String segmentSuffix, @@ -718,13 +732,13 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader { @Override public StoredFieldsReader clone() { ensureOpen(); - return new CompressingStoredFieldsReader(this, false); + return new Lucene50CompressingStoredFieldsReader(this, false); } @Override public StoredFieldsReader getMergeInstance() { ensureOpen(); - return new CompressingStoredFieldsReader(this, true); + return new Lucene50CompressingStoredFieldsReader(this, true); } int getVersion() { diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene87/BugfixDeflater_JDK8252739.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene87/BugfixDeflater_JDK8252739.java new file mode 100644 index 00000000000..a4fcc371c9d --- /dev/null +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene87/BugfixDeflater_JDK8252739.java @@ -0,0 +1,109 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.backward_codecs.lucene87; + +import java.util.Arrays; +import java.util.zip.DataFormatException; +import java.util.zip.Deflater; +import java.util.zip.Inflater; +import org.apache.lucene.util.BytesRefBuilder; +import org.apache.lucene.util.SuppressForbidden; + +/** + * This class is a workaround for JDK bug JDK-8252739. + * + * @lucene.internal + */ +@FunctionalInterface +interface BugfixDeflater_JDK8252739 { + + public static final boolean IS_BUGGY_JDK = detectBuggyJDK(); + + /** + * Creates a bugfix for {@link Deflater} instances, which works around JDK-8252739. + * + *
<p>
Use this whenever you intend to call {@link Deflater#setDictionary(byte[], int, int)} on a + * {@code Deflater}. + */ + @SuppressForbidden(reason = "Works around bug, so it must call forbidden method") + public static BugfixDeflater_JDK8252739 createBugfix(Deflater deflater) { + if (IS_BUGGY_JDK) { + final BytesRefBuilder dictBytesScratch = new BytesRefBuilder(); + return (dictBytes, off, len) -> { + if (off > 0) { + dictBytesScratch.grow(len); + System.arraycopy(dictBytes, off, dictBytesScratch.bytes(), 0, len); + deflater.setDictionary(dictBytesScratch.bytes(), 0, len); + } else { + deflater.setDictionary(dictBytes, off, len); + } + }; + } else { + return deflater::setDictionary; + } + } + + /** Call this method as a workaround */ + void setDictionary(byte[] dictBytes, int off, int len); + + @SuppressForbidden(reason = "Detector for the bug, so it must call buggy method") + private static boolean detectBuggyJDK() { + final byte[] testData = new byte[] {1, 2, 3, 4, 5, 6, 7, 8}; + final byte[] compressed = new byte[32]; // way enough space + final Deflater deflater = new Deflater(6, true); + int compressedSize; + try { + deflater.reset(); + deflater.setDictionary(testData, 4, 4); + deflater.setInput(testData); + deflater.finish(); + compressedSize = deflater.deflate(compressed, 0, compressed.length, Deflater.FULL_FLUSH); + } finally { + deflater.end(); + } + + // in nowrap mode we need extra 0-byte as padding, add explicit: + compressed[compressedSize] = 0; + compressedSize++; + + final Inflater inflater = new Inflater(true); + final byte[] restored = new byte[testData.length]; + try { + inflater.reset(); + inflater.setDictionary(testData, 4, 4); + inflater.setInput(compressed, 0, compressedSize); + final int restoredLength = inflater.inflate(restored); + if (restoredLength != testData.length) { + return true; + } + } catch (DataFormatException e) { + return true; + } catch (RuntimeException e) { + return true; + } finally { + inflater.end(); + } + + if (Arrays.equals(testData, restored) == false) { + return true; + } + + // all fine + return false; + } +} diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene87/DeflateWithPresetDictCompressionMode.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene87/DeflateWithPresetDictCompressionMode.java new file mode 100644 index 00000000000..09f539ef0b5 --- /dev/null +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene87/DeflateWithPresetDictCompressionMode.java @@ -0,0 +1,230 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
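Since the interface above is package-private, any caller lives in org.apache.lucene.backward_codecs.lucene87; a minimal sketch of the intended call pattern (dict, off, and len are placeholder variables):

    import java.util.zip.Deflater;

    Deflater deflater = new Deflater(6, true); // level 6, nowrap mode, as used in this patch
    BugfixDeflater_JDK8252739 fix = BugfixDeflater_JDK8252739.createBugfix(deflater);
    // On JDKs affected by JDK-8252739 the wrapper copies the dictionary to offset 0 in a
    // scratch buffer before delegating; elsewhere it is a plain method reference.
    fix.setDictionary(dict, off, len);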
+ */ +package org.apache.lucene.backward_codecs.lucene87; + +import java.io.IOException; +import java.util.zip.DataFormatException; +import java.util.zip.Deflater; +import java.util.zip.Inflater; +import org.apache.lucene.codecs.compressing.CompressionMode; +import org.apache.lucene.codecs.compressing.Compressor; +import org.apache.lucene.codecs.compressing.Decompressor; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; + +/** + * A compression mode that trades speed for compression ratio. Although compression and + * decompression might be slow, this compression mode should provide a good compression ratio. This + * mode might be interesting if/when your index size is much bigger than your OS cache. + * + * @lucene.internal + */ +public final class DeflateWithPresetDictCompressionMode extends CompressionMode { + + // Shoot for 10 sub blocks + private static final int NUM_SUB_BLOCKS = 10; + // And a dictionary whose size is about 6x smaller than sub blocks + private static final int DICT_SIZE_FACTOR = 6; + + /** Sole constructor. */ + public DeflateWithPresetDictCompressionMode() {} + + @Override + public Compressor newCompressor() { + // notes: + // 3 is the highest level that doesn't have lazy match evaluation + // 6 is the default, higher than that is just a waste of cpu + return new DeflateWithPresetDictCompressor(6); + } + + @Override + public Decompressor newDecompressor() { + return new DeflateWithPresetDictDecompressor(); + } + + @Override + public String toString() { + return "BEST_COMPRESSION"; + } + + private static final class DeflateWithPresetDictDecompressor extends Decompressor { + + byte[] compressed; + + DeflateWithPresetDictDecompressor() { + compressed = new byte[0]; + } + + private void doDecompress(DataInput in, Inflater decompressor, BytesRef bytes) + throws IOException { + final int compressedLength = in.readVInt(); + if (compressedLength == 0) { + return; + } + // pad with extra "dummy byte": see javadocs for using Inflater(true) + // we do it for compliance, but it's unnecessary for years in zlib. 
+ final int paddedLength = compressedLength + 1; + compressed = ArrayUtil.grow(compressed, paddedLength); + in.readBytes(compressed, 0, compressedLength); + compressed[compressedLength] = 0; // explicitly set dummy byte to 0 + + // extra "dummy byte" + decompressor.setInput(compressed, 0, paddedLength); + try { + bytes.length += + decompressor.inflate(bytes.bytes, bytes.length, bytes.bytes.length - bytes.length); + } catch (DataFormatException e) { + throw new IOException(e); + } + if (decompressor.finished() == false) { + throw new CorruptIndexException( + "Invalid decoder state: needsInput=" + + decompressor.needsInput() + + ", needsDict=" + + decompressor.needsDictionary(), + in); + } + } + + @Override + public void decompress(DataInput in, int originalLength, int offset, int length, BytesRef bytes) + throws IOException { + assert offset + length <= originalLength; + if (length == 0) { + bytes.length = 0; + return; + } + final int dictLength = in.readVInt(); + final int blockLength = in.readVInt(); + bytes.bytes = ArrayUtil.grow(bytes.bytes, dictLength); + bytes.offset = bytes.length = 0; + + final Inflater decompressor = new Inflater(true); + try { + // Read the dictionary + doDecompress(in, decompressor, bytes); + if (dictLength != bytes.length) { + throw new CorruptIndexException("Unexpected dict length", in); + } + + int offsetInBlock = dictLength; + int offsetInBytesRef = offset; + + // Skip unneeded blocks + while (offsetInBlock + blockLength < offset) { + final int compressedLength = in.readVInt(); + in.skipBytes(compressedLength); + offsetInBlock += blockLength; + offsetInBytesRef -= blockLength; + } + + // Read blocks that intersect with the interval we need + while (offsetInBlock < offset + length) { + bytes.bytes = ArrayUtil.grow(bytes.bytes, bytes.length + blockLength); + decompressor.reset(); + decompressor.setDictionary(bytes.bytes, 0, dictLength); + doDecompress(in, decompressor, bytes); + offsetInBlock += blockLength; + } + + bytes.offset = offsetInBytesRef; + bytes.length = length; + assert bytes.isValid(); + } finally { + decompressor.end(); + } + } + + @Override + public Decompressor clone() { + return new DeflateWithPresetDictDecompressor(); + } + } + + private static class DeflateWithPresetDictCompressor extends Compressor { + + final Deflater compressor; + final BugfixDeflater_JDK8252739 deflaterBugfix; + byte[] compressed; + boolean closed; + + DeflateWithPresetDictCompressor(int level) { + compressor = new Deflater(level, true); + deflaterBugfix = BugfixDeflater_JDK8252739.createBugfix(compressor); + compressed = new byte[64]; + } + + private void doCompress(byte[] bytes, int off, int len, DataOutput out) throws IOException { + if (len == 0) { + out.writeVInt(0); + return; + } + compressor.setInput(bytes, off, len); + compressor.finish(); + if (compressor.needsInput()) { + throw new IllegalStateException(); + } + + int totalCount = 0; + for (; ; ) { + final int count = + compressor.deflate(compressed, totalCount, compressed.length - totalCount); + totalCount += count; + assert totalCount <= compressed.length; + if (compressor.finished()) { + break; + } else { + compressed = ArrayUtil.grow(compressed); + } + } + + out.writeVInt(totalCount); + out.writeBytes(compressed, totalCount); + } + + @Override + public void compress(byte[] bytes, int off, int len, DataOutput out) throws IOException { + final int dictLength = len / (NUM_SUB_BLOCKS * DICT_SIZE_FACTOR); + final int blockLength = (len - dictLength + NUM_SUB_BLOCKS - 1) / NUM_SUB_BLOCKS; + 
out.writeVInt(dictLength); + out.writeVInt(blockLength); + final int end = off + len; + + // Compress the dictionary first + compressor.reset(); + doCompress(bytes, off, dictLength, out); + + // And then sub blocks + for (int start = off + dictLength; start < end; start += blockLength) { + compressor.reset(); + deflaterBugfix.setDictionary(bytes, off, dictLength); + doCompress(bytes, start, Math.min(blockLength, off + len - start), out); + } + } + + @Override + public void close() throws IOException { + if (closed == false) { + compressor.end(); + closed = true; + } + } + } +} diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene87/LZ4WithPresetDictCompressionMode.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene87/LZ4WithPresetDictCompressionMode.java new file mode 100644 index 00000000000..5efb49b6966 --- /dev/null +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene87/LZ4WithPresetDictCompressionMode.java @@ -0,0 +1,198 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.backward_codecs.lucene87; + +import java.io.IOException; +import org.apache.lucene.codecs.compressing.CompressionMode; +import org.apache.lucene.codecs.compressing.Compressor; +import org.apache.lucene.codecs.compressing.Decompressor; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.store.ByteBuffersDataOutput; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.compress.LZ4; + +/** + * A compression mode that compromises on the compression ratio to provide fast compression and + * decompression. + * + * @lucene.internal + */ +public final class LZ4WithPresetDictCompressionMode extends CompressionMode { + + // Shoot for 10 sub blocks + private static final int NUM_SUB_BLOCKS = 10; + // And a dictionary whose size is about 16x smaller than sub blocks + private static final int DICT_SIZE_FACTOR = 16; + + /** Sole constructor. 
*/ + public LZ4WithPresetDictCompressionMode() {} + + @Override + public Compressor newCompressor() { + return new LZ4WithPresetDictCompressor(); + } + + @Override + public Decompressor newDecompressor() { + return new LZ4WithPresetDictDecompressor(); + } + + @Override + public String toString() { + return "BEST_SPEED"; + } + + private static final class LZ4WithPresetDictDecompressor extends Decompressor { + + private int[] compressedLengths; + private byte[] buffer; + + LZ4WithPresetDictDecompressor() { + compressedLengths = new int[0]; + buffer = new byte[0]; + } + + private int readCompressedLengths( + DataInput in, int originalLength, int dictLength, int blockLength) throws IOException { + in.readVInt(); // compressed length of the dictionary, unused + int totalLength = dictLength; + int i = 0; + while (totalLength < originalLength) { + compressedLengths = ArrayUtil.grow(compressedLengths, i + 1); + compressedLengths[i++] = in.readVInt(); + totalLength += blockLength; + } + return i; + } + + @Override + public void decompress(DataInput in, int originalLength, int offset, int length, BytesRef bytes) + throws IOException { + assert offset + length <= originalLength; + + if (length == 0) { + bytes.length = 0; + return; + } + + final int dictLength = in.readVInt(); + final int blockLength = in.readVInt(); + + final int numBlocks = readCompressedLengths(in, originalLength, dictLength, blockLength); + + buffer = ArrayUtil.grow(buffer, dictLength + blockLength); + bytes.length = 0; + // Read the dictionary + if (LZ4.decompress(in, dictLength, buffer, 0) != dictLength) { + throw new CorruptIndexException("Illegal dict length", in); + } + + int offsetInBlock = dictLength; + int offsetInBytesRef = offset; + if (offset >= dictLength) { + offsetInBytesRef -= dictLength; + + // Skip unneeded blocks + int numBytesToSkip = 0; + for (int i = 0; i < numBlocks && offsetInBlock + blockLength < offset; ++i) { + int compressedBlockLength = compressedLengths[i]; + numBytesToSkip += compressedBlockLength; + offsetInBlock += blockLength; + offsetInBytesRef -= blockLength; + } + in.skipBytes(numBytesToSkip); + } else { + // The dictionary contains some bytes we need, copy its content to the BytesRef + bytes.bytes = ArrayUtil.grow(bytes.bytes, dictLength); + System.arraycopy(buffer, 0, bytes.bytes, 0, dictLength); + bytes.length = dictLength; + } + + // Read blocks that intersect with the interval we need + while (offsetInBlock < offset + length) { + final int bytesToDecompress = Math.min(blockLength, offset + length - offsetInBlock); + LZ4.decompress(in, bytesToDecompress, buffer, dictLength); + bytes.bytes = ArrayUtil.grow(bytes.bytes, bytes.length + bytesToDecompress); + System.arraycopy(buffer, dictLength, bytes.bytes, bytes.length, bytesToDecompress); + bytes.length += bytesToDecompress; + offsetInBlock += blockLength; + } + + bytes.offset = offsetInBytesRef; + bytes.length = length; + assert bytes.isValid(); + } + + @Override + public Decompressor clone() { + return new LZ4WithPresetDictDecompressor(); + } + } + + private static class LZ4WithPresetDictCompressor extends Compressor { + + final ByteBuffersDataOutput compressed; + final LZ4.FastCompressionHashTable hashTable; + byte[] buffer; + + LZ4WithPresetDictCompressor() { + compressed = ByteBuffersDataOutput.newResettableInstance(); + hashTable = new LZ4.FastCompressionHashTable(); + buffer = BytesRef.EMPTY_BYTES; + } + + private void doCompress(byte[] bytes, int dictLen, int len, DataOutput out) throws IOException { + long prevCompressedSize = 
compressed.size(); + LZ4.compressWithDictionary(bytes, 0, dictLen, len, compressed, hashTable); + // Write the number of compressed bytes + out.writeVInt(Math.toIntExact(compressed.size() - prevCompressedSize)); + } + + @Override + public void compress(byte[] bytes, int off, int len, DataOutput out) throws IOException { + final int dictLength = len / (NUM_SUB_BLOCKS * DICT_SIZE_FACTOR); + final int blockLength = (len - dictLength + NUM_SUB_BLOCKS - 1) / NUM_SUB_BLOCKS; + buffer = ArrayUtil.grow(buffer, dictLength + blockLength); + out.writeVInt(dictLength); + out.writeVInt(blockLength); + final int end = off + len; + + compressed.reset(); + // Compress the dictionary first + System.arraycopy(bytes, off, buffer, 0, dictLength); + doCompress(buffer, 0, dictLength, out); + + // And then sub blocks + for (int start = off + dictLength; start < end; start += blockLength) { + int l = Math.min(blockLength, off + len - start); + System.arraycopy(bytes, start, buffer, dictLength, l); + doCompress(buffer, dictLength, l, out); + } + + // We only wrote lengths so far, now write compressed data + compressed.copyTo(out); + } + + @Override + public void close() throws IOException { + // no-op + } + } +} diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene87/Lucene87Codec.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene87/Lucene87Codec.java index 5703d5e5efa..e261f51638b 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene87/Lucene87Codec.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene87/Lucene87Codec.java @@ -40,7 +40,6 @@ import org.apache.lucene.codecs.TermVectorsFormat; import org.apache.lucene.codecs.VectorFormat; import org.apache.lucene.codecs.lucene86.Lucene86PointsFormat; import org.apache.lucene.codecs.lucene86.Lucene86SegmentInfoFormat; -import org.apache.lucene.codecs.lucene87.Lucene87StoredFieldsFormat; import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat; import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; @@ -64,8 +63,10 @@ public class Lucene87Codec extends Codec { Lucene87StoredFieldsFormat.Mode.BEST_COMPRESSION, Lucene80DocValuesFormat.Mode.BEST_COMPRESSION); - private final Lucene87StoredFieldsFormat.Mode storedMode; - private final Lucene80DocValuesFormat.Mode dvMode; + /** compression mode for stored fields */ + protected final Lucene87StoredFieldsFormat.Mode storedMode; + /** compression mode for doc value fields */ + protected final Lucene80DocValuesFormat.Mode dvMode; private Mode(Lucene87StoredFieldsFormat.Mode storedMode, Lucene80DocValuesFormat.Mode dvMode) { this.storedMode = Objects.requireNonNull(storedMode); @@ -101,14 +102,23 @@ public class Lucene87Codec extends Codec { /** Instantiates a new codec. */ public Lucene87Codec() { + this(Mode.BEST_COMPRESSION); + } + + /** + * Instantiates a new codec, specifying the compression mode to use. + * + * @param mode compression mode to use for newly flushed/merged segments. 
+ */ + public Lucene87Codec(Mode mode) { super("Lucene87"); - this.storedFieldsFormat = new Lucene87StoredFieldsFormat(); + this.storedFieldsFormat = new Lucene87StoredFieldsFormat(mode.storedMode); this.defaultFormat = new Lucene84PostingsFormat(); - this.defaultDVFormat = new Lucene80DocValuesFormat(); + this.defaultDVFormat = new Lucene80DocValuesFormat(mode.dvMode); } @Override - public final StoredFieldsFormat storedFieldsFormat() { + public StoredFieldsFormat storedFieldsFormat() { return storedFieldsFormat; } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene87/Lucene87StoredFieldsFormat.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene87/Lucene87StoredFieldsFormat.java similarity index 89% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene87/Lucene87StoredFieldsFormat.java rename to lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene87/Lucene87StoredFieldsFormat.java index bca468be3c0..bf4d634e2b5 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene87/Lucene87StoredFieldsFormat.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene87/Lucene87StoredFieldsFormat.java @@ -14,14 +14,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.codecs.lucene87; +package org.apache.lucene.backward_codecs.lucene87; import java.io.IOException; import java.util.Objects; +import org.apache.lucene.backward_codecs.lucene50.compressing.Lucene50CompressingStoredFieldsFormat; import org.apache.lucene.codecs.StoredFieldsFormat; import org.apache.lucene.codecs.StoredFieldsReader; import org.apache.lucene.codecs.StoredFieldsWriter; -import org.apache.lucene.codecs.compressing.CompressingStoredFieldsFormat; import org.apache.lucene.codecs.compressing.CompressionMode; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.SegmentInfo; @@ -139,28 +139,16 @@ public class Lucene87StoredFieldsFormat extends StoredFieldsFormat { @Override public StoredFieldsWriter fieldsWriter(Directory directory, SegmentInfo si, IOContext context) throws IOException { - String previous = si.putAttribute(MODE_KEY, mode.name()); - if (previous != null && previous.equals(mode.name()) == false) { - throw new IllegalStateException( - "found existing value for " - + MODE_KEY - + " for segment: " - + si.name - + "old=" - + previous - + ", new=" - + mode.name()); - } - return impl(mode).fieldsWriter(directory, si, context); + throw new UnsupportedOperationException("Old codecs may only be used for reading"); } StoredFieldsFormat impl(Mode mode) { switch (mode) { case BEST_SPEED: - return new CompressingStoredFieldsFormat( + return new Lucene50CompressingStoredFieldsFormat( "Lucene87StoredFieldsFastData", BEST_SPEED_MODE, BEST_SPEED_BLOCK_LENGTH, 1024, 10); case BEST_COMPRESSION: - return new CompressingStoredFieldsFormat( + return new Lucene50CompressingStoredFieldsFormat( "Lucene87StoredFieldsHighData", BEST_COMPRESSION_MODE, BEST_COMPRESSION_BLOCK_LENGTH, @@ -172,14 +160,16 @@ public class Lucene87StoredFieldsFormat extends StoredFieldsFormat { } // Shoot for 10 sub blocks of 48kB each. 
- private static final int BEST_COMPRESSION_BLOCK_LENGTH = 10 * 48 * 1024; + /** Block length for {@link Mode#BEST_COMPRESSION} */ + protected static final int BEST_COMPRESSION_BLOCK_LENGTH = 10 * 48 * 1024; /** Compression mode for {@link Mode#BEST_COMPRESSION} */ public static final CompressionMode BEST_COMPRESSION_MODE = new DeflateWithPresetDictCompressionMode(); // Shoot for 10 sub blocks of 60kB each. - private static final int BEST_SPEED_BLOCK_LENGTH = 10 * 60 * 1024; + /** Block length for {@link Mode#BEST_SPEED} */ + protected static final int BEST_SPEED_BLOCK_LENGTH = 10 * 60 * 1024; /** Compression mode for {@link Mode#BEST_SPEED} */ public static final CompressionMode BEST_SPEED_MODE = new LZ4WithPresetDictCompressionMode(); diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene50/Lucene50RWStoredFieldsFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene50/Lucene50RWStoredFieldsFormat.java index 36fcc816121..9f8fdae1534 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene50/Lucene50RWStoredFieldsFormat.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene50/Lucene50RWStoredFieldsFormat.java @@ -17,7 +17,10 @@ package org.apache.lucene.backward_codecs.lucene50; import java.io.IOException; +import org.apache.lucene.backward_codecs.lucene50.compressing.Lucene50RWCompressingStoredFieldsFormat; +import org.apache.lucene.codecs.StoredFieldsFormat; import org.apache.lucene.codecs.StoredFieldsWriter; +import org.apache.lucene.codecs.compressing.CompressionMode; import org.apache.lucene.index.SegmentInfo; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; @@ -52,4 +55,18 @@ public final class Lucene50RWStoredFieldsFormat extends Lucene50StoredFieldsForm } return impl(mode).fieldsWriter(directory, si, context); } + + @Override + StoredFieldsFormat impl(Mode mode) { + switch (mode) { + case BEST_SPEED: + return new Lucene50RWCompressingStoredFieldsFormat( + "Lucene50StoredFieldsFastData", CompressionMode.FAST, 1 << 14, 128, 10); + case BEST_COMPRESSION: + return new Lucene50RWCompressingStoredFieldsFormat( + "Lucene50StoredFieldsHighData", CompressionMode.HIGH_COMPRESSION, 61440, 512, 10); + default: + throw new AssertionError(); + } + } } diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene50/compressing/Lucene50CompressingStoredFieldsWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene50/compressing/Lucene50CompressingStoredFieldsWriter.java new file mode 100644 index 00000000000..bc4215998b2 --- /dev/null +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene50/compressing/Lucene50CompressingStoredFieldsWriter.java @@ -0,0 +1,736 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.backward_codecs.lucene50.compressing; + +import static org.apache.lucene.backward_codecs.lucene50.compressing.Lucene50CompressingStoredFieldsReader.BYTE_ARR; +import static org.apache.lucene.backward_codecs.lucene50.compressing.Lucene50CompressingStoredFieldsReader.FIELDS_EXTENSION; +import static org.apache.lucene.backward_codecs.lucene50.compressing.Lucene50CompressingStoredFieldsReader.INDEX_CODEC_NAME; +import static org.apache.lucene.backward_codecs.lucene50.compressing.Lucene50CompressingStoredFieldsReader.INDEX_EXTENSION; +import static org.apache.lucene.backward_codecs.lucene50.compressing.Lucene50CompressingStoredFieldsReader.META_EXTENSION; +import static org.apache.lucene.backward_codecs.lucene50.compressing.Lucene50CompressingStoredFieldsReader.NUMERIC_DOUBLE; +import static org.apache.lucene.backward_codecs.lucene50.compressing.Lucene50CompressingStoredFieldsReader.NUMERIC_FLOAT; +import static org.apache.lucene.backward_codecs.lucene50.compressing.Lucene50CompressingStoredFieldsReader.NUMERIC_INT; +import static org.apache.lucene.backward_codecs.lucene50.compressing.Lucene50CompressingStoredFieldsReader.NUMERIC_LONG; +import static org.apache.lucene.backward_codecs.lucene50.compressing.Lucene50CompressingStoredFieldsReader.STRING; +import static org.apache.lucene.backward_codecs.lucene50.compressing.Lucene50CompressingStoredFieldsReader.TYPE_BITS; +import static org.apache.lucene.backward_codecs.lucene50.compressing.Lucene50CompressingStoredFieldsReader.VERSION_CURRENT; +import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.StoredFieldsReader; +import org.apache.lucene.codecs.StoredFieldsWriter; +import org.apache.lucene.codecs.compressing.CompressionMode; +import org.apache.lucene.codecs.compressing.Compressor; +import org.apache.lucene.codecs.compressing.MatchingReaders; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.DocIDMerger; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.IndexableField; +import org.apache.lucene.index.MergeState; +import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.store.ByteBuffersDataOutput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitUtil; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.packed.PackedInts; + +/** + * {@link StoredFieldsWriter} impl for {@link Lucene50CompressingStoredFieldsFormat}. 
+ * + * @lucene.experimental + */ +public final class Lucene50CompressingStoredFieldsWriter extends StoredFieldsWriter { + + private final String segment; + private FieldsIndexWriter indexWriter; + private IndexOutput metaStream, fieldsStream; + + private Compressor compressor; + private final CompressionMode compressionMode; + private final int chunkSize; + private final int maxDocsPerChunk; + + private final ByteBuffersDataOutput bufferedDocs; + private int[] numStoredFields; // number of stored fields + private int[] endOffsets; // end offsets in bufferedDocs + private int docBase; // doc ID at the beginning of the chunk + private int numBufferedDocs; // docBase + numBufferedDocs == current doc ID + + private long numDirtyChunks; // number of incomplete compressed blocks written + private long numDirtyDocs; // cumulative number of missing docs in incomplete chunks + + /** Sole constructor. */ + Lucene50CompressingStoredFieldsWriter( + Directory directory, + SegmentInfo si, + String segmentSuffix, + IOContext context, + String formatName, + CompressionMode compressionMode, + int chunkSize, + int maxDocsPerChunk, + int blockShift) + throws IOException { + assert directory != null; + this.segment = si.name; + this.compressionMode = compressionMode; + this.compressor = compressionMode.newCompressor(); + this.chunkSize = chunkSize; + this.maxDocsPerChunk = maxDocsPerChunk; + this.docBase = 0; + this.bufferedDocs = ByteBuffersDataOutput.newResettableInstance(); + this.numStoredFields = new int[16]; + this.endOffsets = new int[16]; + this.numBufferedDocs = 0; + + boolean success = false; + try { + metaStream = + directory.createOutput( + IndexFileNames.segmentFileName(segment, segmentSuffix, META_EXTENSION), context); + CodecUtil.writeIndexHeader( + metaStream, INDEX_CODEC_NAME + "Meta", VERSION_CURRENT, si.getId(), segmentSuffix); + assert CodecUtil.indexHeaderLength(INDEX_CODEC_NAME + "Meta", segmentSuffix) + == metaStream.getFilePointer(); + + fieldsStream = + directory.createOutput( + IndexFileNames.segmentFileName(segment, segmentSuffix, FIELDS_EXTENSION), context); + CodecUtil.writeIndexHeader( + fieldsStream, formatName, VERSION_CURRENT, si.getId(), segmentSuffix); + assert CodecUtil.indexHeaderLength(formatName, segmentSuffix) + == fieldsStream.getFilePointer(); + + indexWriter = + new FieldsIndexWriter( + directory, + segment, + segmentSuffix, + INDEX_EXTENSION, + INDEX_CODEC_NAME, + si.getId(), + blockShift, + context); + + metaStream.writeVInt(chunkSize); + metaStream.writeVInt(PackedInts.VERSION_CURRENT); + + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(metaStream, fieldsStream, indexWriter); + } + } + } + + @Override + public void close() throws IOException { + try { + IOUtils.close(metaStream, fieldsStream, indexWriter, compressor); + } finally { + metaStream = null; + fieldsStream = null; + indexWriter = null; + compressor = null; + } + } + + private int numStoredFieldsInDoc; + + @Override + public void startDocument() throws IOException {} + + @Override + public void finishDocument() throws IOException { + if (numBufferedDocs == this.numStoredFields.length) { + final int newLength = ArrayUtil.oversize(numBufferedDocs + 1, 4); + this.numStoredFields = ArrayUtil.growExact(this.numStoredFields, newLength); + endOffsets = ArrayUtil.growExact(endOffsets, newLength); + } + this.numStoredFields[numBufferedDocs] = numStoredFieldsInDoc; + numStoredFieldsInDoc = 0; + endOffsets[numBufferedDocs] = Math.toIntExact(bufferedDocs.size()); + 
++numBufferedDocs; + if (triggerFlush()) { + flush(); + } + } + + private static void saveInts(int[] values, int length, DataOutput out) throws IOException { + assert length > 0; + if (length == 1) { + out.writeVInt(values[0]); + } else { + boolean allEqual = true; + for (int i = 1; i < length; ++i) { + if (values[i] != values[0]) { + allEqual = false; + break; + } + } + if (allEqual) { + out.writeVInt(0); + out.writeVInt(values[0]); + } else { + long max = 0; + for (int i = 0; i < length; ++i) { + max |= values[i]; + } + final int bitsRequired = PackedInts.bitsRequired(max); + out.writeVInt(bitsRequired); + final PackedInts.Writer w = + PackedInts.getWriterNoHeader(out, PackedInts.Format.PACKED, length, bitsRequired, 1); + for (int i = 0; i < length; ++i) { + w.add(values[i]); + } + w.finish(); + } + } + } + + private void writeHeader( + int docBase, int numBufferedDocs, int[] numStoredFields, int[] lengths, boolean sliced) + throws IOException { + final int slicedBit = sliced ? 1 : 0; + + // save docBase and numBufferedDocs + fieldsStream.writeVInt(docBase); + fieldsStream.writeVInt((numBufferedDocs) << 1 | slicedBit); + + // save numStoredFields + saveInts(numStoredFields, numBufferedDocs, fieldsStream); + + // save lengths + saveInts(lengths, numBufferedDocs, fieldsStream); + } + + private boolean triggerFlush() { + return bufferedDocs.size() >= chunkSize + || // chunks of at least chunkSize bytes + numBufferedDocs >= maxDocsPerChunk; + } + + private void flush() throws IOException { + indexWriter.writeIndex(numBufferedDocs, fieldsStream.getFilePointer()); + + // transform end offsets into lengths + final int[] lengths = endOffsets; + for (int i = numBufferedDocs - 1; i > 0; --i) { + lengths[i] = endOffsets[i] - endOffsets[i - 1]; + assert lengths[i] >= 0; + } + final boolean sliced = bufferedDocs.size() >= 2 * chunkSize; + writeHeader(docBase, numBufferedDocs, numStoredFields, lengths, sliced); + + // compress stored fields to fieldsStream. + // + // TODO: do we need to slice it since we already have the slices in the buffer? Perhaps + // we should use max-block-bits restriction on the buffer itself, then we won't have to check it + // here. 
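+    // For instance (illustrative figures, derived from the logic below): with chunkSize = 16KB, a
+    // 40KB buffer is "sliced" and compressed as three LZ4 blocks of 16KB, 16KB and 8KB, so a
+    // reader that only needs the first fields of a document can stop decompressing early.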
+ byte[] content = bufferedDocs.toArrayCopy(); + bufferedDocs.reset(); + + if (sliced) { + // big chunk, slice it + for (int compressed = 0; compressed < content.length; compressed += chunkSize) { + compressor.compress( + content, compressed, Math.min(chunkSize, content.length - compressed), fieldsStream); + } + } else { + compressor.compress(content, 0, content.length, fieldsStream); + } + + // reset + docBase += numBufferedDocs; + numBufferedDocs = 0; + bufferedDocs.reset(); + } + + @Override + public void writeField(FieldInfo info, IndexableField field) throws IOException { + + ++numStoredFieldsInDoc; + + int bits = 0; + final BytesRef bytes; + final String string; + + Number number = field.numericValue(); + if (number != null) { + if (number instanceof Byte || number instanceof Short || number instanceof Integer) { + bits = NUMERIC_INT; + } else if (number instanceof Long) { + bits = NUMERIC_LONG; + } else if (number instanceof Float) { + bits = NUMERIC_FLOAT; + } else if (number instanceof Double) { + bits = NUMERIC_DOUBLE; + } else { + throw new IllegalArgumentException("cannot store numeric type " + number.getClass()); + } + string = null; + bytes = null; + } else { + bytes = field.binaryValue(); + if (bytes != null) { + bits = BYTE_ARR; + string = null; + } else { + bits = STRING; + string = field.stringValue(); + if (string == null) { + throw new IllegalArgumentException( + "field " + + field.name() + + " is stored but does not have binaryValue, stringValue nor numericValue"); + } + } + } + + final long infoAndBits = (((long) info.number) << TYPE_BITS) | bits; + bufferedDocs.writeVLong(infoAndBits); + + if (bytes != null) { + bufferedDocs.writeVInt(bytes.length); + bufferedDocs.writeBytes(bytes.bytes, bytes.offset, bytes.length); + } else if (string != null) { + bufferedDocs.writeString(string); + } else { + if (number instanceof Byte || number instanceof Short || number instanceof Integer) { + bufferedDocs.writeZInt(number.intValue()); + } else if (number instanceof Long) { + writeTLong(bufferedDocs, number.longValue()); + } else if (number instanceof Float) { + writeZFloat(bufferedDocs, number.floatValue()); + } else if (number instanceof Double) { + writeZDouble(bufferedDocs, number.doubleValue()); + } else { + throw new AssertionError("Cannot get here"); + } + } + } + + // -0 isn't compressed. + static final int NEGATIVE_ZERO_FLOAT = Float.floatToIntBits(-0f); + static final long NEGATIVE_ZERO_DOUBLE = Double.doubleToLongBits(-0d); + + // for compression of timestamps + static final long SECOND = 1000L; + static final long HOUR = 60 * 60 * SECOND; + static final long DAY = 24 * HOUR; + static final int SECOND_ENCODING = 0x40; + static final int HOUR_ENCODING = 0x80; + static final int DAY_ENCODING = 0xC0; + + /** + * Writes a float in a variable-length format. Writes between one and five bytes. Small integral + * values typically take fewer bytes. + * + *

+   * <p>ZFloat --&gt; Header, Bytes*?
+   *
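+   * <p>For instance (figures worked out from the implementation below, added as an illustration):
+   *
+   * <pre class="prettyprint">
+   *   writeZFloat(out, 1f);     // 1 byte:  0x82, small integer stored in the header
+   *   writeZFloat(out, -1f);    // 1 byte:  0x80, smallest header-encodable integer
+   *   writeZFloat(out, 3.14f);  // 4 bytes: raw bits of a positive float
+   *   writeZFloat(out, -3.14f); // 5 bytes: 0xFF marker followed by the raw float bits
+   * </pre>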
+   */
+  static void writeZFloat(DataOutput out, float f) throws IOException {
+    int intVal = (int) f;
+    final int floatBits = Float.floatToIntBits(f);
+
+    if (f == intVal && intVal >= -1 && intVal <= 0x7D && floatBits != NEGATIVE_ZERO_FLOAT) {
+      // small integer value [-1..125]: single byte
+      out.writeByte((byte) (0x80 | (1 + intVal)));
+    } else if ((floatBits >>> 31) == 0) {
+      // other positive floats: 4 bytes
+      out.writeInt(floatBits);
+    } else {
+      // other negative float: 5 bytes
+      out.writeByte((byte) 0xFF);
+      out.writeInt(floatBits);
+    }
+  }
+
+  /**
+   * Writes a double in a variable-length format. Writes between one and nine bytes. Small integral
+   * values typically take fewer bytes.
+   *
+   * <p>ZDouble --&gt; Header, Bytes*?
+   *
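+   * <p>For instance (figures worked out from the implementation below, added as an illustration):
+   *
+   * <pre class="prettyprint">
+   *   writeZDouble(out, 1d);    // 1 byte:  0x82, small integer stored in the header
+   *   writeZDouble(out, 0.5d);  // 5 bytes: 0xFE marker + float bits, since 0.5 is float-exact
+   *   writeZDouble(out, 1/3d);  // 8 bytes: raw bits of a positive double
+   *   writeZDouble(out, -1/3d); // 9 bytes: 0xFF marker followed by the raw double bits
+   * </pre>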
+ */ + static void writeZDouble(DataOutput out, double d) throws IOException { + int intVal = (int) d; + final long doubleBits = Double.doubleToLongBits(d); + + if (d == intVal && intVal >= -1 && intVal <= 0x7C && doubleBits != NEGATIVE_ZERO_DOUBLE) { + // small integer value [-1..124]: single byte + out.writeByte((byte) (0x80 | (intVal + 1))); + return; + } else if (d == (float) d) { + // d has an accurate float representation: 5 bytes + out.writeByte((byte) 0xFE); + out.writeInt(Float.floatToIntBits((float) d)); + } else if ((doubleBits >>> 63) == 0) { + // other positive doubles: 8 bytes + out.writeLong(doubleBits); + } else { + // other negative doubles: 9 bytes + out.writeByte((byte) 0xFF); + out.writeLong(doubleBits); + } + } + + /** + * Writes a long in a variable-length format. Writes between one and ten bytes. Small values or + * values representing timestamps with day, hour or second precision typically require fewer + * bytes. + * + *

+   * <p>ZLong --&gt; Header, Bytes*?
+   *
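+   * <p>For instance (figures worked out from the implementation below, added as an illustration):
+   *
+   * <pre class="prettyprint">
+   *   writeTLong(out, 86400000L); // one day in ms: DAY_ENCODING | zigZag(1) = 0xC2, one byte
+   *   writeTLong(out, 1234L);     // not second-aligned: 0x24 then writeVLong(77), two bytes
+   * </pre>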
+ */ + // T for "timestamp" + static void writeTLong(DataOutput out, long l) throws IOException { + int header; + if (l % SECOND != 0) { + header = 0; + } else if (l % DAY == 0) { + // timestamp with day precision + header = DAY_ENCODING; + l /= DAY; + } else if (l % HOUR == 0) { + // timestamp with hour precision, or day precision with a timezone + header = HOUR_ENCODING; + l /= HOUR; + } else { + // timestamp with second precision + header = SECOND_ENCODING; + l /= SECOND; + } + + final long zigZagL = BitUtil.zigZagEncode(l); + header |= (zigZagL & 0x1F); // last 5 bits + final long upperBits = zigZagL >>> 5; + if (upperBits != 0) { + header |= 0x20; + } + out.writeByte((byte) header); + if (upperBits != 0) { + out.writeVLong(upperBits); + } + } + + @Override + public void finish(FieldInfos fis, int numDocs) throws IOException { + if (numBufferedDocs > 0) { + numDirtyChunks++; // incomplete: we had to force this flush + final long expectedChunkDocs = + Math.min( + maxDocsPerChunk, (long) ((double) chunkSize / bufferedDocs.size() * numBufferedDocs)); + numDirtyDocs += expectedChunkDocs - numBufferedDocs; + flush(); + } else { + assert bufferedDocs.size() == 0; + } + if (docBase != numDocs) { + throw new RuntimeException( + "Wrote " + docBase + " docs, finish called with numDocs=" + numDocs); + } + indexWriter.finish(numDocs, fieldsStream.getFilePointer(), metaStream); + metaStream.writeVLong(numDirtyChunks); + metaStream.writeVLong(numDirtyDocs); + CodecUtil.writeFooter(metaStream); + CodecUtil.writeFooter(fieldsStream); + assert bufferedDocs.size() == 0; + } + + // bulk merge is scary: its caused corruption bugs in the past. + // we try to be extra safe with this impl, but add an escape hatch to + // have a workaround for undiscovered bugs. + static final String BULK_MERGE_ENABLED_SYSPROP = + Lucene50CompressingStoredFieldsWriter.class.getName() + ".enableBulkMerge"; + static final boolean BULK_MERGE_ENABLED; + + static { + boolean v = true; + try { + v = Boolean.parseBoolean(System.getProperty(BULK_MERGE_ENABLED_SYSPROP, "true")); + } catch (SecurityException ignored) { + } + BULK_MERGE_ENABLED = v; + } + + @Override + public int merge(MergeState mergeState) throws IOException { + int docCount = 0; + int numReaders = mergeState.maxDocs.length; + + MatchingReaders matching = new MatchingReaders(mergeState); + if (mergeState.needsIndexSort) { + /** + * If all readers are compressed and they have the same fieldinfos then we can merge the + * serialized document directly. 
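+       * (For instance, when the index is sorted by a field, doc IDs are remapped across segments,
+       * so documents are consumed in mapped-ID order through mergeState.docMaps via DocIDMerger.)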
+       */
+      List<CompressingStoredFieldsMergeSub> subs = new ArrayList<>();
+      for (int i = 0; i < mergeState.storedFieldsReaders.length; i++) {
+        if (matching.matchingReaders[i]
+            && mergeState.storedFieldsReaders[i] instanceof Lucene50CompressingStoredFieldsReader) {
+          Lucene50CompressingStoredFieldsReader storedFieldsReader =
+              (Lucene50CompressingStoredFieldsReader) mergeState.storedFieldsReaders[i];
+          storedFieldsReader.checkIntegrity();
+          subs.add(
+              new CompressingStoredFieldsMergeSub(
+                  storedFieldsReader, mergeState.docMaps[i], mergeState.maxDocs[i]));
+        } else {
+          return super.merge(mergeState);
+        }
+      }
+
+      final DocIDMerger<CompressingStoredFieldsMergeSub> docIDMerger = DocIDMerger.of(subs, true);
+      while (true) {
+        CompressingStoredFieldsMergeSub sub = docIDMerger.next();
+        if (sub == null) {
+          break;
+        }
+        assert sub.mappedDocID == docCount;
+        Lucene50CompressingStoredFieldsReader.SerializedDocument doc =
+            sub.reader.document(sub.docID);
+        startDocument();
+        bufferedDocs.copyBytes(doc.in, doc.length);
+        numStoredFieldsInDoc = doc.numStoredFields;
+        finishDocument();
+        ++docCount;
+      }
+      finish(mergeState.mergeFieldInfos, docCount);
+      return docCount;
+    }
+
+    for (int readerIndex = 0; readerIndex < numReaders; readerIndex++) {
+      MergeVisitor visitor = new MergeVisitor(mergeState, readerIndex);
+      Lucene50CompressingStoredFieldsReader matchingFieldsReader = null;
+      if (matching.matchingReaders[readerIndex]) {
+        final StoredFieldsReader fieldsReader = mergeState.storedFieldsReaders[readerIndex];
+        // we can only bulk-copy if the matching reader is also a CompressingStoredFieldsReader
+        if (fieldsReader != null && fieldsReader instanceof Lucene50CompressingStoredFieldsReader) {
+          matchingFieldsReader = (Lucene50CompressingStoredFieldsReader) fieldsReader;
+        }
+      }
+
+      final int maxDoc = mergeState.maxDocs[readerIndex];
+      final Bits liveDocs = mergeState.liveDocs[readerIndex];
+
+      // if it's some other format, or an older version of this format, or the safety switch is on:
+      if (matchingFieldsReader == null
+          || matchingFieldsReader.getVersion() != VERSION_CURRENT
+          || BULK_MERGE_ENABLED == false) {
+        // naive merge...
+        StoredFieldsReader storedFieldsReader = mergeState.storedFieldsReaders[readerIndex];
+        if (storedFieldsReader != null) {
+          storedFieldsReader.checkIntegrity();
+        }
+        for (int docID = 0; docID < maxDoc; docID++) {
+          if (liveDocs != null && liveDocs.get(docID) == false) {
+            continue;
+          }
+          startDocument();
+          storedFieldsReader.visitDocument(docID, visitor);
+          finishDocument();
+          ++docCount;
+        }
+      } else if (matchingFieldsReader.getCompressionMode() == compressionMode
+          && matchingFieldsReader.getChunkSize() == chunkSize
+          && matchingFieldsReader.getPackedIntsVersion() == PackedInts.VERSION_CURRENT
+          && liveDocs == null
+          && !tooDirty(matchingFieldsReader)) {
+        // optimized merge, raw byte copy
+        // it's not worth fine-graining this if there are deletions.
+
+        // if the format is older, it's always handled by the naive merge case above
+        assert matchingFieldsReader.getVersion() == VERSION_CURRENT;
+        matchingFieldsReader.checkIntegrity();
+
+        // flush any pending chunks
+        if (numBufferedDocs > 0) {
+          flush();
+          numDirtyChunks++; // incomplete: we had to force this flush
+        }
+
+        // iterate over each chunk. we use the stored fields index to find chunk boundaries,
+        // read the docstart + doccount from the chunk header (we write a new header, since doc
+        // numbers will change), and just copy the bytes directly.
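+        // For instance (illustrative figures, derived from the loop below): a chunk that covered
+        // docs [100..120) in the source segment keeps its compressed payload byte-for-byte; only
+        // its header is rewritten so the chunk is rebased onto this writer's current docBase.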
+ IndexInput rawDocs = matchingFieldsReader.getFieldsStream(); + FieldsIndex index = matchingFieldsReader.getIndexReader(); + rawDocs.seek(index.getStartPointer(0)); + int docID = 0; + while (docID < maxDoc) { + // read header + int base = rawDocs.readVInt(); + if (base != docID) { + throw new CorruptIndexException( + "invalid state: base=" + base + ", docID=" + docID, rawDocs); + } + int code = rawDocs.readVInt(); + + // write a new index entry and new header for this chunk. + int bufferedDocs = code >>> 1; + indexWriter.writeIndex(bufferedDocs, fieldsStream.getFilePointer()); + fieldsStream.writeVInt(docBase); // rebase + fieldsStream.writeVInt(code); + docID += bufferedDocs; + docBase += bufferedDocs; + docCount += bufferedDocs; + + if (docID > maxDoc) { + throw new CorruptIndexException( + "invalid state: base=" + base + ", count=" + bufferedDocs + ", maxDoc=" + maxDoc, + rawDocs); + } + + // copy bytes until the next chunk boundary (or end of chunk data). + // using the stored fields index for this isn't the most efficient, but fast enough + // and is a source of redundancy for detecting bad things. + final long end; + if (docID == maxDoc) { + end = matchingFieldsReader.getMaxPointer(); + } else { + end = index.getStartPointer(docID); + } + fieldsStream.copyBytes(rawDocs, end - rawDocs.getFilePointer()); + } + + if (rawDocs.getFilePointer() != matchingFieldsReader.getMaxPointer()) { + throw new CorruptIndexException( + "invalid state: pos=" + + rawDocs.getFilePointer() + + ", max=" + + matchingFieldsReader.getMaxPointer(), + rawDocs); + } + + // since we bulk merged all chunks, we inherit any dirty ones from this segment. + numDirtyChunks += matchingFieldsReader.getNumDirtyChunks(); + numDirtyDocs += matchingFieldsReader.getNumDirtyDocs(); + } else { + // optimized merge, we copy serialized (but decompressed) bytes directly + // even on simple docs (1 stored field), it seems to help by about 20% + + // if the format is older, its always handled by the naive merge case above + assert matchingFieldsReader.getVersion() == VERSION_CURRENT; + matchingFieldsReader.checkIntegrity(); + + for (int docID = 0; docID < maxDoc; docID++) { + if (liveDocs != null && liveDocs.get(docID) == false) { + continue; + } + Lucene50CompressingStoredFieldsReader.SerializedDocument doc = + matchingFieldsReader.document(docID); + startDocument(); + bufferedDocs.copyBytes(doc.in, doc.length); + numStoredFieldsInDoc = doc.numStoredFields; + finishDocument(); + ++docCount; + } + } + } + finish(mergeState.mergeFieldInfos, docCount); + return docCount; + } + + /** + * Returns true if we should recompress this reader, even though we could bulk merge compressed + * data + * + *
+   * <p>
The last chunk written for a segment is typically incomplete, so without recompressing, in + * some worst-case situations (e.g. frequent reopen with tiny flushes), over time the compression + * ratio can degrade. This is a safety switch. + */ + boolean tooDirty(Lucene50CompressingStoredFieldsReader candidate) { + // more than 1% dirty, or more than hard limit of 1024 dirty chunks + return candidate.getNumDirtyChunks() > 1024 + || candidate.getNumDirtyDocs() * 100 > candidate.getNumDocs(); + } + + private static class CompressingStoredFieldsMergeSub extends DocIDMerger.Sub { + private final Lucene50CompressingStoredFieldsReader reader; + private final int maxDoc; + int docID = -1; + + CompressingStoredFieldsMergeSub( + Lucene50CompressingStoredFieldsReader reader, MergeState.DocMap docMap, int maxDoc) { + super(docMap); + this.maxDoc = maxDoc; + this.reader = reader; + } + + @Override + public int nextDoc() { + docID++; + if (docID == maxDoc) { + return NO_MORE_DOCS; + } else { + return docID; + } + } + } + + @Override + public long ramBytesUsed() { + return bufferedDocs.ramBytesUsed() + + numStoredFields.length * Integer.BYTES + + endOffsets.length * Integer.BYTES; + } +} diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene50/compressing/Lucene50RWCompressingStoredFieldsFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene50/compressing/Lucene50RWCompressingStoredFieldsFormat.java new file mode 100644 index 00000000000..759d674c055 --- /dev/null +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene50/compressing/Lucene50RWCompressingStoredFieldsFormat.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.backward_codecs.lucene50.compressing; + +import java.io.IOException; +import org.apache.lucene.codecs.StoredFieldsWriter; +import org.apache.lucene.codecs.compressing.CompressionMode; +import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; + +/** RW impersonation of Lucene50CompressingStoredFieldsFormat. 
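+ * Extends the read-only format with a writer so that backward-compatibility tests can create
+ * indexes in the old format.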
*/ +public class Lucene50RWCompressingStoredFieldsFormat extends Lucene50CompressingStoredFieldsFormat { + + public Lucene50RWCompressingStoredFieldsFormat( + String formatName, + CompressionMode compressionMode, + int chunkSize, + int maxDocsPerChunk, + int blockShift) { + super(formatName, compressionMode, chunkSize, maxDocsPerChunk, blockShift); + } + + @Override + public StoredFieldsWriter fieldsWriter(Directory directory, SegmentInfo si, IOContext context) + throws IOException { + return new Lucene50CompressingStoredFieldsWriter( + directory, + si, + segmentSuffix, + context, + formatName, + compressionMode, + chunkSize, + maxDocsPerChunk, + blockShift); + } +} diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene87/Lucene87RWCodec.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene87/Lucene87RWCodec.java index a5af4089e1f..846c9d21416 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene87/Lucene87RWCodec.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene87/Lucene87RWCodec.java @@ -23,6 +23,7 @@ import org.apache.lucene.backward_codecs.lucene84.Lucene84RWPostingsFormat; import org.apache.lucene.codecs.CompoundFormat; import org.apache.lucene.codecs.NormsFormat; import org.apache.lucene.codecs.PostingsFormat; +import org.apache.lucene.codecs.StoredFieldsFormat; import org.apache.lucene.codecs.TermVectorsFormat; import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; @@ -37,6 +38,16 @@ public class Lucene87RWCodec extends Lucene87Codec { return defaultPF; } }; + private final Mode mode; + + public Lucene87RWCodec() { + this(Mode.BEST_SPEED); + } + + public Lucene87RWCodec(Mode mode) { + super(mode); + this.mode = mode; + } @Override public final CompoundFormat compoundFormat() { @@ -57,4 +68,9 @@ public class Lucene87RWCodec extends Lucene87Codec { public TermVectorsFormat termVectorsFormat() { return new Lucene50RWTermVectorsFormat(); } + + @Override + public StoredFieldsFormat storedFieldsFormat() { + return new Lucene87RWStoredFieldsFormat(mode.storedMode); + } } diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene87/Lucene87RWStoredFieldsFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene87/Lucene87RWStoredFieldsFormat.java new file mode 100644 index 00000000000..8144469f2b8 --- /dev/null +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene87/Lucene87RWStoredFieldsFormat.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.backward_codecs.lucene87; + +import java.io.IOException; +import org.apache.lucene.backward_codecs.lucene50.compressing.Lucene50RWCompressingStoredFieldsFormat; +import org.apache.lucene.codecs.StoredFieldsFormat; +import org.apache.lucene.codecs.StoredFieldsWriter; +import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; + +; + +/** RW impersonation of Lucene87StoredFieldsFormat. */ +public class Lucene87RWStoredFieldsFormat extends Lucene87StoredFieldsFormat { + + public Lucene87RWStoredFieldsFormat(Mode mode) { + super(mode); + } + + @Override + public StoredFieldsWriter fieldsWriter(Directory directory, SegmentInfo si, IOContext context) + throws IOException { + String previous = si.putAttribute(MODE_KEY, mode.name()); + if (previous != null && previous.equals(mode.name()) == false) { + throw new IllegalStateException( + "found existing value for " + + MODE_KEY + + " for segment: " + + si.name + + "old=" + + previous + + ", new=" + + mode.name()); + } + return impl(mode).fieldsWriter(directory, si, context); + } + + @Override + StoredFieldsFormat impl(Mode mode) { + switch (mode) { + case BEST_SPEED: + return new Lucene50RWCompressingStoredFieldsFormat( + "Lucene87StoredFieldsFastData", BEST_SPEED_MODE, BEST_SPEED_BLOCK_LENGTH, 1024, 10); + case BEST_COMPRESSION: + return new Lucene50RWCompressingStoredFieldsFormat( + "Lucene87StoredFieldsHighData", + BEST_COMPRESSION_MODE, + BEST_COMPRESSION_BLOCK_LENGTH, + 4096, + 10); + default: + throw new AssertionError(); + } + } +} diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene87/TestLucene87StoredFieldsFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene87/TestLucene87StoredFieldsFormat.java similarity index 89% rename from lucene/core/src/test/org/apache/lucene/codecs/lucene87/TestLucene87StoredFieldsFormat.java rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene87/TestLucene87StoredFieldsFormat.java index 5604d412a82..469d1547339 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene87/TestLucene87StoredFieldsFormat.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene87/TestLucene87StoredFieldsFormat.java @@ -14,15 +14,14 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.lucene.codecs.lucene87; +package org.apache.lucene.backward_codecs.lucene87; import org.apache.lucene.codecs.Codec; import org.apache.lucene.index.BaseStoredFieldsFormatTestCase; -import org.apache.lucene.util.TestUtil; public class TestLucene87StoredFieldsFormat extends BaseStoredFieldsFormatTestCase { @Override protected Codec getCodec() { - return TestUtil.getDefaultCodec(); + return new Lucene87RWCodec(); } } diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene87/TestLucene87StoredFieldsFormatHighCompression.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene87/TestLucene87StoredFieldsFormatHighCompression.java similarity index 89% rename from lucene/core/src/test/org/apache/lucene/codecs/lucene87/TestLucene87StoredFieldsFormatHighCompression.java rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene87/TestLucene87StoredFieldsFormatHighCompression.java index c2df5277ebb..68023138245 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene87/TestLucene87StoredFieldsFormatHighCompression.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene87/TestLucene87StoredFieldsFormatHighCompression.java @@ -14,12 +14,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.codecs.lucene87; +package org.apache.lucene.backward_codecs.lucene87; import com.carrotsearch.randomizedtesting.generators.RandomPicks; import org.apache.lucene.codecs.Codec; -import org.apache.lucene.codecs.lucene90.Lucene90Codec; -import org.apache.lucene.codecs.lucene90.Lucene90Codec.Mode; import org.apache.lucene.document.Document; import org.apache.lucene.document.StoredField; import org.apache.lucene.index.BaseStoredFieldsFormatTestCase; @@ -31,7 +29,7 @@ import org.apache.lucene.store.Directory; public class TestLucene87StoredFieldsFormatHighCompression extends BaseStoredFieldsFormatTestCase { @Override protected Codec getCodec() { - return new Lucene90Codec(Mode.BEST_COMPRESSION); + return new Lucene87RWCodec(Lucene87Codec.Mode.BEST_COMPRESSION); } /** @@ -41,7 +39,8 @@ public class TestLucene87StoredFieldsFormatHighCompression extends BaseStoredFie Directory dir = newDirectory(); for (int i = 0; i < 10; i++) { IndexWriterConfig iwc = newIndexWriterConfig(); - iwc.setCodec(new Lucene90Codec(RandomPicks.randomFrom(random(), Mode.values()))); + iwc.setCodec( + new Lucene87RWCodec(RandomPicks.randomFrom(random(), Lucene87Codec.Mode.values()))); IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig()); Document doc = new Document(); doc.add(new StoredField("field1", "value1")); @@ -70,7 +69,7 @@ public class TestLucene87StoredFieldsFormatHighCompression extends BaseStoredFie expectThrows( NullPointerException.class, () -> { - new Lucene90Codec(null); + new Lucene87RWCodec(null); }); expectThrows( diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene87/TestLucene87StoredFieldsFormatMergeInstance.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene87/TestLucene87StoredFieldsFormatMergeInstance.java similarity index 89% rename from lucene/core/src/test/org/apache/lucene/codecs/lucene87/TestLucene87StoredFieldsFormatMergeInstance.java rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene87/TestLucene87StoredFieldsFormatMergeInstance.java index e28f228c68b..282f2278bfd 100644 --- 
a/lucene/core/src/test/org/apache/lucene/codecs/lucene87/TestLucene87StoredFieldsFormatMergeInstance.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene87/TestLucene87StoredFieldsFormatMergeInstance.java @@ -14,9 +14,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.codecs.lucene87; +package org.apache.lucene.backward_codecs.lucene87; -/** Test the merge instance of the Lucene50 stored fields format. */ +/** Test the merge instance of the Lucene87 stored fields format. */ public class TestLucene87StoredFieldsFormatMergeInstance extends TestLucene87StoredFieldsFormat { @Override diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/LegacyFieldsIndexReader.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/LegacyFieldsIndexReader.java deleted file mode 100644 index 8cc2fb16d13..00000000000 --- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/LegacyFieldsIndexReader.java +++ /dev/null @@ -1,238 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.codecs.compressing; - -import static org.apache.lucene.util.BitUtil.zigZagDecode; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.List; -import org.apache.lucene.index.CorruptIndexException; -import org.apache.lucene.index.SegmentInfo; -import org.apache.lucene.store.IndexInput; -import org.apache.lucene.util.Accountable; -import org.apache.lucene.util.Accountables; -import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.RamUsageEstimator; -import org.apache.lucene.util.packed.PackedInts; - -/** - * Random-access reader for {@link FieldsIndexWriter}. 
- * - * @lucene.internal - */ -final class LegacyFieldsIndexReader extends FieldsIndex { - - private static final long BASE_RAM_BYTES_USED = - RamUsageEstimator.shallowSizeOfInstance(LegacyFieldsIndexReader.class); - - final int maxDoc; - final int[] docBases; - final long[] startPointers; - final int[] avgChunkDocs; - final long[] avgChunkSizes; - final PackedInts.Reader[] docBasesDeltas; // delta from the avg - final PackedInts.Reader[] startPointersDeltas; // delta from the avg - - // It is the responsibility of the caller to close fieldsIndexIn after this constructor - // has been called - LegacyFieldsIndexReader(IndexInput fieldsIndexIn, SegmentInfo si) throws IOException { - maxDoc = si.maxDoc(); - int[] docBases = new int[16]; - long[] startPointers = new long[16]; - int[] avgChunkDocs = new int[16]; - long[] avgChunkSizes = new long[16]; - PackedInts.Reader[] docBasesDeltas = new PackedInts.Reader[16]; - PackedInts.Reader[] startPointersDeltas = new PackedInts.Reader[16]; - - final int packedIntsVersion = fieldsIndexIn.readVInt(); - - int blockCount = 0; - - for (; ; ) { - final int numChunks = fieldsIndexIn.readVInt(); - if (numChunks == 0) { - break; - } - if (blockCount == docBases.length) { - final int newSize = ArrayUtil.oversize(blockCount + 1, 8); - docBases = ArrayUtil.growExact(docBases, newSize); - startPointers = ArrayUtil.growExact(startPointers, newSize); - avgChunkDocs = ArrayUtil.growExact(avgChunkDocs, newSize); - avgChunkSizes = ArrayUtil.growExact(avgChunkSizes, newSize); - docBasesDeltas = ArrayUtil.growExact(docBasesDeltas, newSize); - startPointersDeltas = ArrayUtil.growExact(startPointersDeltas, newSize); - } - - // doc bases - docBases[blockCount] = fieldsIndexIn.readVInt(); - avgChunkDocs[blockCount] = fieldsIndexIn.readVInt(); - final int bitsPerDocBase = fieldsIndexIn.readVInt(); - if (bitsPerDocBase > 32) { - throw new CorruptIndexException( - "Corrupted bitsPerDocBase: " + bitsPerDocBase, fieldsIndexIn); - } - docBasesDeltas[blockCount] = - PackedInts.getReaderNoHeader( - fieldsIndexIn, - PackedInts.Format.PACKED, - packedIntsVersion, - numChunks, - bitsPerDocBase); - - // start pointers - startPointers[blockCount] = fieldsIndexIn.readVLong(); - avgChunkSizes[blockCount] = fieldsIndexIn.readVLong(); - final int bitsPerStartPointer = fieldsIndexIn.readVInt(); - if (bitsPerStartPointer > 64) { - throw new CorruptIndexException( - "Corrupted bitsPerStartPointer: " + bitsPerStartPointer, fieldsIndexIn); - } - startPointersDeltas[blockCount] = - PackedInts.getReaderNoHeader( - fieldsIndexIn, - PackedInts.Format.PACKED, - packedIntsVersion, - numChunks, - bitsPerStartPointer); - - ++blockCount; - } - - this.docBases = ArrayUtil.copyOfSubArray(docBases, 0, blockCount); - this.startPointers = ArrayUtil.copyOfSubArray(startPointers, 0, blockCount); - this.avgChunkDocs = ArrayUtil.copyOfSubArray(avgChunkDocs, 0, blockCount); - this.avgChunkSizes = ArrayUtil.copyOfSubArray(avgChunkSizes, 0, blockCount); - this.docBasesDeltas = ArrayUtil.copyOfSubArray(docBasesDeltas, 0, blockCount); - this.startPointersDeltas = ArrayUtil.copyOfSubArray(startPointersDeltas, 0, blockCount); - } - - private int block(int docID) { - int lo = 0, hi = docBases.length - 1; - while (lo <= hi) { - final int mid = (lo + hi) >>> 1; - final int midValue = docBases[mid]; - if (midValue == docID) { - return mid; - } else if (midValue < docID) { - lo = mid + 1; - } else { - hi = mid - 1; - } - } - return hi; - } - - private int relativeDocBase(int block, int relativeChunk) { - final int expected 
= avgChunkDocs[block] * relativeChunk; - final long delta = zigZagDecode(docBasesDeltas[block].get(relativeChunk)); - return expected + (int) delta; - } - - private long relativeStartPointer(int block, int relativeChunk) { - final long expected = avgChunkSizes[block] * relativeChunk; - final long delta = zigZagDecode(startPointersDeltas[block].get(relativeChunk)); - return expected + delta; - } - - private int relativeChunk(int block, int relativeDoc) { - int lo = 0, hi = docBasesDeltas[block].size() - 1; - while (lo <= hi) { - final int mid = (lo + hi) >>> 1; - final int midValue = relativeDocBase(block, mid); - if (midValue == relativeDoc) { - return mid; - } else if (midValue < relativeDoc) { - lo = mid + 1; - } else { - hi = mid - 1; - } - } - return hi; - } - - long getStartPointer(int docID) { - if (docID < 0 || docID >= maxDoc) { - throw new IllegalArgumentException("docID out of range [0-" + maxDoc + "]: " + docID); - } - final int block = block(docID); - final int relativeChunk = relativeChunk(block, docID - docBases[block]); - return startPointers[block] + relativeStartPointer(block, relativeChunk); - } - - @Override - public LegacyFieldsIndexReader clone() { - return this; - } - - @Override - public long ramBytesUsed() { - long res = BASE_RAM_BYTES_USED; - - res += RamUsageEstimator.shallowSizeOf(docBasesDeltas); - for (PackedInts.Reader r : docBasesDeltas) { - res += r.ramBytesUsed(); - } - res += RamUsageEstimator.shallowSizeOf(startPointersDeltas); - for (PackedInts.Reader r : startPointersDeltas) { - res += r.ramBytesUsed(); - } - - res += RamUsageEstimator.sizeOf(docBases); - res += RamUsageEstimator.sizeOf(startPointers); - res += RamUsageEstimator.sizeOf(avgChunkDocs); - res += RamUsageEstimator.sizeOf(avgChunkSizes); - - return res; - } - - @Override - public Collection getChildResources() { - List resources = new ArrayList<>(); - - long docBaseDeltaBytes = RamUsageEstimator.shallowSizeOf(docBasesDeltas); - for (PackedInts.Reader r : docBasesDeltas) { - docBaseDeltaBytes += r.ramBytesUsed(); - } - resources.add(Accountables.namedAccountable("doc base deltas", docBaseDeltaBytes)); - - long startPointerDeltaBytes = RamUsageEstimator.shallowSizeOf(startPointersDeltas); - for (PackedInts.Reader r : startPointersDeltas) { - startPointerDeltaBytes += r.ramBytesUsed(); - } - resources.add(Accountables.namedAccountable("start pointer deltas", startPointerDeltaBytes)); - - return Collections.unmodifiableList(resources); - } - - @Override - public String toString() { - return getClass().getSimpleName() + "(blocks=" + docBases.length + ")"; - } - - @Override - public void close() throws IOException { - // nothing to do - } - - @Override - void checkIntegrity() throws IOException { - // nothing to do, the index is checked at open time - } -} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/package-info.java index d63b0be6cb8..d59a81ba96b 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/package-info.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/package-info.java @@ -15,5 +15,5 @@ * limitations under the License. */ -/** StoredFieldsFormat that allows cross-document and cross-field compression of stored fields. */ +/** Compressing helper classes. 
*/ package org.apache.lucene.codecs.compressing; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene87/BugfixDeflater_JDK8252739.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/BugfixDeflater_JDK8252739.java similarity index 98% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene87/BugfixDeflater_JDK8252739.java rename to lucene/core/src/java/org/apache/lucene/codecs/lucene90/BugfixDeflater_JDK8252739.java index 8aa97925694..474ffb25a96 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene87/BugfixDeflater_JDK8252739.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/BugfixDeflater_JDK8252739.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.codecs.lucene87; +package org.apache.lucene.codecs.lucene90; import java.util.Arrays; import java.util.zip.DataFormatException; @@ -26,6 +26,8 @@ import org.apache.lucene.util.SuppressForbidden; /** * This class is a workaround for JDK bug JDK-8252739. + * + * @lucene.internal */ @FunctionalInterface interface BugfixDeflater_JDK8252739 { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene87/DeflateWithPresetDictCompressionMode.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/DeflateWithPresetDictCompressionMode.java similarity index 99% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene87/DeflateWithPresetDictCompressionMode.java rename to lucene/core/src/java/org/apache/lucene/codecs/lucene90/DeflateWithPresetDictCompressionMode.java index 0f5b2b02989..acd9a10df70 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene87/DeflateWithPresetDictCompressionMode.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/DeflateWithPresetDictCompressionMode.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.codecs.lucene87; +package org.apache.lucene.codecs.lucene90; import java.io.IOException; import java.util.zip.DataFormatException; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene87/LZ4WithPresetDictCompressionMode.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/LZ4WithPresetDictCompressionMode.java similarity index 99% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene87/LZ4WithPresetDictCompressionMode.java rename to lucene/core/src/java/org/apache/lucene/codecs/lucene90/LZ4WithPresetDictCompressionMode.java index e24d327b5bb..4506514f13a 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene87/LZ4WithPresetDictCompressionMode.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/LZ4WithPresetDictCompressionMode.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.lucene.codecs.lucene87; +package org.apache.lucene.codecs.lucene90; import java.io.IOException; import org.apache.lucene.codecs.compressing.CompressionMode; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90Codec.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90Codec.java index 29547a21afe..c2491282178 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90Codec.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90Codec.java @@ -32,7 +32,6 @@ import org.apache.lucene.codecs.TermVectorsFormat; import org.apache.lucene.codecs.VectorFormat; import org.apache.lucene.codecs.lucene86.Lucene86PointsFormat; import org.apache.lucene.codecs.lucene86.Lucene86SegmentInfoFormat; -import org.apache.lucene.codecs.lucene87.Lucene87StoredFieldsFormat; import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat; import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; @@ -49,16 +48,16 @@ public class Lucene90Codec extends Codec { /** Configuration option for the codec. */ public static enum Mode { /** Trade compression ratio for retrieval speed. */ - BEST_SPEED(Lucene87StoredFieldsFormat.Mode.BEST_SPEED, Lucene90DocValuesFormat.Mode.BEST_SPEED), + BEST_SPEED(Lucene90StoredFieldsFormat.Mode.BEST_SPEED, Lucene90DocValuesFormat.Mode.BEST_SPEED), /** Trade retrieval speed for compression ratio. */ BEST_COMPRESSION( - Lucene87StoredFieldsFormat.Mode.BEST_COMPRESSION, + Lucene90StoredFieldsFormat.Mode.BEST_COMPRESSION, Lucene90DocValuesFormat.Mode.BEST_COMPRESSION); - private final Lucene87StoredFieldsFormat.Mode storedMode; + private final Lucene90StoredFieldsFormat.Mode storedMode; private final Lucene90DocValuesFormat.Mode dvMode; - private Mode(Lucene87StoredFieldsFormat.Mode storedMode, Lucene90DocValuesFormat.Mode dvMode) { + private Mode(Lucene90StoredFieldsFormat.Mode storedMode, Lucene90DocValuesFormat.Mode dvMode) { this.storedMode = Objects.requireNonNull(storedMode); this.dvMode = Objects.requireNonNull(dvMode); } @@ -104,7 +103,7 @@ public class Lucene90Codec extends Codec { public Lucene90Codec(Mode mode) { super("Lucene90"); this.storedFieldsFormat = - new Lucene87StoredFieldsFormat(Objects.requireNonNull(mode).storedMode); + new Lucene90StoredFieldsFormat(Objects.requireNonNull(mode).storedMode); this.defaultFormat = new Lucene90PostingsFormat(); this.defaultDVFormat = new Lucene90DocValuesFormat(mode.dvMode); } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90StoredFieldsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90StoredFieldsFormat.java new file mode 100644 index 00000000000..104223e6f2d --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90StoredFieldsFormat.java @@ -0,0 +1,186 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.lucene90;
+
+import java.io.IOException;
+import java.util.Objects;
+import org.apache.lucene.codecs.StoredFieldsFormat;
+import org.apache.lucene.codecs.StoredFieldsReader;
+import org.apache.lucene.codecs.StoredFieldsWriter;
+import org.apache.lucene.codecs.compressing.CompressionMode;
+import org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingStoredFieldsFormat;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.SegmentInfo;
+import org.apache.lucene.index.StoredFieldVisitor;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.util.packed.DirectMonotonicWriter;
+
+/**
+ * Lucene 9.0 stored fields format.
+ *
+ * <p><b>Principle</b>
+ *
+ * <p>This {@link StoredFieldsFormat} compresses blocks of documents in order to improve the
+ * compression ratio compared to document-level compression. It uses the LZ4 compression algorithm
+ * by default in 16KB blocks, which is fast to compress and very fast to decompress data. Although
+ * the default compression method that is used ({@link Mode#BEST_SPEED BEST_SPEED}) focuses more on
+ * speed than on compression ratio, it should provide interesting compression ratios for redundant
+ * inputs (such as log files, HTML or plain text). For higher compression, you can choose ({@link
+ * Mode#BEST_COMPRESSION BEST_COMPRESSION}), which uses the DEFLATE algorithm with 48kB blocks and
+ * shared dictionaries for a better ratio at the expense of slower performance. These two options
+ * can be configured like this:
+ *
+ * <pre class="prettyprint">
+ *   // the default: for high performance
+ *   indexWriterConfig.setCodec(new Lucene90Codec(Mode.BEST_SPEED));
+ *   // instead for higher compression (but slower):
+ *   // indexWriterConfig.setCodec(new Lucene90Codec(Mode.BEST_COMPRESSION));
+ * </pre>
+ *
+ * <p><b>File formats</b>
+ *
+ * <p>Stored fields are represented by three files:
+ *
+ * <ol>
+ *   <li><p>A fields data file (extension <code>.fdt</code>). This file stores a compact
+ *       representation of documents in compressed blocks of 16KB or more. When writing a segment,
+ *       documents are appended to an in-memory <code>byte[]</code> buffer. When its size reaches
+ *       16KB or more, some metadata about the documents is flushed to disk, immediately followed
+ *       by a compressed representation of the buffer using the LZ4 compression format.
+ *       <p>Notes
+ *       <ul>
+ *         <li>When at least one document in a chunk is large enough so that the chunk is larger
+ *             than 32KB, the chunk will actually be compressed in several LZ4 blocks of 16KB. This
+ *             allows {@link StoredFieldVisitor}s which are only interested in the first fields of
+ *             a document to not have to decompress 10MB of data if the document is 10MB, but only
+ *             16KB.
+ *         <li>Given that the original lengths are written in the metadata of the chunk, the
+ *             decompressor can leverage this information to stop decoding as soon as enough data
+ *             has been decompressed.
+ *         <li>In case documents are incompressible, the overhead of the compression format is
+ *             less than 0.5%.
+ *       </ul>
+ *   <li><p>A fields index file (extension <code>.fdx</code>). This file stores two {@link
+ *       DirectMonotonicWriter monotonic arrays}, one for the first doc IDs of each block of
+ *       compressed documents, and another one for the corresponding offsets on disk. At search
+ *       time, the array containing doc IDs is binary-searched in order to find the block that
+ *       contains the expected doc ID, and the associated offset on disk is retrieved from the
+ *       second array.
+ *   <li><p>A fields meta file (extension <code>.fdm</code>). This file stores metadata about the
+ *       monotonic arrays stored in the index file.
+ * </ol>
+ *
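+ * <p>As an illustration of the index lookup (a simplified sketch in terms of plain arrays, not
+ * the actual implementation):
+ *
+ * <pre class="prettyprint">
+ *   // docIds[] holds the first doc ID of each chunk, offsets[] the chunk start pointers
+ *   int block = Arrays.binarySearch(docIds, docID);
+ *   if (block &lt; 0) {
+ *     block = -block - 2; // last chunk whose first doc ID is &lt;= docID
+ *   }
+ *   long startPointer = offsets[block]; // where the compressed chunk starts in the .fdt file
+ * </pre>
+ *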
+ * <p><b>Known limitations</b>
+ *
+ * <p>This {@link StoredFieldsFormat} does not support individual documents larger than
+ * (<code>2<sup>31</sup> - 2<sup>14</sup></code>) bytes.
+ *
+ * @lucene.experimental
+ */
+public class Lucene90StoredFieldsFormat extends StoredFieldsFormat {
+
+  /** Configuration option for stored fields. */
+  public static enum Mode {
+    /** Trade compression ratio for retrieval speed. */
+    BEST_SPEED,
+    /** Trade retrieval speed for compression ratio. */
+    BEST_COMPRESSION
+  }
+
+  /** Attribute key for compression mode. */
+  public static final String MODE_KEY = Lucene90StoredFieldsFormat.class.getSimpleName() + ".mode";
+
+  final Mode mode;
+
+  /** Stored fields format with default options */
+  public Lucene90StoredFieldsFormat() {
+    this(Mode.BEST_SPEED);
+  }
+
+  /** Stored fields format with specified mode */
+  public Lucene90StoredFieldsFormat(Mode mode) {
+    this.mode = Objects.requireNonNull(mode);
+  }
+
+  @Override
+  public StoredFieldsReader fieldsReader(
+      Directory directory, SegmentInfo si, FieldInfos fn, IOContext context) throws IOException {
+    String value = si.getAttribute(MODE_KEY);
+    if (value == null) {
+      throw new IllegalStateException("missing value for " + MODE_KEY + " for segment: " + si.name);
+    }
+    Mode mode = Mode.valueOf(value);
+    return impl(mode).fieldsReader(directory, si, fn, context);
+  }
+
+  @Override
+  public StoredFieldsWriter fieldsWriter(Directory directory, SegmentInfo si, IOContext context)
+      throws IOException {
+    String previous = si.putAttribute(MODE_KEY, mode.name());
+    if (previous != null && previous.equals(mode.name()) == false) {
+      throw new IllegalStateException(
+          "found existing value for "
+              + MODE_KEY
+              + " for segment: "
+              + si.name
+              + ", old="
+              + previous
+              + ", new="
+              + mode.name());
+    }
+    return impl(mode).fieldsWriter(directory, si, context);
+  }
+
+  StoredFieldsFormat impl(Mode mode) {
+    switch (mode) {
+      case BEST_SPEED:
+        return new Lucene90CompressingStoredFieldsFormat(
+            "Lucene90StoredFieldsFastData", BEST_SPEED_MODE, BEST_SPEED_BLOCK_LENGTH, 1024, 10);
+      case BEST_COMPRESSION:
+        return new Lucene90CompressingStoredFieldsFormat(
+            "Lucene90StoredFieldsHighData",
+            BEST_COMPRESSION_MODE,
+            BEST_COMPRESSION_BLOCK_LENGTH,
+            4096,
+            10);
+      default:
+        throw new AssertionError();
+    }
+  }
+
+  // Shoot for 10 sub blocks of 48kB each.
+  private static final int BEST_COMPRESSION_BLOCK_LENGTH = 10 * 48 * 1024;
+
+  /** Compression mode for {@link Mode#BEST_COMPRESSION} */
+  public static final CompressionMode BEST_COMPRESSION_MODE =
+      new DeflateWithPresetDictCompressionMode();
+
+  // Shoot for 10 sub blocks of 60kB each.
+ private static final int BEST_SPEED_BLOCK_LENGTH = 10 * 60 * 1024; + + /** Compression mode for {@link Mode#BEST_SPEED} */ + public static final CompressionMode BEST_SPEED_MODE = new LZ4WithPresetDictCompressionMode(); +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90TermVectorsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90TermVectorsFormat.java index 8c894edcffe..80a2bb0e882 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90TermVectorsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90TermVectorsFormat.java @@ -19,9 +19,8 @@ package org.apache.lucene.codecs.lucene90; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.TermVectorsFormat; import org.apache.lucene.codecs.compressing.CompressionMode; -import org.apache.lucene.codecs.compressing.FieldsIndexWriter; -import org.apache.lucene.codecs.compressing.Lucene90CompressingTermVectorsFormat; -import org.apache.lucene.codecs.lucene87.Lucene87StoredFieldsFormat; +import org.apache.lucene.codecs.lucene90.compressing.FieldsIndexWriter; +import org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingTermVectorsFormat; import org.apache.lucene.store.DataOutput; import org.apache.lucene.util.packed.BlockPackedWriter; import org.apache.lucene.util.packed.PackedInts; @@ -29,7 +28,7 @@ import org.apache.lucene.util.packed.PackedInts; /** * Lucene 9.0 {@link TermVectorsFormat term vectors format}. * - *

<p>Very similarly to {@link Lucene87StoredFieldsFormat}, this format is based on compressed
+ * <p>
Very similarly to {@link Lucene90StoredFieldsFormat}, this format is based on compressed * chunks of data, with document-level granularity so that a document can never span across distinct * chunks. Moreover, data is made as compact as possible: * diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/FieldsIndex.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/FieldsIndex.java similarity index 95% rename from lucene/core/src/java/org/apache/lucene/codecs/compressing/FieldsIndex.java rename to lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/FieldsIndex.java index c4da1b08ac9..8fbacec2923 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/FieldsIndex.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/FieldsIndex.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.codecs.compressing; +package org.apache.lucene.codecs.lucene90.compressing; import java.io.Closeable; import java.io.IOException; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/FieldsIndexReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/FieldsIndexReader.java similarity index 95% rename from lucene/core/src/java/org/apache/lucene/codecs/compressing/FieldsIndexReader.java rename to lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/FieldsIndexReader.java index 463c976aea3..f4469648cf7 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/FieldsIndexReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/FieldsIndexReader.java @@ -14,10 +14,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.codecs.compressing; +package org.apache.lucene.codecs.lucene90.compressing; -import static org.apache.lucene.codecs.compressing.FieldsIndexWriter.VERSION_CURRENT; -import static org.apache.lucene.codecs.compressing.FieldsIndexWriter.VERSION_START; +import static org.apache.lucene.codecs.lucene90.compressing.FieldsIndexWriter.VERSION_CURRENT; +import static org.apache.lucene.codecs.lucene90.compressing.FieldsIndexWriter.VERSION_START; import java.io.IOException; import java.io.UncheckedIOException; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/FieldsIndexWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/FieldsIndexWriter.java similarity index 99% rename from lucene/core/src/java/org/apache/lucene/codecs/compressing/FieldsIndexWriter.java rename to lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/FieldsIndexWriter.java index d768fa3779a..b294a9958d5 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/FieldsIndexWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/FieldsIndexWriter.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.lucene.codecs.compressing; +package org.apache.lucene.codecs.lucene90.compressing; import java.io.Closeable; import java.io.IOException; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingStoredFieldsFormat.java similarity index 87% rename from lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsFormat.java rename to lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingStoredFieldsFormat.java index bc635044cf9..3fbdbe5e540 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingStoredFieldsFormat.java @@ -14,13 +14,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.codecs.compressing; +package org.apache.lucene.codecs.lucene90.compressing; import java.io.IOException; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.StoredFieldsFormat; import org.apache.lucene.codecs.StoredFieldsReader; import org.apache.lucene.codecs.StoredFieldsWriter; +import org.apache.lucene.codecs.compressing.CompressionMode; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.MergePolicy; import org.apache.lucene.index.SegmentInfo; @@ -40,7 +41,7 @@ import org.apache.lucene.util.packed.DirectMonotonicWriter; * * @lucene.experimental */ -public class CompressingStoredFieldsFormat extends StoredFieldsFormat { +public class Lucene90CompressingStoredFieldsFormat extends StoredFieldsFormat { private final String formatName; private final String segmentSuffix; @@ -50,12 +51,12 @@ public class CompressingStoredFieldsFormat extends StoredFieldsFormat { private final int blockShift; /** - * Create a new {@link CompressingStoredFieldsFormat} with an empty segment suffix. + * Create a new {@link Lucene90CompressingStoredFieldsFormat} with an empty segment suffix. * - * @see CompressingStoredFieldsFormat#CompressingStoredFieldsFormat(String, String, - * CompressionMode, int, int, int) + * @see Lucene90CompressingStoredFieldsFormat#Lucene90CompressingStoredFieldsFormat(String, + * String, CompressionMode, int, int, int) */ - public CompressingStoredFieldsFormat( + public Lucene90CompressingStoredFieldsFormat( String formatName, CompressionMode compressionMode, int chunkSize, @@ -65,7 +66,7 @@ public class CompressingStoredFieldsFormat extends StoredFieldsFormat { } /** - * Create a new {@link CompressingStoredFieldsFormat}. + * Create a new {@link Lucene90CompressingStoredFieldsFormat}. * *

formatName is the name of the format. This name will be used in the file * formats to perform {@link CodecUtil#checkIndexHeader codec header checks}. @@ -76,7 +77,8 @@ public class CompressingStoredFieldsFormat extends StoredFieldsFormat { *

The compressionMode parameter allows you to choose between compression * algorithms that have various compression and decompression speeds so that you can pick the one * that best fits your indexing and searching throughput. You should never instantiate two {@link - * CompressingStoredFieldsFormat}s that have the same name but different {@link CompressionMode}s. + * Lucene90CompressingStoredFieldsFormat}s that have the same name but different {@link + * CompressionMode}s. * *

chunkSize is the minimum byte size of a chunk of documents. A value of 1 * can make sense if there is redundancy across fields. maxDocsPerChunk is an @@ -94,7 +96,7 @@ public class CompressingStoredFieldsFormat extends StoredFieldsFormat { * @param blockShift the log in base 2 of number of chunks to store in an index block * @see CompressionMode */ - public CompressingStoredFieldsFormat( + public Lucene90CompressingStoredFieldsFormat( String formatName, String segmentSuffix, CompressionMode compressionMode, @@ -128,14 +130,14 @@ public class CompressingStoredFieldsFormat extends StoredFieldsFormat { @Override public StoredFieldsReader fieldsReader( Directory directory, SegmentInfo si, FieldInfos fn, IOContext context) throws IOException { - return new CompressingStoredFieldsReader( + return new Lucene90CompressingStoredFieldsReader( directory, si, segmentSuffix, fn, context, formatName, compressionMode); } @Override public StoredFieldsWriter fieldsWriter(Directory directory, SegmentInfo si, IOContext context) throws IOException { - return new CompressingStoredFieldsWriter( + return new Lucene90CompressingStoredFieldsWriter( directory, si, segmentSuffix, diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingStoredFieldsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingStoredFieldsReader.java new file mode 100644 index 00000000000..fb71596086d --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingStoredFieldsReader.java @@ -0,0 +1,756 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
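For orientation before the new reader below: the five constructor arguments documented above compose as in the following sketch. This is an illustration only; the ExampleFormatHolder class, the "ExampleFormat" name, and the parameter values are hypothetical, whereas the in-tree usage further down (SortingStoredFieldsConsumer) passes "TempStoredFields", NO_COMPRESSION, 128 * 1024, 1, 10.

import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.compressing.CompressionMode;
import org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingStoredFieldsFormat;

// Hypothetical holder class, for illustration only.
class ExampleFormatHolder {
  // "ExampleFormat" is written into the data file's codec header (see the
  // checkIndexHeader note above); 64 KB minimum chunk size, at most 512 docs
  // per chunk, and 2^10 chunks per index block.
  static final StoredFieldsFormat EXAMPLE_FORMAT =
      new Lucene90CompressingStoredFieldsFormat(
          "ExampleFormat", CompressionMode.FAST, 64 * 1024, 512, 10);
}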
+ */ +package org.apache.lucene.codecs.lucene90.compressing; + +import static org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingStoredFieldsWriter.BYTE_ARR; +import static org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingStoredFieldsWriter.DAY; +import static org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingStoredFieldsWriter.DAY_ENCODING; +import static org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingStoredFieldsWriter.FIELDS_EXTENSION; +import static org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingStoredFieldsWriter.HOUR; +import static org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingStoredFieldsWriter.HOUR_ENCODING; +import static org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingStoredFieldsWriter.INDEX_CODEC_NAME; +import static org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingStoredFieldsWriter.INDEX_EXTENSION; +import static org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingStoredFieldsWriter.META_EXTENSION; +import static org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingStoredFieldsWriter.META_VERSION_START; +import static org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingStoredFieldsWriter.NUMERIC_DOUBLE; +import static org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingStoredFieldsWriter.NUMERIC_FLOAT; +import static org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingStoredFieldsWriter.NUMERIC_INT; +import static org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingStoredFieldsWriter.NUMERIC_LONG; +import static org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingStoredFieldsWriter.SECOND; +import static org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingStoredFieldsWriter.SECOND_ENCODING; +import static org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingStoredFieldsWriter.STRING; +import static org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingStoredFieldsWriter.TYPE_BITS; +import static org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingStoredFieldsWriter.TYPE_MASK; +import static org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingStoredFieldsWriter.VERSION_CURRENT; +import static org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingStoredFieldsWriter.VERSION_START; + +import java.io.EOFException; +import java.io.IOException; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.StoredFieldsReader; +import org.apache.lucene.codecs.compressing.CompressionMode; +import org.apache.lucene.codecs.compressing.Decompressor; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.index.StoredFieldVisitor; +import org.apache.lucene.store.AlreadyClosedException; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.ChecksumIndexInput; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.Accountable; +import org.apache.lucene.util.Accountables; +import 
org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.LongsRef; +import org.apache.lucene.util.packed.PackedInts; + +/** + * {@link StoredFieldsReader} impl for {@link Lucene90CompressingStoredFieldsFormat}. + * + * @lucene.experimental + */ +public final class Lucene90CompressingStoredFieldsReader extends StoredFieldsReader { + + private final int version; + private final FieldInfos fieldInfos; + private final FieldsIndex indexReader; + private final long maxPointer; + private final IndexInput fieldsStream; + private final int chunkSize; + private final int packedIntsVersion; + private final CompressionMode compressionMode; + private final Decompressor decompressor; + private final int numDocs; + private final boolean merging; + private final BlockState state; + private final long numDirtyChunks; // number of incomplete compressed blocks written + private final long numDirtyDocs; // cumulative number of missing docs in incomplete chunks + private boolean closed; + + // used by clone + private Lucene90CompressingStoredFieldsReader( + Lucene90CompressingStoredFieldsReader reader, boolean merging) { + this.version = reader.version; + this.fieldInfos = reader.fieldInfos; + this.fieldsStream = reader.fieldsStream.clone(); + this.indexReader = reader.indexReader.clone(); + this.maxPointer = reader.maxPointer; + this.chunkSize = reader.chunkSize; + this.packedIntsVersion = reader.packedIntsVersion; + this.compressionMode = reader.compressionMode; + this.decompressor = reader.decompressor.clone(); + this.numDocs = reader.numDocs; + this.numDirtyChunks = reader.numDirtyChunks; + this.numDirtyDocs = reader.numDirtyDocs; + this.merging = merging; + this.state = new BlockState(); + this.closed = false; + } + + /** Sole constructor. */ + public Lucene90CompressingStoredFieldsReader( + Directory d, + SegmentInfo si, + String segmentSuffix, + FieldInfos fn, + IOContext context, + String formatName, + CompressionMode compressionMode) + throws IOException { + this.compressionMode = compressionMode; + final String segment = si.name; + boolean success = false; + fieldInfos = fn; + numDocs = si.maxDoc(); + + final String fieldsStreamFN = + IndexFileNames.segmentFileName(segment, segmentSuffix, FIELDS_EXTENSION); + ChecksumIndexInput metaIn = null; + try { + // Open the data file + fieldsStream = d.openInput(fieldsStreamFN, context); + version = + CodecUtil.checkIndexHeader( + fieldsStream, formatName, VERSION_START, VERSION_CURRENT, si.getId(), segmentSuffix); + assert CodecUtil.indexHeaderLength(formatName, segmentSuffix) + == fieldsStream.getFilePointer(); + + final String metaStreamFN = + IndexFileNames.segmentFileName(segment, segmentSuffix, META_EXTENSION); + metaIn = d.openChecksumInput(metaStreamFN, IOContext.READONCE); + CodecUtil.checkIndexHeader( + metaIn, + INDEX_CODEC_NAME + "Meta", + META_VERSION_START, + version, + si.getId(), + segmentSuffix); + + chunkSize = metaIn.readVInt(); + packedIntsVersion = metaIn.readVInt(); + + decompressor = compressionMode.newDecompressor(); + this.merging = false; + this.state = new BlockState(); + + // NOTE: data file is too costly to verify checksum against all the bytes on open, + // but for now we at least verify proper structure of the checksum footer: which looks + // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption + // such as file truncation. 
+ CodecUtil.retrieveChecksum(fieldsStream); + + long maxPointer = -1; + FieldsIndex indexReader = null; + + FieldsIndexReader fieldsIndexReader = + new FieldsIndexReader( + d, si.name, segmentSuffix, INDEX_EXTENSION, INDEX_CODEC_NAME, si.getId(), metaIn); + indexReader = fieldsIndexReader; + maxPointer = fieldsIndexReader.getMaxPointer(); + + this.maxPointer = maxPointer; + this.indexReader = indexReader; + + numDirtyChunks = metaIn.readVLong(); + numDirtyDocs = metaIn.readVLong(); + + if (metaIn != null) { + CodecUtil.checkFooter(metaIn, null); + metaIn.close(); + } + + success = true; + } catch (Throwable t) { + if (metaIn != null) { + CodecUtil.checkFooter(metaIn, t); + throw new AssertionError("unreachable"); + } else { + throw t; + } + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(this, metaIn); + } + } + } + + /** @throws AlreadyClosedException if this FieldsReader is closed */ + private void ensureOpen() throws AlreadyClosedException { + if (closed) { + throw new AlreadyClosedException("this FieldsReader is closed"); + } + } + + /** Close the underlying {@link IndexInput}s. */ + @Override + public void close() throws IOException { + if (!closed) { + IOUtils.close(indexReader, fieldsStream); + closed = true; + } + } + + private static void readField(DataInput in, StoredFieldVisitor visitor, FieldInfo info, int bits) + throws IOException { + switch (bits & TYPE_MASK) { + case BYTE_ARR: + int length = in.readVInt(); + byte[] data = new byte[length]; + in.readBytes(data, 0, length); + visitor.binaryField(info, data); + break; + case STRING: + visitor.stringField(info, in.readString()); + break; + case NUMERIC_INT: + visitor.intField(info, in.readZInt()); + break; + case NUMERIC_FLOAT: + visitor.floatField(info, readZFloat(in)); + break; + case NUMERIC_LONG: + visitor.longField(info, readTLong(in)); + break; + case NUMERIC_DOUBLE: + visitor.doubleField(info, readZDouble(in)); + break; + default: + throw new AssertionError("Unknown type flag: " + Integer.toHexString(bits)); + } + } + + private static void skipField(DataInput in, int bits) throws IOException { + switch (bits & TYPE_MASK) { + case BYTE_ARR: + case STRING: + final int length = in.readVInt(); + in.skipBytes(length); + break; + case NUMERIC_INT: + in.readZInt(); + break; + case NUMERIC_FLOAT: + readZFloat(in); + break; + case NUMERIC_LONG: + readTLong(in); + break; + case NUMERIC_DOUBLE: + readZDouble(in); + break; + default: + throw new AssertionError("Unknown type flag: " + Integer.toHexString(bits)); + } + } + + /** + * Reads a float in a variable-length format. Reads between one and five bytes. Small integral + * values typically take fewer bytes. + */ + static float readZFloat(DataInput in) throws IOException { + int b = in.readByte() & 0xFF; + if (b == 0xFF) { + // negative value + return Float.intBitsToFloat(in.readInt()); + } else if ((b & 0x80) != 0) { + // small integer [-1..125] + return (b & 0x7f) - 1; + } else { + // positive float + int bits = b << 24 | ((in.readShort() & 0xFFFF) << 8) | (in.readByte() & 0xFF); + return Float.intBitsToFloat(bits); + } + } + + /** + * Reads a double in a variable-length format. Reads between one and nine bytes. Small integral + * values typically take fewer bytes. 
+ */ + static double readZDouble(DataInput in) throws IOException { + int b = in.readByte() & 0xFF; + if (b == 0xFF) { + // negative value + return Double.longBitsToDouble(in.readLong()); + } else if (b == 0xFE) { + // float + return Float.intBitsToFloat(in.readInt()); + } else if ((b & 0x80) != 0) { + // small integer [-1..124] + return (b & 0x7f) - 1; + } else { + // positive double + long bits = + ((long) b) << 56 + | ((in.readInt() & 0xFFFFFFFFL) << 24) + | ((in.readShort() & 0xFFFFL) << 8) + | (in.readByte() & 0xFFL); + return Double.longBitsToDouble(bits); + } + } + + /** + * Reads a long in a variable-length format. Reads between one and nine bytes. Small + * values typically take fewer bytes. + */ + static long readTLong(DataInput in) throws IOException { + int header = in.readByte() & 0xFF; + + long bits = header & 0x1F; + if ((header & 0x20) != 0) { + // continuation bit + bits |= in.readVLong() << 5; + } + + long l = BitUtil.zigZagDecode(bits); + + switch (header & DAY_ENCODING) { + case SECOND_ENCODING: + l *= SECOND; + break; + case HOUR_ENCODING: + l *= HOUR; + break; + case DAY_ENCODING: + l *= DAY; + break; + case 0: + // uncompressed + break; + default: + throw new AssertionError(); + } + + return l; + } + + /** + * A serialized document; you need to decode its input in order to get an actual {@link Document}. + */ + static class SerializedDocument { + + // the serialized data + final DataInput in; + + // the number of bytes on which the document is encoded + final int length; + + // the number of stored fields + final int numStoredFields; + + private SerializedDocument(DataInput in, int length, int numStoredFields) { + this.in = in; + this.length = length; + this.numStoredFields = numStoredFields; + } + } + + /** Keeps state about the current block of documents. */ + private class BlockState { + + private int docBase, chunkDocs; + + // whether the block has been sliced; this happens for large documents + private boolean sliced; + + private long[] offsets = LongsRef.EMPTY_LONGS; + private long[] numStoredFields = LongsRef.EMPTY_LONGS; + + // the start pointer at which you can read the compressed documents + private long startPointer; + + private final BytesRef spare; + private final BytesRef bytes; + + BlockState() { + if (merging) { + spare = new BytesRef(); + bytes = new BytesRef(); + } else { + spare = bytes = null; + } + } + + boolean contains(int docID) { + return docID >= docBase && docID < docBase + chunkDocs; + } + + /** Reset this block so that it stores state for the block that contains the given doc id. */ + void reset(int docID) throws IOException { + boolean success = false; + try { + doReset(docID); + success = true; + } finally { + if (success == false) { + // if the read failed, set chunkDocs to 0 so that it does not + // contain any docs anymore and is not reused.
This should help + // get consistent exceptions when trying to get several + // documents which are in the same corrupted block since it will + // force the header to be decoded again + chunkDocs = 0; + } + } + } + + private void doReset(int docID) throws IOException { + docBase = fieldsStream.readVInt(); + final int token = fieldsStream.readVInt(); + chunkDocs = token >>> 1; + if (contains(docID) == false || docBase + chunkDocs > numDocs) { + throw new CorruptIndexException( + "Corrupted: docID=" + + docID + + ", docBase=" + + docBase + + ", chunkDocs=" + + chunkDocs + + ", numDocs=" + + numDocs, + fieldsStream); + } + + sliced = (token & 1) != 0; + + offsets = ArrayUtil.grow(offsets, chunkDocs + 1); + numStoredFields = ArrayUtil.grow(numStoredFields, chunkDocs); + + if (chunkDocs == 1) { + numStoredFields[0] = fieldsStream.readVInt(); + offsets[1] = fieldsStream.readVInt(); + } else { + // Number of stored fields per document + final int bitsPerStoredFields = fieldsStream.readVInt(); + if (bitsPerStoredFields == 0) { + Arrays.fill(numStoredFields, 0, chunkDocs, fieldsStream.readVInt()); + } else if (bitsPerStoredFields > 31) { + throw new CorruptIndexException( + "bitsPerStoredFields=" + bitsPerStoredFields, fieldsStream); + } else { + final PackedInts.ReaderIterator it = + PackedInts.getReaderIteratorNoHeader( + fieldsStream, + PackedInts.Format.PACKED, + packedIntsVersion, + chunkDocs, + bitsPerStoredFields, + 1024); + for (int i = 0; i < chunkDocs; ) { + final LongsRef next = it.next(Integer.MAX_VALUE); + System.arraycopy(next.longs, next.offset, numStoredFields, i, next.length); + i += next.length; + } + } + + // The stream encodes the length of each document and we decode + // it into a list of monotonically increasing offsets + final int bitsPerLength = fieldsStream.readVInt(); + if (bitsPerLength == 0) { + final int length = fieldsStream.readVInt(); + for (int i = 0; i < chunkDocs; ++i) { + offsets[1 + i] = (1 + i) * length; + } + } else if (bitsPerLength > 31) { // validate the value actually read here, not bitsPerStoredFields + throw new CorruptIndexException("bitsPerLength=" + bitsPerLength, fieldsStream); + } else { + final PackedInts.ReaderIterator it = + PackedInts.getReaderIteratorNoHeader( + fieldsStream, + PackedInts.Format.PACKED, + packedIntsVersion, + chunkDocs, + bitsPerLength, + 1024); + for (int i = 0; i < chunkDocs; ) { + final LongsRef next = it.next(Integer.MAX_VALUE); + System.arraycopy(next.longs, next.offset, offsets, i + 1, next.length); + i += next.length; + } + for (int i = 0; i < chunkDocs; ++i) { + offsets[i + 1] += offsets[i]; + } + } + + // Additional validation: only the empty document has a serialized length of 0 + for (int i = 0; i < chunkDocs; ++i) { + final long len = offsets[i + 1] - offsets[i]; + final long storedFields = numStoredFields[i]; + if ((len == 0) != (storedFields == 0)) { + throw new CorruptIndexException( + "length=" + len + ", numStoredFields=" + storedFields, fieldsStream); + } + } + } + + startPointer = fieldsStream.getFilePointer(); + + if (merging) { + final int totalLength = Math.toIntExact(offsets[chunkDocs]); + // decompress eagerly + if (sliced) { + bytes.offset = bytes.length = 0; + for (int decompressed = 0; decompressed < totalLength; ) { + final int toDecompress = Math.min(totalLength - decompressed, chunkSize); + decompressor.decompress(fieldsStream, toDecompress, 0, toDecompress, spare); + bytes.bytes = ArrayUtil.grow(bytes.bytes, bytes.length + spare.length); + System.arraycopy(spare.bytes, spare.offset, bytes.bytes, bytes.length, spare.length); + bytes.length += spare.length;
+ decompressed += toDecompress; + } + } else { + decompressor.decompress(fieldsStream, totalLength, 0, totalLength, bytes); + } + if (bytes.length != totalLength) { + throw new CorruptIndexException( + "Corrupted: expected chunk size = " + totalLength + ", got " + bytes.length, + fieldsStream); + } + } + } + + /** + * Get the serialized representation of the given docID. This docID has to be contained in the + * current block. + */ + SerializedDocument document(int docID) throws IOException { + if (contains(docID) == false) { + throw new IllegalArgumentException(); + } + + final int index = docID - docBase; + final int offset = Math.toIntExact(offsets[index]); + final int length = Math.toIntExact(offsets[index + 1]) - offset; + final int totalLength = Math.toIntExact(offsets[chunkDocs]); + final int numStoredFields = Math.toIntExact(this.numStoredFields[index]); + + final BytesRef bytes; + if (merging) { + bytes = this.bytes; + } else { + bytes = new BytesRef(); + } + + final DataInput documentInput; + if (length == 0) { + // empty + documentInput = new ByteArrayDataInput(); + } else if (merging) { + // already decompressed + documentInput = new ByteArrayDataInput(bytes.bytes, bytes.offset + offset, length); + } else if (sliced) { + fieldsStream.seek(startPointer); + decompressor.decompress( + fieldsStream, chunkSize, offset, Math.min(length, chunkSize - offset), bytes); + documentInput = + new DataInput() { + + int decompressed = bytes.length; + + void fillBuffer() throws IOException { + assert decompressed <= length; + if (decompressed == length) { + throw new EOFException(); + } + final int toDecompress = Math.min(length - decompressed, chunkSize); + decompressor.decompress(fieldsStream, toDecompress, 0, toDecompress, bytes); + decompressed += toDecompress; + } + + @Override + public byte readByte() throws IOException { + if (bytes.length == 0) { + fillBuffer(); + } + --bytes.length; + return bytes.bytes[bytes.offset++]; + } + + @Override + public void readBytes(byte[] b, int offset, int len) throws IOException { + while (len > bytes.length) { + System.arraycopy(bytes.bytes, bytes.offset, b, offset, bytes.length); + len -= bytes.length; + offset += bytes.length; + fillBuffer(); + } + System.arraycopy(bytes.bytes, bytes.offset, b, offset, len); + bytes.offset += len; + bytes.length -= len; + } + + @Override + public void skipBytes(long numBytes) throws IOException { + if (numBytes < 0) { + throw new IllegalArgumentException("numBytes must be >= 0, got " + numBytes); + } + while (numBytes > bytes.length) { + numBytes -= bytes.length; + fillBuffer(); + } + bytes.offset += numBytes; + bytes.length -= numBytes; + } + }; + } else { + fieldsStream.seek(startPointer); + decompressor.decompress(fieldsStream, totalLength, offset, length, bytes); + assert bytes.length == length; + documentInput = new ByteArrayDataInput(bytes.bytes, bytes.offset, bytes.length); + } + + return new SerializedDocument(documentInput, length, numStoredFields); + } + } + + SerializedDocument document(int docID) throws IOException { + if (state.contains(docID) == false) { + fieldsStream.seek(indexReader.getStartPointer(docID)); + state.reset(docID); + } + assert state.contains(docID); + return state.document(docID); + } + + @Override + public void visitDocument(int docID, StoredFieldVisitor visitor) throws IOException { + + final SerializedDocument doc = document(docID); + + for (int fieldIDX = 0; fieldIDX < doc.numStoredFields; fieldIDX++) { + final long infoAndBits = doc.in.readVLong(); + final int fieldNumber = (int) 
(infoAndBits >>> TYPE_BITS); + final FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber); + + final int bits = (int) (infoAndBits & TYPE_MASK); + assert bits <= NUMERIC_DOUBLE : "bits=" + Integer.toHexString(bits); + + switch (visitor.needsField(fieldInfo)) { + case YES: + readField(doc.in, visitor, fieldInfo, bits); + break; + case NO: + if (fieldIDX + == doc.numStoredFields - 1) { // don't skipField on last field value; treat like STOP + return; + } + skipField(doc.in, bits); + break; + case STOP: + return; + } + } + } + + @Override + public StoredFieldsReader clone() { + ensureOpen(); + return new Lucene90CompressingStoredFieldsReader(this, false); + } + + @Override + public StoredFieldsReader getMergeInstance() { + ensureOpen(); + return new Lucene90CompressingStoredFieldsReader(this, true); + } + + int getVersion() { + return version; + } + + CompressionMode getCompressionMode() { + return compressionMode; + } + + FieldsIndex getIndexReader() { + return indexReader; + } + + long getMaxPointer() { + return maxPointer; + } + + IndexInput getFieldsStream() { + return fieldsStream; + } + + int getChunkSize() { + return chunkSize; + } + + long getNumDirtyDocs() { + if (version != VERSION_CURRENT) { + throw new IllegalStateException( + "getNumDirtyDocs should only ever get called when the reader is on the current version"); + } + assert numDirtyDocs >= 0; + return numDirtyDocs; + } + + long getNumDirtyChunks() { + if (version != VERSION_CURRENT) { + throw new IllegalStateException( + "getNumDirtyChunks should only ever get called when the reader is on the current version"); + } + assert numDirtyChunks >= 0; + return numDirtyChunks; + } + + int getNumDocs() { + return numDocs; + } + + int getPackedIntsVersion() { + return packedIntsVersion; + } + + @Override + public long ramBytesUsed() { + return indexReader.ramBytesUsed(); + } + + @Override + public Collection getChildResources() { + return Collections.singleton(Accountables.namedAccountable("stored field index", indexReader)); + } + + @Override + public void checkIntegrity() throws IOException { + indexReader.checkIntegrity(); + CodecUtil.checksumEntireFile(fieldsStream); + } + + @Override + public String toString() { + return getClass().getSimpleName() + + "(mode=" + + compressionMode + + ",chunksize=" + + chunkSize + + ")"; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingStoredFieldsWriter.java similarity index 94% rename from lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java rename to lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingStoredFieldsWriter.java index a74df9c60b6..815bbc0eaa8 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingStoredFieldsWriter.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
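A note on the variable-length numeric helpers in the reader above: readZFloat, readZDouble, and readTLong invert writeZFloat, writeZDouble, and writeTLong in the writer that follows, and the tests near the end of this patch round-trip all three. A minimal round-trip sketch, assuming a hypothetical ZFloatDemo class placed in the same package (the helpers are package-private):

package org.apache.lucene.codecs.lucene90.compressing;

import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;

// Hypothetical demo, not part of the patch.
class ZFloatDemo {
  public static void main(String[] args) throws Exception {
    byte[] buffer = new byte[5]; // a ZFloat never takes more than 5 bytes
    ByteArrayDataOutput out = new ByteArrayDataOutput(buffer);
    // 2f is a "small integer" in [-1..125], so it encodes as the single
    // byte 0x83 (high bit set, low bits store 2 + 1).
    Lucene90CompressingStoredFieldsWriter.writeZFloat(out, 2f);
    ByteArrayDataInput in = new ByteArrayDataInput(buffer, 0, out.getPosition());
    float restored = Lucene90CompressingStoredFieldsReader.readZFloat(in);
    System.out.println(out.getPosition() + " byte(s) -> " + restored); // 1 byte(s) -> 2.0
  }
}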
*/ -package org.apache.lucene.codecs.compressing; +package org.apache.lucene.codecs.lucene90.compressing; import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; @@ -24,7 +24,10 @@ import java.util.List; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.StoredFieldsReader; import org.apache.lucene.codecs.StoredFieldsWriter; -import org.apache.lucene.codecs.compressing.CompressingStoredFieldsReader.SerializedDocument; +import org.apache.lucene.codecs.compressing.CompressionMode; +import org.apache.lucene.codecs.compressing.Compressor; +import org.apache.lucene.codecs.compressing.MatchingReaders; +import org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingStoredFieldsReader.SerializedDocument; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.DocIDMerger; import org.apache.lucene.index.FieldInfo; @@ -47,11 +50,11 @@ import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.packed.PackedInts; /** - * {@link StoredFieldsWriter} impl for {@link CompressingStoredFieldsFormat}. + * {@link StoredFieldsWriter} impl for {@link Lucene90CompressingStoredFieldsFormat}. * * @lucene.experimental */ -public final class CompressingStoredFieldsWriter extends StoredFieldsWriter { +public final class Lucene90CompressingStoredFieldsWriter extends StoredFieldsWriter { /** Extension of stored fields file */ public static final String FIELDS_EXTENSION = "fdt"; @@ -60,7 +63,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter { /** Extension of stored fields meta */ public static final String META_EXTENSION = "fdm"; /** Codec name for the index. */ - public static final String INDEX_CODEC_NAME = "Lucene85FieldsIndex"; + public static final String INDEX_CODEC_NAME = "Lucene90FieldsIndex"; static final int STRING = 0x00; static final int BYTE_ARR = 0x01; @@ -73,11 +76,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter { static final int TYPE_MASK = (int) PackedInts.maxValue(TYPE_BITS); static final int VERSION_START = 1; - static final int VERSION_OFFHEAP_INDEX = 2; - /** Version where all metadata were moved to the meta file. */ - static final int VERSION_META = 3; - - static final int VERSION_CURRENT = VERSION_META; + static final int VERSION_CURRENT = VERSION_START; static final int META_VERSION_START = 0; private final String segment; @@ -99,7 +98,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter { private long numDirtyDocs; // cumulative number of missing docs in incomplete chunks /** Sole constructor. */ - CompressingStoredFieldsWriter( + Lucene90CompressingStoredFieldsWriter( Directory directory, SegmentInfo si, String segmentSuffix, @@ -514,7 +513,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter { // we try to be extra safe with this impl, but add an escape hatch to // have a workaround for undiscovered bugs. 
static final String BULK_MERGE_ENABLED_SYSPROP = - CompressingStoredFieldsWriter.class.getName() + ".enableBulkMerge"; + Lucene90CompressingStoredFieldsWriter.class.getName() + ".enableBulkMerge"; static final boolean BULK_MERGE_ENABLED; static { @@ -540,9 +539,9 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter { List<CompressingStoredFieldsMergeSub> subs = new ArrayList<>(); for (int i = 0; i < mergeState.storedFieldsReaders.length; i++) { if (matching.matchingReaders[i] - && mergeState.storedFieldsReaders[i] instanceof CompressingStoredFieldsReader) { - CompressingStoredFieldsReader storedFieldsReader = - (CompressingStoredFieldsReader) mergeState.storedFieldsReaders[i]; + && mergeState.storedFieldsReaders[i] instanceof Lucene90CompressingStoredFieldsReader) { + Lucene90CompressingStoredFieldsReader storedFieldsReader = + (Lucene90CompressingStoredFieldsReader) mergeState.storedFieldsReaders[i]; storedFieldsReader.checkIntegrity(); subs.add( new CompressingStoredFieldsMergeSub( @@ -572,12 +571,12 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter { for (int readerIndex = 0; readerIndex < numReaders; readerIndex++) { MergeVisitor visitor = new MergeVisitor(mergeState, readerIndex); - CompressingStoredFieldsReader matchingFieldsReader = null; + Lucene90CompressingStoredFieldsReader matchingFieldsReader = null; if (matching.matchingReaders[readerIndex]) { final StoredFieldsReader fieldsReader = mergeState.storedFieldsReaders[readerIndex]; // we can only bulk-copy if the matching reader is also a CompressingStoredFieldsReader - if (fieldsReader != null && fieldsReader instanceof CompressingStoredFieldsReader) { - matchingFieldsReader = (CompressingStoredFieldsReader) fieldsReader; + if (fieldsReader != null && fieldsReader instanceof Lucene90CompressingStoredFieldsReader) { + matchingFieldsReader = (Lucene90CompressingStoredFieldsReader) fieldsReader; } } @@ -709,19 +708,19 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter { * some worst-case situations (e.g. frequent reopen with tiny flushes), over time the compression * ratio can degrade. This is a safety switch.
*/ - boolean tooDirty(CompressingStoredFieldsReader candidate) { + boolean tooDirty(Lucene90CompressingStoredFieldsReader candidate) { // more than 1% dirty, or more than hard limit of 1024 dirty chunks return candidate.getNumDirtyChunks() > 1024 || candidate.getNumDirtyDocs() * 100 > candidate.getNumDocs(); } private static class CompressingStoredFieldsMergeSub extends DocIDMerger.Sub { - private final CompressingStoredFieldsReader reader; + private final Lucene90CompressingStoredFieldsReader reader; private final int maxDoc; int docID = -1; CompressingStoredFieldsMergeSub( - CompressingStoredFieldsReader reader, MergeState.DocMap docMap, int maxDoc) { + Lucene90CompressingStoredFieldsReader reader, MergeState.DocMap docMap, int maxDoc) { super(docMap); this.maxDoc = maxDoc; this.reader = reader; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/Lucene90CompressingTermVectorsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingTermVectorsFormat.java similarity index 97% rename from lucene/core/src/java/org/apache/lucene/codecs/compressing/Lucene90CompressingTermVectorsFormat.java rename to lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingTermVectorsFormat.java index 13e070848d0..964259ebcc0 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/Lucene90CompressingTermVectorsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingTermVectorsFormat.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.codecs.compressing; +package org.apache.lucene.codecs.lucene90.compressing; import java.io.IOException; import org.apache.lucene.codecs.CodecUtil; @@ -22,6 +22,7 @@ import org.apache.lucene.codecs.StoredFieldsFormat; import org.apache.lucene.codecs.TermVectorsFormat; import org.apache.lucene.codecs.TermVectorsReader; import org.apache.lucene.codecs.TermVectorsWriter; +import org.apache.lucene.codecs.compressing.CompressionMode; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.SegmentInfo; import org.apache.lucene.store.Directory; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/Lucene90CompressingTermVectorsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingTermVectorsReader.java similarity index 96% rename from lucene/core/src/java/org/apache/lucene/codecs/compressing/Lucene90CompressingTermVectorsReader.java rename to lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingTermVectorsReader.java index c31a37f371d..6d501eca930 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/Lucene90CompressingTermVectorsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingTermVectorsReader.java @@ -14,20 +14,20 @@ * See the License for the specific language governing permissions and * limitations under the License. 
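The tooDirty guard above trades a little bulk-merge speed for long-term compression ratio; a worked example of the predicate, with made-up numbers:

// Illustrative numbers, not taken from the patch:
// candidate reader: 3 dirty chunks, 5 dirty docs, 100 docs total
//   getNumDirtyChunks() > 1024            -> 3 > 1024   -> false (under the hard limit)
//   getNumDirtyDocs() * 100 > getNumDocs() -> 500 > 100 -> true  (5% dirty, above the 1% threshold)
// tooDirty(candidate) returns true, so merging re-compresses this reader instead of bulk-copying it.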
*/ -package org.apache.lucene.codecs.compressing; +package org.apache.lucene.codecs.lucene90.compressing; -import static org.apache.lucene.codecs.compressing.Lucene90CompressingTermVectorsWriter.FLAGS_BITS; -import static org.apache.lucene.codecs.compressing.Lucene90CompressingTermVectorsWriter.META_VERSION_START; -import static org.apache.lucene.codecs.compressing.Lucene90CompressingTermVectorsWriter.OFFSETS; -import static org.apache.lucene.codecs.compressing.Lucene90CompressingTermVectorsWriter.PACKED_BLOCK_SIZE; -import static org.apache.lucene.codecs.compressing.Lucene90CompressingTermVectorsWriter.PAYLOADS; -import static org.apache.lucene.codecs.compressing.Lucene90CompressingTermVectorsWriter.POSITIONS; -import static org.apache.lucene.codecs.compressing.Lucene90CompressingTermVectorsWriter.VECTORS_EXTENSION; -import static org.apache.lucene.codecs.compressing.Lucene90CompressingTermVectorsWriter.VECTORS_INDEX_CODEC_NAME; -import static org.apache.lucene.codecs.compressing.Lucene90CompressingTermVectorsWriter.VECTORS_INDEX_EXTENSION; -import static org.apache.lucene.codecs.compressing.Lucene90CompressingTermVectorsWriter.VECTORS_META_EXTENSION; -import static org.apache.lucene.codecs.compressing.Lucene90CompressingTermVectorsWriter.VERSION_CURRENT; -import static org.apache.lucene.codecs.compressing.Lucene90CompressingTermVectorsWriter.VERSION_START; +import static org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingTermVectorsWriter.FLAGS_BITS; +import static org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingTermVectorsWriter.META_VERSION_START; +import static org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingTermVectorsWriter.OFFSETS; +import static org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingTermVectorsWriter.PACKED_BLOCK_SIZE; +import static org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingTermVectorsWriter.PAYLOADS; +import static org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingTermVectorsWriter.POSITIONS; +import static org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingTermVectorsWriter.VECTORS_EXTENSION; +import static org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingTermVectorsWriter.VECTORS_INDEX_CODEC_NAME; +import static org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingTermVectorsWriter.VECTORS_INDEX_EXTENSION; +import static org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingTermVectorsWriter.VECTORS_META_EXTENSION; +import static org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingTermVectorsWriter.VERSION_CURRENT; +import static org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingTermVectorsWriter.VERSION_START; import java.io.Closeable; import java.io.IOException; @@ -37,6 +37,8 @@ import java.util.Iterator; import java.util.NoSuchElementException; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.TermVectorsReader; +import org.apache.lucene.codecs.compressing.CompressionMode; +import org.apache.lucene.codecs.compressing.Decompressor; import org.apache.lucene.index.BaseTermsEnum; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.FieldInfo; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/Lucene90CompressingTermVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingTermVectorsWriter.java similarity index 99% rename from 
lucene/core/src/java/org/apache/lucene/codecs/compressing/Lucene90CompressingTermVectorsWriter.java rename to lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingTermVectorsWriter.java index 4fc88c362c6..9908ccf78de 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/Lucene90CompressingTermVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingTermVectorsWriter.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.codecs.compressing; +package org.apache.lucene.codecs.lucene90.compressing; import java.io.IOException; import java.util.ArrayDeque; @@ -28,6 +28,9 @@ import java.util.TreeSet; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.TermVectorsReader; import org.apache.lucene.codecs.TermVectorsWriter; +import org.apache.lucene.codecs.compressing.CompressionMode; +import org.apache.lucene.codecs.compressing.Compressor; +import org.apache.lucene.codecs.compressing.MatchingReaders; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene87/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/package-info.java similarity index 80% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene87/package-info.java rename to lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/package-info.java index 13339d301a2..154217ef5ce 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene87/package-info.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/package-info.java @@ -15,8 +15,5 @@ * limitations under the License. */ -/** - * Components from the Lucene 8.7 index format. See {@link org.apache.lucene.codecs.lucene90} for an - * overview of the current index format. - */ -package org.apache.lucene.codecs.lucene87; +/** Lucene 9.0 compressing format. */ +package org.apache.lucene.codecs.lucene90.compressing; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/package-info.java index dcab523beb5..a9dceef8606 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/package-info.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/package-info.java @@ -146,7 +146,7 @@ * information about how the segment is sorted *

  • {@link org.apache.lucene.codecs.lucene90.Lucene90FieldInfosFormat Field names}. This * contains metadata about the set of named fields used in the index. - *
  • {@link org.apache.lucene.codecs.lucene87.Lucene87StoredFieldsFormat Stored Field values}. + *
  • {@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Stored Field values}. * This contains, for each document, a list of attribute-value pairs, where the attributes are * field names. These are used to store auxiliary information about the document, such as its * title, url, or an identifier to access a database. The set of stored fields are what is @@ -245,12 +245,12 @@ * Stores information about the fields * * - * {@link org.apache.lucene.codecs.lucene87.Lucene87StoredFieldsFormat Field Index} + * {@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Index} * .fdx * Contains pointers to field data * * - * {@link org.apache.lucene.codecs.lucene87.Lucene87StoredFieldsFormat Field Data} + * {@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Data} * .fdt * The stored fields for documents * diff --git a/lucene/core/src/java/org/apache/lucene/index/SortingStoredFieldsConsumer.java b/lucene/core/src/java/org/apache/lucene/index/SortingStoredFieldsConsumer.java index 80e7169d4e4..6e7bc41622d 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SortingStoredFieldsConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortingStoredFieldsConsumer.java @@ -26,10 +26,10 @@ import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.StoredFieldsFormat; import org.apache.lucene.codecs.StoredFieldsReader; import org.apache.lucene.codecs.StoredFieldsWriter; -import org.apache.lucene.codecs.compressing.CompressingStoredFieldsFormat; import org.apache.lucene.codecs.compressing.CompressionMode; import org.apache.lucene.codecs.compressing.Compressor; import org.apache.lucene.codecs.compressing.Decompressor; +import org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingStoredFieldsFormat; import org.apache.lucene.document.StoredField; import org.apache.lucene.store.DataInput; import org.apache.lucene.store.DataOutput; @@ -79,7 +79,8 @@ final class SortingStoredFieldsConsumer extends StoredFieldsConsumer { } }; private static final StoredFieldsFormat TEMP_STORED_FIELDS_FORMAT = - new CompressingStoredFieldsFormat("TempStoredFields", NO_COMPRESSION, 128 * 1024, 1, 10); + new Lucene90CompressingStoredFieldsFormat( + "TempStoredFields", NO_COMPRESSION, 128 * 1024, 1, 10); TrackingTmpOutputDirectoryWrapper tmpDirectory; SortingStoredFieldsConsumer(Codec codec, Directory directory, SegmentInfo info) { diff --git a/lucene/core/src/java/org/apache/lucene/index/SortingTermVectorsConsumer.java b/lucene/core/src/java/org/apache/lucene/index/SortingTermVectorsConsumer.java index d6b72f85b1f..0637f8d546a 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SortingTermVectorsConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortingTermVectorsConsumer.java @@ -24,7 +24,7 @@ import org.apache.lucene.codecs.NormsProducer; import org.apache.lucene.codecs.TermVectorsFormat; import org.apache.lucene.codecs.TermVectorsReader; import org.apache.lucene.codecs.TermVectorsWriter; -import org.apache.lucene.codecs.compressing.Lucene90CompressingTermVectorsFormat; +import org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingTermVectorsFormat; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FlushInfo; diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90StoredFieldsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90StoredFieldsFormat.java new file mode 100644 index 
00000000000..e2e0ec69c6c --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90StoredFieldsFormat.java @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene90; + +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.index.BaseStoredFieldsFormatTestCase; +import org.apache.lucene.util.TestUtil; + +public class TestLucene90StoredFieldsFormat extends BaseStoredFieldsFormatTestCase { + @Override + protected Codec getCodec() { + return TestUtil.getDefaultCodec(); + } +} diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90StoredFieldsFormatHighCompression.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90StoredFieldsFormatHighCompression.java new file mode 100644 index 00000000000..14eed8eb537 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90StoredFieldsFormatHighCompression.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene90; + +import com.carrotsearch.randomizedtesting.generators.RandomPicks; +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.lucene90.Lucene90Codec.Mode; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.StoredField; +import org.apache.lucene.index.BaseStoredFieldsFormatTestCase; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.store.Directory; + +public class TestLucene90StoredFieldsFormatHighCompression extends BaseStoredFieldsFormatTestCase { + @Override + protected Codec getCodec() { + return new Lucene90Codec(Mode.BEST_COMPRESSION); + } + + /** + * Change compression params (leaving it the same for old segments) and tests that nothing breaks. 
+ */ + public void testMixedCompressions() throws Exception { + Directory dir = newDirectory(); + for (int i = 0; i < 10; i++) { + IndexWriterConfig iwc = newIndexWriterConfig(); + iwc.setCodec(new Lucene90Codec(RandomPicks.randomFrom(random(), Mode.values()))); + IndexWriter iw = new IndexWriter(dir, iwc); // use the config that carries the randomized codec + Document doc = new Document(); + doc.add(new StoredField("field1", "value1")); + doc.add(new StoredField("field2", "value2")); + iw.addDocument(doc); + if (random().nextInt(4) == 0) { + iw.forceMerge(1); + } + iw.commit(); + iw.close(); + } + + DirectoryReader ir = DirectoryReader.open(dir); + assertEquals(10, ir.numDocs()); + for (int i = 0; i < 10; i++) { + Document doc = ir.document(i); + assertEquals("value1", doc.get("field1")); + assertEquals("value2", doc.get("field2")); + } + ir.close(); + // checkindex + dir.close(); + } + + public void testInvalidOptions() { + expectThrows( + NullPointerException.class, + () -> { + new Lucene90Codec(null); + }); + + expectThrows( + NullPointerException.class, + () -> { + new Lucene90StoredFieldsFormat(null); + }); + } + + public void testShowJDKBugStatus() { + System.err.println("JDK is buggy (JDK-8252739): " + BugfixDeflater_JDK8252739.IS_BUGGY_JDK); + } +} diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90StoredFieldsFormatMergeInstance.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90StoredFieldsFormatMergeInstance.java new file mode 100644 index 00000000000..4ca9e44ed3b --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90StoredFieldsFormatMergeInstance.java @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene90; + +/** Test the merge instance of the Lucene90 stored fields format.
*/ +public class TestLucene90StoredFieldsFormatMergeInstance extends TestLucene90StoredFieldsFormat { + + @Override + protected boolean shouldTestMergeInstance() { + return true; + } +} diff --git a/lucene/core/src/test/org/apache/lucene/store/TestFileSwitchDirectory.java b/lucene/core/src/test/org/apache/lucene/store/TestFileSwitchDirectory.java index a9c64cebdb6..d447976c0c5 100644 --- a/lucene/core/src/test/org/apache/lucene/store/TestFileSwitchDirectory.java +++ b/lucene/core/src/test/org/apache/lucene/store/TestFileSwitchDirectory.java @@ -27,7 +27,7 @@ import java.util.HashSet; import java.util.Set; import java.util.function.Function; import org.apache.lucene.analysis.MockAnalyzer; -import org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter; +import org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingStoredFieldsWriter; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexNotFoundException; import org.apache.lucene.index.IndexReader; @@ -44,7 +44,7 @@ public class TestFileSwitchDirectory extends BaseDirectoryTestCase { /** Test if writing doc stores to disk and everything else to ram works. */ public void testBasic() throws IOException { Set<String> fileExtensions = new HashSet<>(); - fileExtensions.add(CompressingStoredFieldsWriter.FIELDS_EXTENSION); + fileExtensions.add(Lucene90CompressingStoredFieldsWriter.FIELDS_EXTENSION); fileExtensions.add("fdx"); fileExtensions.add("fdm"); diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/CompressingCodec.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/CompressingCodec.java index cc76011a0b4..b2594197503 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/CompressingCodec.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/CompressingCodec.java @@ -22,12 +22,14 @@ import org.apache.lucene.codecs.FilterCodec; import org.apache.lucene.codecs.StoredFieldsFormat; import org.apache.lucene.codecs.TermVectorsFormat; import org.apache.lucene.codecs.compressing.dummy.DummyCompressingCodec; +import org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingStoredFieldsFormat; +import org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingTermVectorsFormat; import org.apache.lucene.util.TestUtil; import org.apache.lucene.util.packed.DirectMonotonicWriter; /** - * A codec that uses {@link CompressingStoredFieldsFormat} for its stored fields and delegates to - * the default codec for everything else. + * A codec that uses {@link Lucene90CompressingStoredFieldsFormat} for its stored fields and + * delegates to the default codec for everything else.
*/ public abstract class CompressingCodec extends FilterCodec { @@ -101,7 +103,7 @@ public abstract class CompressingCodec extends FilterCodec { RandomNumbers.randomIntBetween(random, 1, 1024)); } - private final CompressingStoredFieldsFormat storedFieldsFormat; + private final Lucene90CompressingStoredFieldsFormat storedFieldsFormat; private final Lucene90CompressingTermVectorsFormat termVectorsFormat; /** Creates a compressing codec with a given segment suffix */ @@ -114,7 +116,7 @@ public abstract class CompressingCodec extends FilterCodec { int blockShift) { super(name, TestUtil.getDefaultCodec()); this.storedFieldsFormat = - new CompressingStoredFieldsFormat( + new Lucene90CompressingStoredFieldsFormat( name, segmentSuffix, compressionMode, chunkSize, maxDocsPerChunk, blockShift); this.termVectorsFormat = new Lucene90CompressingTermVectorsFormat( diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/DeflateWithPresetCompressingCodec.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/DeflateWithPresetCompressingCodec.java index dbc21e3748f..eb99488c888 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/DeflateWithPresetCompressingCodec.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/DeflateWithPresetCompressingCodec.java @@ -16,7 +16,7 @@ */ package org.apache.lucene.codecs.compressing; -import org.apache.lucene.codecs.lucene87.DeflateWithPresetDictCompressionMode; +import org.apache.lucene.codecs.lucene90.DeflateWithPresetDictCompressionMode; /** CompressionCodec that uses {@link DeflateWithPresetDictCompressionMode}. */ public class DeflateWithPresetCompressingCodec extends CompressingCodec { diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/LZ4WithPresetCompressingCodec.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/LZ4WithPresetCompressingCodec.java index a51493b45e1..e1e12b7dd2c 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/LZ4WithPresetCompressingCodec.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/LZ4WithPresetCompressingCodec.java @@ -16,7 +16,7 @@ */ package org.apache.lucene.codecs.compressing; -import org.apache.lucene.codecs.lucene87.LZ4WithPresetDictCompressionMode; +import org.apache.lucene.codecs.lucene90.LZ4WithPresetDictCompressionMode; /** CompressionCodec that uses {@link LZ4WithPresetDictCompressionMode}. */ public class LZ4WithPresetCompressingCodec extends CompressingCodec { diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/package.html b/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/package.html index dee90944e23..19a617e22c9 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/package.html +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/package.html @@ -21,6 +21,6 @@ -Support for testing {@link org.apache.lucene.codecs.compressing.CompressingStoredFieldsFormat}. +Support for testing {@link org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingStoredFieldsFormat}. 
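The delegation pattern described in the CompressingCodec javadoc above can be reproduced outside the test framework. A minimal sketch, assuming a hypothetical TinyChunkCodec name and format parameters; a real codec would also need SPI registration (META-INF/services/org.apache.lucene.codecs.Codec) to be readable by name:

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.FilterCodec;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.compressing.CompressionMode;
import org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingStoredFieldsFormat;

// Hypothetical codec: custom compressing stored fields, default codec for the rest.
public class TinyChunkCodec extends FilterCodec {
  private final StoredFieldsFormat storedFields =
      new Lucene90CompressingStoredFieldsFormat(
          "TinyChunk", CompressionMode.HIGH_COMPRESSION, 4 * 1024, 128, 10);

  public TinyChunkCodec() {
    super("TinyChunkCodec", Codec.getDefault());
  }

  @Override
  public StoredFieldsFormat storedFieldsFormat() {
    return storedFields;
  }
}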
diff --git a/lucene/test-framework/src/test/org/apache/lucene/codecs/compressing/TestCompressingStoredFieldsFormat.java b/lucene/test-framework/src/test/org/apache/lucene/codecs/lucene90/compressing/TestCompressingStoredFieldsFormat.java
similarity index 86%
rename from lucene/test-framework/src/test/org/apache/lucene/codecs/compressing/TestCompressingStoredFieldsFormat.java
rename to lucene/test-framework/src/test/org/apache/lucene/codecs/lucene90/compressing/TestCompressingStoredFieldsFormat.java
index ede3803cc0f..c96c9734af1 100644
--- a/lucene/test-framework/src/test/org/apache/lucene/codecs/compressing/TestCompressingStoredFieldsFormat.java
+++ b/lucene/test-framework/src/test/org/apache/lucene/codecs/lucene90/compressing/TestCompressingStoredFieldsFormat.java
@@ -14,13 +14,14 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.lucene.codecs.compressing;
+package org.apache.lucene.codecs.lucene90.compressing;
 
 import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
 import java.io.IOException;
 import java.util.Random;
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.codecs.compressing.CompressingCodec;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
@@ -105,9 +106,9 @@ public class TestCompressingStoredFieldsFormat extends BaseStoredFieldsFormatTes
     // round-trip small integer values
     for (int i = Short.MIN_VALUE; i < Short.MAX_VALUE; i++) {
       float f = (float) i;
-      CompressingStoredFieldsWriter.writeZFloat(out, f);
+      Lucene90CompressingStoredFieldsWriter.writeZFloat(out, f);
       in.reset(buffer, 0, out.getPosition());
-      float g = CompressingStoredFieldsReader.readZFloat(in);
+      float g = Lucene90CompressingStoredFieldsReader.readZFloat(in);
       assertTrue(in.eof());
       assertEquals(Float.floatToIntBits(f), Float.floatToIntBits(g));
 
@@ -130,9 +131,9 @@ public class TestCompressingStoredFieldsFormat extends BaseStoredFieldsFormatTes
         };
 
     for (float f : special) {
-      CompressingStoredFieldsWriter.writeZFloat(out, f);
+      Lucene90CompressingStoredFieldsWriter.writeZFloat(out, f);
       in.reset(buffer, 0, out.getPosition());
-      float g = CompressingStoredFieldsReader.readZFloat(in);
+      float g = Lucene90CompressingStoredFieldsReader.readZFloat(in);
       assertTrue(in.eof());
       assertEquals(Float.floatToIntBits(f), Float.floatToIntBits(g));
       out.reset(buffer);
@@ -142,12 +143,12 @@ public class TestCompressingStoredFieldsFormat extends BaseStoredFieldsFormatTes
     Random r = random();
     for (int i = 0; i < 100000; i++) {
       float f = r.nextFloat() * (random().nextInt(100) - 50);
-      CompressingStoredFieldsWriter.writeZFloat(out, f);
+      Lucene90CompressingStoredFieldsWriter.writeZFloat(out, f);
       assertTrue(
           "length=" + out.getPosition() + ", f=" + f,
           out.getPosition() <= ((Float.floatToIntBits(f) >>> 31) == 1 ? 5 : 4));
       in.reset(buffer, 0, out.getPosition());
-      float g = CompressingStoredFieldsReader.readZFloat(in);
+      float g = Lucene90CompressingStoredFieldsReader.readZFloat(in);
       assertTrue(in.eof());
       assertEquals(Float.floatToIntBits(f), Float.floatToIntBits(g));
       out.reset(buffer);
@@ -162,9 +163,9 @@ public class TestCompressingStoredFieldsFormat extends BaseStoredFieldsFormatTes
     // round-trip small integer values
     for (int i = Short.MIN_VALUE; i < Short.MAX_VALUE; i++) {
       double x = (double) i;
-      CompressingStoredFieldsWriter.writeZDouble(out, x);
+      Lucene90CompressingStoredFieldsWriter.writeZDouble(out, x);
       in.reset(buffer, 0, out.getPosition());
-      double y = CompressingStoredFieldsReader.readZDouble(in);
+      double y = Lucene90CompressingStoredFieldsReader.readZDouble(in);
       assertTrue(in.eof());
       assertEquals(Double.doubleToLongBits(x), Double.doubleToLongBits(y));
 
@@ -187,9 +188,9 @@ public class TestCompressingStoredFieldsFormat extends BaseStoredFieldsFormatTes
         };
 
     for (double x : special) {
-      CompressingStoredFieldsWriter.writeZDouble(out, x);
+      Lucene90CompressingStoredFieldsWriter.writeZDouble(out, x);
       in.reset(buffer, 0, out.getPosition());
-      double y = CompressingStoredFieldsReader.readZDouble(in);
+      double y = Lucene90CompressingStoredFieldsReader.readZDouble(in);
       assertTrue(in.eof());
       assertEquals(Double.doubleToLongBits(x), Double.doubleToLongBits(y));
       out.reset(buffer);
@@ -199,10 +200,10 @@ public class TestCompressingStoredFieldsFormat extends BaseStoredFieldsFormatTes
     Random r = random();
     for (int i = 0; i < 100000; i++) {
       double x = r.nextDouble() * (random().nextInt(100) - 50);
-      CompressingStoredFieldsWriter.writeZDouble(out, x);
+      Lucene90CompressingStoredFieldsWriter.writeZDouble(out, x);
       assertTrue("length=" + out.getPosition() + ", d=" + x, out.getPosition() <= (x < 0 ? 9 : 8));
       in.reset(buffer, 0, out.getPosition());
-      double y = CompressingStoredFieldsReader.readZDouble(in);
+      double y = Lucene90CompressingStoredFieldsReader.readZDouble(in);
       assertTrue(in.eof());
       assertEquals(Double.doubleToLongBits(x), Double.doubleToLongBits(y));
       out.reset(buffer);
@@ -211,10 +212,10 @@ public class TestCompressingStoredFieldsFormat extends BaseStoredFieldsFormatTes
     // same with floats
     for (int i = 0; i < 100000; i++) {
       double x = (double) (r.nextFloat() * (random().nextInt(100) - 50));
-      CompressingStoredFieldsWriter.writeZDouble(out, x);
+      Lucene90CompressingStoredFieldsWriter.writeZDouble(out, x);
       assertTrue("length=" + out.getPosition() + ", d=" + x, out.getPosition() <= 5);
       in.reset(buffer, 0, out.getPosition());
-      double y = CompressingStoredFieldsReader.readZDouble(in);
+      double y = Lucene90CompressingStoredFieldsReader.readZDouble(in);
       assertTrue(in.eof());
       assertEquals(Double.doubleToLongBits(x), Double.doubleToLongBits(y));
       out.reset(buffer);
@@ -230,9 +231,9 @@ public class TestCompressingStoredFieldsFormat extends BaseStoredFieldsFormatTes
     for (int i = Short.MIN_VALUE; i < Short.MAX_VALUE; i++) {
       for (long mul : new long[] {SECOND, HOUR, DAY}) {
         long l1 = (long) i * mul;
-        CompressingStoredFieldsWriter.writeTLong(out, l1);
+        Lucene90CompressingStoredFieldsWriter.writeTLong(out, l1);
         in.reset(buffer, 0, out.getPosition());
-        long l2 = CompressingStoredFieldsReader.readTLong(in);
+        long l2 = Lucene90CompressingStoredFieldsReader.readTLong(in);
         assertTrue(in.eof());
         assertEquals(l1, l2);
 
@@ -262,9 +263,9 @@ public class TestCompressingStoredFieldsFormat extends BaseStoredFieldsFormatTes
         default:
           break;
       }
-      CompressingStoredFieldsWriter.writeTLong(out, l1);
+      Lucene90CompressingStoredFieldsWriter.writeTLong(out, l1);
       in.reset(buffer, 0, out.getPosition());
-      long l2 = CompressingStoredFieldsReader.readTLong(in);
+      long l2 = Lucene90CompressingStoredFieldsReader.readTLong(in);
       assertTrue(in.eof());
       assertEquals(l1, l2);
       out.reset(buffer);
@@ -297,7 +298,8 @@ public class TestCompressingStoredFieldsFormat extends BaseStoredFieldsFormatTes
     // examine dirty counts:
     for (LeafReaderContext leaf : ir2.leaves()) {
       CodecReader sr = (CodecReader) leaf.reader();
-      CompressingStoredFieldsReader reader = (CompressingStoredFieldsReader) sr.getFieldsReader();
+      Lucene90CompressingStoredFieldsReader reader =
+          (Lucene90CompressingStoredFieldsReader) sr.getFieldsReader();
       assertTrue(reader.getNumDirtyDocs() > 0);
       assertTrue(reader.getNumDirtyDocs() < 100); // can't be gte the number of docs per chunk
       assertEquals(1, reader.getNumDirtyChunks());
@@ -310,7 +312,8 @@ public class TestCompressingStoredFieldsFormat extends BaseStoredFieldsFormatTes
     ir.close();
     ir = ir2;
     CodecReader sr = (CodecReader) getOnlyLeafReader(ir);
-    CompressingStoredFieldsReader reader = (CompressingStoredFieldsReader) sr.getFieldsReader();
+    Lucene90CompressingStoredFieldsReader reader =
+        (Lucene90CompressingStoredFieldsReader) sr.getFieldsReader();
     // we could get lucky, and have zero, but typically one.
     assertTrue(reader.getNumDirtyChunks() <= 1);
     ir.close();
diff --git a/lucene/test-framework/src/test/org/apache/lucene/codecs/compressing/TestCompressingTermVectorsFormat.java b/lucene/test-framework/src/test/org/apache/lucene/codecs/lucene90/compressing/TestCompressingTermVectorsFormat.java
similarity index 97%
rename from lucene/test-framework/src/test/org/apache/lucene/codecs/compressing/TestCompressingTermVectorsFormat.java
rename to lucene/test-framework/src/test/org/apache/lucene/codecs/lucene90/compressing/TestCompressingTermVectorsFormat.java
index 1c8f1d00e9e..02924be5ae2 100644
--- a/lucene/test-framework/src/test/org/apache/lucene/codecs/compressing/TestCompressingTermVectorsFormat.java
+++ b/lucene/test-framework/src/test/org/apache/lucene/codecs/lucene90/compressing/TestCompressingTermVectorsFormat.java
@@ -14,11 +14,12 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.lucene.codecs.compressing;
+package org.apache.lucene.codecs.lucene90.compressing;
 
 import java.io.IOException;
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.codecs.compressing.CompressingCodec;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
diff --git a/solr/core/src/test/org/apache/solr/core/TestCodecSupport.java b/solr/core/src/test/org/apache/solr/core/TestCodecSupport.java
index 5ac97644d20..cd86d4fdc98 100644
--- a/solr/core/src/test/org/apache/solr/core/TestCodecSupport.java
+++ b/solr/core/src/test/org/apache/solr/core/TestCodecSupport.java
@@ -20,8 +20,8 @@
 import java.io.IOException;
 import java.util.Map;
 import org.apache.lucene.codecs.Codec;
-import org.apache.lucene.codecs.lucene87.Lucene87StoredFieldsFormat;
 import org.apache.lucene.codecs.lucene90.Lucene90Codec.Mode;
+import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat;
 import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
 import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
 import org.apache.lucene.index.SegmentInfo;
@@ -117,11 +117,11 @@ public class TestCodecSupport extends SolrTestCaseJ4 {
       SegmentInfos infos = SegmentInfos.readLatestCommit(searcher.getIndexReader().directory());
       SegmentInfo info = infos.info(infos.size() - 1).info;
       assertEquals("Expecting compression mode string to be " + expectedModeString +
-          " but got: " + info.getAttribute(Lucene87StoredFieldsFormat.MODE_KEY) +
+          " but got: " + info.getAttribute(Lucene90StoredFieldsFormat.MODE_KEY) +
           "\n SegmentInfo: " + info +
           "\n SegmentInfos: " + infos +
           "\n Codec: " + core.getCodec(),
-          expectedModeString, info.getAttribute(Lucene87StoredFieldsFormat.MODE_KEY));
+          expectedModeString, info.getAttribute(Lucene90StoredFieldsFormat.MODE_KEY));
       return null;
     });
   }
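The MODE_KEY attribute that TestCodecSupport inspects just above is recorded per segment by Lucene90StoredFieldsFormat, so the same check works outside Solr against any index written with the new codec. A minimal standalone sketch; the class name, the path argument, and the printing are illustrative assumptions:

    import java.io.IOException;
    import java.nio.file.Paths;
    import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat;
    import org.apache.lucene.index.SegmentCommitInfo;
    import org.apache.lucene.index.SegmentInfos;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;

    public class PrintStoredFieldsModes {
      public static void main(String[] args) throws IOException {
        // Point this at any index written with the Lucene90 codec.
        try (Directory dir = FSDirectory.open(Paths.get(args[0]))) {
          SegmentInfos infos = SegmentInfos.readLatestCommit(dir);
          for (SegmentCommitInfo sci : infos) {
            // "BEST_SPEED" or "BEST_COMPRESSION", as recorded per segment
            String mode = sci.info.getAttribute(Lucene90StoredFieldsFormat.MODE_KEY);
            System.out.println(sci.info.name + " -> " + mode);
          }
        }
      }
    }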
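On the writeZFloat/readZFloat round-trips in TestCompressingStoredFieldsFormat above: the size bounds the tests assert (at most 4 bytes for a non-negative float, 5 for a negative one) follow from the compact float scheme used by the stored fields writer. The sketch below reconstructs that scheme from the test's assertions; since the real writeZFloat/readZFloat are package-private, treat this as an approximation for illustration, not the authoritative implementation:

    import java.io.ByteArrayOutputStream;

    // Illustrative reconstruction: integral floats in [-1, 125] take one byte
    // (high bit set), other positive floats four raw bytes (first byte <= 0x7F),
    // and other negative floats an 0xFF marker followed by four raw bytes.
    // The three first-byte ranges are disjoint, so decoding is unambiguous.
    final class ZFloatSketch {
      private static final int NEGATIVE_ZERO_BITS = Float.floatToIntBits(-0f);

      static void writeZFloat(ByteArrayOutputStream out, float f) {
        int intVal = (int) f;
        int bits = Float.floatToIntBits(f);
        if (f == intVal && intVal >= -1 && intVal <= 0x7D && bits != NEGATIVE_ZERO_BITS) {
          out.write(0x80 | (1 + intVal)); // 1 byte in 0x80..0xFE
        } else if ((bits >>> 31) == 0) {
          writeIntBE(out, bits); // positive: 4 bytes, leading byte <= 0x7F
        } else {
          out.write(0xFF); // negative marker, then the 4 raw bytes
          writeIntBE(out, bits);
        }
      }

      static float readZFloat(byte[] buf, int off) {
        int b = buf[off] & 0xFF;
        if (b == 0xFF) {
          return Float.intBitsToFloat(readIntBE(buf, off + 1)); // negative float
        } else if ((b & 0x80) != 0) {
          return (b & 0x7F) - 1; // small integral value in [-1, 125]
        } else {
          return Float.intBitsToFloat(readIntBE(buf, off)); // positive float
        }
      }

      private static void writeIntBE(ByteArrayOutputStream out, int v) {
        out.write(v >>> 24);
        out.write((v >>> 16) & 0xFF);
        out.write((v >>> 8) & 0xFF);
        out.write(v & 0xFF);
      }

      private static int readIntBE(byte[] buf, int off) {
        return ((buf[off] & 0xFF) << 24)
            | ((buf[off + 1] & 0xFF) << 16)
            | ((buf[off + 2] & 0xFF) << 8)
            | (buf[off + 3] & 0xFF);
      }
    }

This is why the tests special-case the sign bit: any float with the sign bit set that is not a small integer pays the one-byte 0xFF marker, giving the 5-byte worst case, while every non-negative float fits in 4 bytes or fewer.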