diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 887e91f45b0..83e418edbaa 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -128,6 +128,11 @@ New Features * LUCENE-5825: Benchmark module can use custom postings format, e.g.: codec.postingsFormat=Memory (Varun Shenoy, David Smiley) + +* LUCENE-5842: When opening large files (where its to expensive to compare + checksum against all the bytes), retrieve checksum to validate structure + of footer, this can detect some forms of corruption such as truncation. + (Robert Muir) API Changes diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java index 70f2b12ce6f..0e1240a06bf 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java @@ -127,6 +127,14 @@ public class BlockTermsReader extends FieldsProducer { // Have PostingsReader init itself postingsReader.init(in); + + if (version >= BlockTermsWriter.VERSION_CHECKSUM) { + // NOTE: data file is too costly to verify checksum against all the bytes on open, + // but for now we at least verify proper structure of the checksum footer: which looks + // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption + // such as file truncation. + CodecUtil.retrieveChecksum(in); + } // Read per-field details seekDir(in, dirOffset); diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsReader.java index adce3e12011..a5f2a1472da 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsReader.java @@ -103,6 +103,13 @@ public final class OrdsBlockTreeTermsReader extends FieldsProducer { // Have PostingsReader init itself postingsReader.init(in); + + + // NOTE: data file is too costly to verify checksum against all the bytes on open, + // but for now we at least verify proper structure of the checksum footer: which looks + // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption + // such as file truncation. + CodecUtil.retrieveChecksum(in); // Read per-field details seekDir(in, dirOffset); diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectDocValuesProducer.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectDocValuesProducer.java index 175f1164e78..562119fc615 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectDocValuesProducer.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectDocValuesProducer.java @@ -117,6 +117,12 @@ class DirectDocValuesProducer extends DocValuesProducer { if (version != version2) { throw new CorruptIndexException("Format versions mismatch"); } + + // NOTE: data file is too costly to verify checksum against all the bytes on open, + // but for now we at least verify proper structure of the checksum footer: which looks + // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption + // such as file truncation. + CodecUtil.retrieveChecksum(data); success = true; } finally { diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesProducer.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesProducer.java index 5fc6295da34..97bfd654c0c 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesProducer.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesProducer.java @@ -136,6 +136,12 @@ class MemoryDocValuesProducer extends DocValuesProducer { if (version != version2) { throw new CorruptIndexException("Format versions mismatch"); } + + // NOTE: data file is too costly to verify checksum against all the bytes on open, + // but for now we at least verify proper structure of the checksum footer: which looks + // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption + // such as file truncation. + CodecUtil.retrieveChecksum(data); success = true; } finally { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java index 33fb5faab37..d475e6e6eba 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java @@ -122,6 +122,15 @@ public final class BlockTreeTermsReader extends FieldsProducer { // Have PostingsReader init itself postingsReader.init(in); + + + // NOTE: data file is too costly to verify checksum against all the bytes on open, + // but for now we at least verify proper structure of the checksum footer: which looks + // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption + // such as file truncation. + if (version >= BlockTreeTermsWriter.VERSION_CHECKSUM) { + CodecUtil.retrieveChecksum(in); + } // Read per-field details seekDir(in, dirOffset); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java index a2f5862828c..18649d1b7b1 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java @@ -52,7 +52,6 @@ import org.apache.lucene.store.BufferedChecksumIndexInput; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.ChecksumIndexInput; import org.apache.lucene.store.DataInput; -import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; @@ -154,6 +153,14 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader { packedIntsVersion = fieldsStream.readVInt(); decompressor = compressionMode.newDecompressor(); this.bytes = new BytesRef(); + + if (version >= VERSION_CHECKSUM) { + // NOTE: data file is too costly to verify checksum against all the bytes on open, + // but for now we at least verify proper structure of the checksum footer: which looks + // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption + // such as file truncation. + CodecUtil.retrieveChecksum(fieldsStream); + } success = true; } finally { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsReader.java index a56006dc76f..5bfc04a2670 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsReader.java @@ -131,6 +131,16 @@ public final class CompressingTermVectorsReader extends TermVectorsReader implem throw new CorruptIndexException("Version mismatch between stored fields index and data: " + version + " != " + version2); } assert CodecUtil.headerLength(codecNameDat) == vectorsStream.getFilePointer(); + + long pos = vectorsStream.getFilePointer(); + if (version >= VERSION_CHECKSUM) { + // NOTE: data file is too costly to verify checksum against all the bytes on open, + // but for now we at least verify proper structure of the checksum footer: which looks + // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption + // such as file truncation. + CodecUtil.retrieveChecksum(vectorsStream); + vectorsStream.seek(pos); + } packedIntsVersion = vectorsStream.readVInt(); chunkSize = vectorsStream.readVInt(); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java index e5bcda30409..68881e8cfe4 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java @@ -79,16 +79,40 @@ public final class Lucene41PostingsReader extends PostingsReaderBase { Lucene41PostingsWriter.VERSION_START, Lucene41PostingsWriter.VERSION_CURRENT); forUtil = new ForUtil(docIn); + + if (version >= Lucene41PostingsWriter.VERSION_CHECKSUM) { + // NOTE: data file is too costly to verify checksum against all the bytes on open, + // but for now we at least verify proper structure of the checksum footer: which looks + // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption + // such as file truncation. + CodecUtil.retrieveChecksum(docIn); + } if (fieldInfos.hasProx()) { posIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, Lucene41PostingsFormat.POS_EXTENSION), ioContext); CodecUtil.checkHeader(posIn, Lucene41PostingsWriter.POS_CODEC, version, version); + + if (version >= Lucene41PostingsWriter.VERSION_CHECKSUM) { + // NOTE: data file is too costly to verify checksum against all the bytes on open, + // but for now we at least verify proper structure of the checksum footer: which looks + // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption + // such as file truncation. + CodecUtil.retrieveChecksum(posIn); + } if (fieldInfos.hasPayloads() || fieldInfos.hasOffsets()) { payIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, Lucene41PostingsFormat.PAY_EXTENSION), ioContext); CodecUtil.checkHeader(payIn, Lucene41PostingsWriter.PAY_CODEC, version, version); + + if (version >= Lucene41PostingsWriter.VERSION_CHECKSUM) { + // NOTE: data file is too costly to verify checksum against all the bytes on open, + // but for now we at least verify proper structure of the checksum footer: which looks + // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption + // such as file truncation. + CodecUtil.retrieveChecksum(payIn); + } } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesProducer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesProducer.java index bbfc8a8548c..fffb702401c 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesProducer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesProducer.java @@ -138,6 +138,14 @@ class Lucene42DocValuesProducer extends DocValuesProducer { if (version != version2) { throw new CorruptIndexException("Format versions mismatch"); } + + if (version >= VERSION_CHECKSUM) { + // NOTE: data file is too costly to verify checksum against all the bytes on open, + // but for now we at least verify proper structure of the checksum footer: which looks + // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption + // such as file truncation. + CodecUtil.retrieveChecksum(data); + } success = true; } finally { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesProducer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesProducer.java index 95f46dc4cd6..89d06324e1e 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesProducer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesProducer.java @@ -138,6 +138,14 @@ class Lucene45DocValuesProducer extends DocValuesProducer implements Closeable { if (version != version2) { throw new CorruptIndexException("Format versions mismatch"); } + + if (version >= Lucene45DocValuesFormat.VERSION_CHECKSUM) { + // NOTE: data file is too costly to verify checksum against all the bytes on open, + // but for now we at least verify proper structure of the checksum footer: which looks + // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption + // such as file truncation. + CodecUtil.retrieveChecksum(data); + } success = true; } finally { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene49/Lucene49DocValuesProducer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene49/Lucene49DocValuesProducer.java index 33713c43590..70b8dd8d4f0 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene49/Lucene49DocValuesProducer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene49/Lucene49DocValuesProducer.java @@ -118,6 +118,12 @@ class Lucene49DocValuesProducer extends DocValuesProducer implements Closeable { if (version != version2) { throw new CorruptIndexException("Format versions mismatch"); } + + // NOTE: data file is too costly to verify checksum against all the bytes on open, + // but for now we at least verify proper structure of the checksum footer: which looks + // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption + // such as file truncation. + CodecUtil.retrieveChecksum(data); success = true; } finally { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene49/Lucene49NormsProducer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene49/Lucene49NormsProducer.java index 4c5d9a411a0..57073f0690c 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene49/Lucene49NormsProducer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene49/Lucene49NormsProducer.java @@ -92,6 +92,12 @@ class Lucene49NormsProducer extends DocValuesProducer { if (version != version2) { throw new CorruptIndexException("Format versions mismatch"); } + + // NOTE: data file is too costly to verify checksum against all the bytes on open, + // but for now we at least verify proper structure of the checksum footer: which looks + // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption + // such as file truncation. + CodecUtil.retrieveChecksum(data); success = true; } finally { diff --git a/lucene/core/src/java/org/apache/lucene/store/CompoundFileDirectory.java b/lucene/core/src/java/org/apache/lucene/store/CompoundFileDirectory.java index f215b00aa23..42f7a17e70d 100644 --- a/lucene/core/src/java/org/apache/lucene/store/CompoundFileDirectory.java +++ b/lucene/core/src/java/org/apache/lucene/store/CompoundFileDirectory.java @@ -104,6 +104,14 @@ public final class CompoundFileDirectory extends BaseDirectory { handle = directory.openInput(fileName, context); try { this.entries = readEntries(directory, fileName); + if (version >= CompoundFileWriter.VERSION_CHECKSUM) { + CodecUtil.checkHeader(handle, CompoundFileWriter.DATA_CODEC, version, version); + // NOTE: data file is too costly to verify checksum against all the bytes on open, + // but for now we at least verify proper structure of the checksum footer: which looks + // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption + // such as file truncation. + CodecUtil.retrieveChecksum(handle); + } success = true; } finally { if (!success) { diff --git a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsReader.java b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsReader.java index 3b8172630c2..558dd49069f 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsReader.java @@ -96,6 +96,12 @@ public final class VersionBlockTreeTermsReader extends FieldsProducer { // Have PostingsReader init itself postingsReader.init(in); + + // NOTE: data file is too costly to verify checksum against all the bytes on open, + // but for now we at least verify proper structure of the checksum footer: which looks + // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption + // such as file truncation. + CodecUtil.retrieveChecksum(in); // Read per-field details seekDir(in, dirOffset);