LUCENE-5842: Validate checksum footers for postings lists/docvalues/storedfields/vectors on init

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1612845 13f79535-47bb-0310-9956-ffa450edef68
2014-07-23 14:56:37 +00:00 · 2014-07-23 14:56:37 +00:00 · 26e5273658
parent 430c264afa
commit 26e5273658
15 changed files with 125 additions and 1 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -128,6 +128,11 @@ New Features

 * LUCENE-5825: Benchmark module can use custom postings format, e.g.:
 codec.postingsFormat=Memory (Varun Shenoy, David Smiley)
+
+* LUCENE-5842: When opening large files (where its to expensive to compare
+  checksum against all the bytes), retrieve checksum to validate structure
+  of footer, this can detect some forms of corruption such as truncation.
+  (Robert Muir)
  
 API Changes

--- a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java
@ -127,6 +127,14 @@ public class BlockTermsReader extends FieldsProducer {

      // Have PostingsReader init itself
      postingsReader.init(in);
+      
+      if (version >= BlockTermsWriter.VERSION_CHECKSUM) {      
+        // NOTE: data file is too costly to verify checksum against all the bytes on open,
+        // but for now we at least verify proper structure of the checksum footer: which looks
+        // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
+        // such as file truncation.
+        CodecUtil.retrieveChecksum(in);
+      }

      // Read per-field details
      seekDir(in, dirOffset);
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsReader.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsReader.java
@ -103,6 +103,13 @@ public final class OrdsBlockTreeTermsReader extends FieldsProducer {

      // Have PostingsReader init itself
      postingsReader.init(in);
+      
+      
+      // NOTE: data file is too costly to verify checksum against all the bytes on open,
+      // but for now we at least verify proper structure of the checksum footer: which looks
+      // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
+      // such as file truncation.
+      CodecUtil.retrieveChecksum(in);

      // Read per-field details
      seekDir(in, dirOffset);
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectDocValuesProducer.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectDocValuesProducer.java
@ -117,6 +117,12 @@ class DirectDocValuesProducer extends DocValuesProducer {
      if (version != version2) {
        throw new CorruptIndexException("Format versions mismatch");
      }
+      
+      // NOTE: data file is too costly to verify checksum against all the bytes on open,
+      // but for now we at least verify proper structure of the checksum footer: which looks
+      // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
+      // such as file truncation.
+      CodecUtil.retrieveChecksum(data);

      success = true;
    } finally {
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesProducer.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesProducer.java
@ -136,6 +136,12 @@ class MemoryDocValuesProducer extends DocValuesProducer {
      if (version != version2) {
        throw new CorruptIndexException("Format versions mismatch");
      }
+      
+      // NOTE: data file is too costly to verify checksum against all the bytes on open,
+      // but for now we at least verify proper structure of the checksum footer: which looks
+      // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
+      // such as file truncation.
+      CodecUtil.retrieveChecksum(data);

      success = true;
    } finally {
--- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java
@ -122,6 +122,15 @@ public final class BlockTreeTermsReader extends FieldsProducer {

      // Have PostingsReader init itself
      postingsReader.init(in);
+      
+      
+      // NOTE: data file is too costly to verify checksum against all the bytes on open,
+      // but for now we at least verify proper structure of the checksum footer: which looks
+      // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
+      // such as file truncation.
+      if (version >= BlockTreeTermsWriter.VERSION_CHECKSUM) {
+        CodecUtil.retrieveChecksum(in);
+      }

      // Read per-field details
      seekDir(in, dirOffset);
--- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java
@ -52,7 +52,6 @@ import org.apache.lucene.store.BufferedChecksumIndexInput;
 import org.apache.lucene.store.ByteArrayDataInput;
 import org.apache.lucene.store.ChecksumIndexInput;
 import org.apache.lucene.store.DataInput;
-import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
 import org.apache.lucene.store.IndexInput;
@ -154,6 +153,14 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader {
      packedIntsVersion = fieldsStream.readVInt();
      decompressor = compressionMode.newDecompressor();
      this.bytes = new BytesRef();
+      
+      if (version >= VERSION_CHECKSUM) {
+        // NOTE: data file is too costly to verify checksum against all the bytes on open,
+        // but for now we at least verify proper structure of the checksum footer: which looks
+        // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
+        // such as file truncation.
+        CodecUtil.retrieveChecksum(fieldsStream);
+      }

      success = true;
    } finally {
--- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsReader.java
@ -131,6 +131,16 @@ public final class CompressingTermVectorsReader extends TermVectorsReader implem
        throw new CorruptIndexException("Version mismatch between stored fields index and data: " + version + " != " + version2);
      }
      assert CodecUtil.headerLength(codecNameDat) == vectorsStream.getFilePointer();
+      
+      long pos = vectorsStream.getFilePointer();
+      if (version >= VERSION_CHECKSUM) {
+        // NOTE: data file is too costly to verify checksum against all the bytes on open,
+        // but for now we at least verify proper structure of the checksum footer: which looks
+        // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
+        // such as file truncation.
+        CodecUtil.retrieveChecksum(vectorsStream);
+        vectorsStream.seek(pos);
+      }

      packedIntsVersion = vectorsStream.readVInt();
      chunkSize = vectorsStream.readVInt();
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java
@ -79,16 +79,40 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
                            Lucene41PostingsWriter.VERSION_START,
                            Lucene41PostingsWriter.VERSION_CURRENT);
      forUtil = new ForUtil(docIn);
+      
+      if (version >= Lucene41PostingsWriter.VERSION_CHECKSUM) {
+        // NOTE: data file is too costly to verify checksum against all the bytes on open,
+        // but for now we at least verify proper structure of the checksum footer: which looks
+        // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
+        // such as file truncation.
+        CodecUtil.retrieveChecksum(docIn);
+      }

      if (fieldInfos.hasProx()) {
        posIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, Lucene41PostingsFormat.POS_EXTENSION),
                              ioContext);
        CodecUtil.checkHeader(posIn, Lucene41PostingsWriter.POS_CODEC, version, version);
+        
+        if (version >= Lucene41PostingsWriter.VERSION_CHECKSUM) {
+          // NOTE: data file is too costly to verify checksum against all the bytes on open,
+          // but for now we at least verify proper structure of the checksum footer: which looks
+          // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
+          // such as file truncation.
+          CodecUtil.retrieveChecksum(posIn);
+        }

        if (fieldInfos.hasPayloads() || fieldInfos.hasOffsets()) {
          payIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, Lucene41PostingsFormat.PAY_EXTENSION),
                                ioContext);
          CodecUtil.checkHeader(payIn, Lucene41PostingsWriter.PAY_CODEC, version, version);
+          
+          if (version >= Lucene41PostingsWriter.VERSION_CHECKSUM) {
+            // NOTE: data file is too costly to verify checksum against all the bytes on open,
+            // but for now we at least verify proper structure of the checksum footer: which looks
+            // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
+            // such as file truncation.
+            CodecUtil.retrieveChecksum(payIn);
+          }
        }
      }

--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesProducer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesProducer.java
@ -138,6 +138,14 @@ class Lucene42DocValuesProducer extends DocValuesProducer {
      if (version != version2) {
        throw new CorruptIndexException("Format versions mismatch");
      }
+      
+      if (version >= VERSION_CHECKSUM) {
+        // NOTE: data file is too costly to verify checksum against all the bytes on open,
+        // but for now we at least verify proper structure of the checksum footer: which looks
+        // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
+        // such as file truncation.
+        CodecUtil.retrieveChecksum(data);
+      }

      success = true;
    } finally {
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesProducer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesProducer.java
@ -138,6 +138,14 @@ class Lucene45DocValuesProducer extends DocValuesProducer implements Closeable {
      if (version != version2) {
        throw new CorruptIndexException("Format versions mismatch");
      }
+      
+      if (version >= Lucene45DocValuesFormat.VERSION_CHECKSUM) {
+        // NOTE: data file is too costly to verify checksum against all the bytes on open,
+        // but for now we at least verify proper structure of the checksum footer: which looks
+        // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
+        // such as file truncation.
+        CodecUtil.retrieveChecksum(data);
+      }

      success = true;
    } finally {
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene49/Lucene49DocValuesProducer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene49/Lucene49DocValuesProducer.java
@ -118,6 +118,12 @@ class Lucene49DocValuesProducer extends DocValuesProducer implements Closeable {
      if (version != version2) {
        throw new CorruptIndexException("Format versions mismatch");
      }
+      
+      // NOTE: data file is too costly to verify checksum against all the bytes on open,
+      // but for now we at least verify proper structure of the checksum footer: which looks
+      // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
+      // such as file truncation.
+      CodecUtil.retrieveChecksum(data);

      success = true;
    } finally {
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene49/Lucene49NormsProducer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene49/Lucene49NormsProducer.java
@ -92,6 +92,12 @@ class Lucene49NormsProducer extends DocValuesProducer {
      if (version != version2) {
        throw new CorruptIndexException("Format versions mismatch");
      }
+      
+      // NOTE: data file is too costly to verify checksum against all the bytes on open,
+      // but for now we at least verify proper structure of the checksum footer: which looks
+      // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
+      // such as file truncation.
+      CodecUtil.retrieveChecksum(data);

      success = true;
    } finally {
--- a/lucene/core/src/java/org/apache/lucene/store/CompoundFileDirectory.java
+++ b/lucene/core/src/java/org/apache/lucene/store/CompoundFileDirectory.java
@ -104,6 +104,14 @@ public final class CompoundFileDirectory extends BaseDirectory {
      handle = directory.openInput(fileName, context);
      try {
        this.entries = readEntries(directory, fileName);
+        if (version >= CompoundFileWriter.VERSION_CHECKSUM) {
+          CodecUtil.checkHeader(handle, CompoundFileWriter.DATA_CODEC, version, version);
+          // NOTE: data file is too costly to verify checksum against all the bytes on open,
+          // but for now we at least verify proper structure of the checksum footer: which looks
+          // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
+          // such as file truncation.
+          CodecUtil.retrieveChecksum(handle);
+        }
        success = true;
      } finally {
        if (!success) {
--- a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsReader.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsReader.java
@ -96,6 +96,12 @@ public final class VersionBlockTreeTermsReader extends FieldsProducer {

      // Have PostingsReader init itself
      postingsReader.init(in);
+      
+      // NOTE: data file is too costly to verify checksum against all the bytes on open,
+      // but for now we at least verify proper structure of the checksum footer: which looks
+      // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
+      // such as file truncation.
+      CodecUtil.retrieveChecksum(in);

      // Read per-field details
      seekDir(in, dirOffset);