LUCENE-5842: Validate checksum footers for postings lists/docvalues/storedfields/vectors on init

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1612845 13f79535-47bb-0310-9956-ffa450edef68
2014-07-23 14:56:37 +00:00 · 2014-07-23 14:56:37 +00:00 · 26e5273658
parent 430c264afa
commit 26e5273658
15 changed files with 125 additions and 1 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -128,6 +128,11 @@ New Features
 * LUCENE-5825: Benchmark module can use custom postings format, e.g.:
 codec.postingsFormat=Memory (Varun Shenoy, David Smiley)
 * LUCENE-5842: When opening large files (where it's too expensive to compare
  checksum against all the bytes), retrieve checksum to validate structure
  of footer; this can detect some forms of corruption such as truncation.
  (Robert Muir)
 API Changes
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java
@ -127,6 +127,14 @@ public class BlockTermsReader extends FieldsProducer {
      // Have PostingsReader init itself
      postingsReader.init(in);
      if (version >= BlockTermsWriter.VERSION_CHECKSUM) {      
        // NOTE: data file is too costly to verify checksum against all the bytes on open,
        // but for now we at least verify proper structure of the checksum footer: which looks
        // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
        // such as file truncation.
        CodecUtil.retrieveChecksum(in);
      }
      // Read per-field details
      seekDir(in, dirOffset);
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsReader.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsReader.java
@ -103,6 +103,13 @@ public final class OrdsBlockTreeTermsReader extends FieldsProducer {
      // Have PostingsReader init itself
      postingsReader.init(in);
      // NOTE: data file is too costly to verify checksum against all the bytes on open,
      // but for now we at least verify proper structure of the checksum footer: which looks
      // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
      // such as file truncation.
      CodecUtil.retrieveChecksum(in);
      // Read per-field details
      seekDir(in, dirOffset);
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectDocValuesProducer.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectDocValuesProducer.java
@ -117,6 +117,12 @@ class DirectDocValuesProducer extends DocValuesProducer {
      if (version != version2) {
        throw new CorruptIndexException("Format versions mismatch");
      }
      // NOTE: data file is too costly to verify checksum against all the bytes on open,
      // but for now we at least verify proper structure of the checksum footer: which looks
      // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
      // such as file truncation.
      CodecUtil.retrieveChecksum(data);
      success = true;
    } finally {
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesProducer.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesProducer.java
@ -136,6 +136,12 @@ class MemoryDocValuesProducer extends DocValuesProducer {
      if (version != version2) {
        throw new CorruptIndexException("Format versions mismatch");
      }
      // NOTE: data file is too costly to verify checksum against all the bytes on open,
      // but for now we at least verify proper structure of the checksum footer: which looks
      // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
      // such as file truncation.
      CodecUtil.retrieveChecksum(data);
      success = true;
    } finally {
--- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java
@ -122,6 +122,15 @@ public final class BlockTreeTermsReader extends FieldsProducer {
      // Have PostingsReader init itself
      postingsReader.init(in);
      // NOTE: data file is too costly to verify checksum against all the bytes on open,
      // but for now we at least verify proper structure of the checksum footer: which looks
      // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
      // such as file truncation.
      if (version >= BlockTreeTermsWriter.VERSION_CHECKSUM) {
        CodecUtil.retrieveChecksum(in);
      }
      // Read per-field details
      seekDir(in, dirOffset);
--- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java
@ -52,7 +52,6 @@ import org.apache.lucene.store.BufferedChecksumIndexInput;
 import org.apache.lucene.store.ByteArrayDataInput;
 import org.apache.lucene.store.ChecksumIndexInput;
 import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
 import org.apache.lucene.store.IndexInput;
@ -154,6 +153,14 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader {
      packedIntsVersion = fieldsStream.readVInt();
      decompressor = compressionMode.newDecompressor();
      this.bytes = new BytesRef();
      if (version >= VERSION_CHECKSUM) {
        // NOTE: data file is too costly to verify checksum against all the bytes on open,
        // but for now we at least verify proper structure of the checksum footer: which looks
        // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
        // such as file truncation.
        CodecUtil.retrieveChecksum(fieldsStream);
      }
      success = true;
    } finally {
--- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsReader.java
@ -131,6 +131,16 @@ public final class CompressingTermVectorsReader extends TermVectorsReader implem
        throw new CorruptIndexException("Version mismatch between stored fields index and data: " + version + " != " + version2);
      }
      assert CodecUtil.headerLength(codecNameDat) == vectorsStream.getFilePointer();
      long pos = vectorsStream.getFilePointer();
      if (version >= VERSION_CHECKSUM) {
        // NOTE: data file is too costly to verify checksum against all the bytes on open,
        // but for now we at least verify proper structure of the checksum footer: which looks
        // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
        // such as file truncation.
        CodecUtil.retrieveChecksum(vectorsStream);
        vectorsStream.seek(pos);
      }
      packedIntsVersion = vectorsStream.readVInt();
      chunkSize = vectorsStream.readVInt();
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java
@ -79,16 +79,40 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
                            Lucene41PostingsWriter.VERSION_START,
                            Lucene41PostingsWriter.VERSION_CURRENT);
      forUtil = new ForUtil(docIn);
      if (version >= Lucene41PostingsWriter.VERSION_CHECKSUM) {
        // NOTE: data file is too costly to verify checksum against all the bytes on open,
        // but for now we at least verify proper structure of the checksum footer: which looks
        // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
        // such as file truncation.
        CodecUtil.retrieveChecksum(docIn);
      }
      if (fieldInfos.hasProx()) {
        posIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, Lucene41PostingsFormat.POS_EXTENSION),
                              ioContext);
        CodecUtil.checkHeader(posIn, Lucene41PostingsWriter.POS_CODEC, version, version);
        if (version >= Lucene41PostingsWriter.VERSION_CHECKSUM) {
          // NOTE: data file is too costly to verify checksum against all the bytes on open,
          // but for now we at least verify proper structure of the checksum footer: which looks
          // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
          // such as file truncation.
          CodecUtil.retrieveChecksum(posIn);
        }
        if (fieldInfos.hasPayloads() || fieldInfos.hasOffsets()) {
          payIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, Lucene41PostingsFormat.PAY_EXTENSION),
                                ioContext);
          CodecUtil.checkHeader(payIn, Lucene41PostingsWriter.PAY_CODEC, version, version);
          if (version >= Lucene41PostingsWriter.VERSION_CHECKSUM) {
            // NOTE: data file is too costly to verify checksum against all the bytes on open,
            // but for now we at least verify proper structure of the checksum footer: which looks
            // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
            // such as file truncation.
            CodecUtil.retrieveChecksum(payIn);
          }
        }
      }
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesProducer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesProducer.java
@ -138,6 +138,14 @@ class Lucene42DocValuesProducer extends DocValuesProducer {
      if (version != version2) {
        throw new CorruptIndexException("Format versions mismatch");
      }
      if (version >= VERSION_CHECKSUM) {
        // NOTE: data file is too costly to verify checksum against all the bytes on open,
        // but for now we at least verify proper structure of the checksum footer: which looks
        // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
        // such as file truncation.
        CodecUtil.retrieveChecksum(data);
      }
      success = true;
    } finally {
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesProducer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesProducer.java
@ -138,6 +138,14 @@ class Lucene45DocValuesProducer extends DocValuesProducer implements Closeable {
      if (version != version2) {
        throw new CorruptIndexException("Format versions mismatch");
      }
      if (version >= Lucene45DocValuesFormat.VERSION_CHECKSUM) {
        // NOTE: data file is too costly to verify checksum against all the bytes on open,
        // but for now we at least verify proper structure of the checksum footer: which looks
        // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
        // such as file truncation.
        CodecUtil.retrieveChecksum(data);
      }
      success = true;
    } finally {
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene49/Lucene49DocValuesProducer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene49/Lucene49DocValuesProducer.java
@ -118,6 +118,12 @@ class Lucene49DocValuesProducer extends DocValuesProducer implements Closeable {
      if (version != version2) {
        throw new CorruptIndexException("Format versions mismatch");
      }
      // NOTE: data file is too costly to verify checksum against all the bytes on open,
      // but for now we at least verify proper structure of the checksum footer: which looks
      // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
      // such as file truncation.
      CodecUtil.retrieveChecksum(data);
      success = true;
    } finally {
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene49/Lucene49NormsProducer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene49/Lucene49NormsProducer.java
@ -92,6 +92,12 @@ class Lucene49NormsProducer extends DocValuesProducer {
      if (version != version2) {
        throw new CorruptIndexException("Format versions mismatch");
      }
      // NOTE: data file is too costly to verify checksum against all the bytes on open,
      // but for now we at least verify proper structure of the checksum footer: which looks
      // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
      // such as file truncation.
      CodecUtil.retrieveChecksum(data);
      success = true;
    } finally {
--- a/lucene/core/src/java/org/apache/lucene/store/CompoundFileDirectory.java
+++ b/lucene/core/src/java/org/apache/lucene/store/CompoundFileDirectory.java
@ -104,6 +104,14 @@ public final class CompoundFileDirectory extends BaseDirectory {
      handle = directory.openInput(fileName, context);
      try {
        this.entries = readEntries(directory, fileName);
        if (version >= CompoundFileWriter.VERSION_CHECKSUM) {
          CodecUtil.checkHeader(handle, CompoundFileWriter.DATA_CODEC, version, version);
          // NOTE: data file is too costly to verify checksum against all the bytes on open,
          // but for now we at least verify proper structure of the checksum footer: which looks
          // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
          // such as file truncation.
          CodecUtil.retrieveChecksum(handle);
        }
        success = true;
      } finally {
        if (!success) {
--- a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsReader.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsReader.java
@ -96,6 +96,12 @@ public final class VersionBlockTreeTermsReader extends FieldsProducer {
      // Have PostingsReader init itself
      postingsReader.init(in);
      // NOTE: data file is too costly to verify checksum against all the bytes on open,
      // but for now we at least verify proper structure of the checksum footer: which looks
      // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
      // such as file truncation.
      CodecUtil.retrieveChecksum(in);
      // Read per-field details
      seekDir(in, dirOffset);