LUCENE-5842: Validate checksum footers for postings lists/docvalues/storedfields/vectors on init

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1612845 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2014-07-23 14:56:37 +00:00
parent 430c264afa
commit 26e5273658
15 changed files with 125 additions and 1 deletions

View File

@ -129,6 +129,11 @@ New Features
* LUCENE-5825: Benchmark module can use custom postings format, e.g.: * LUCENE-5825: Benchmark module can use custom postings format, e.g.:
codec.postingsFormat=Memory (Varun Shenoy, David Smiley) codec.postingsFormat=Memory (Varun Shenoy, David Smiley)
* LUCENE-5842: When opening large files (where it's too expensive to compare
the checksum against all the bytes), retrieve the checksum to validate the
structure of the footer; this can detect some forms of corruption such as truncation.
(Robert Muir)
API Changes API Changes
* LUCENE-5752: Simplified Automaton API to be immutable. (Mike McCandless) * LUCENE-5752: Simplified Automaton API to be immutable. (Mike McCandless)

View File

@ -128,6 +128,14 @@ public class BlockTermsReader extends FieldsProducer {
// Have PostingsReader init itself // Have PostingsReader init itself
postingsReader.init(in); postingsReader.init(in);
if (version >= BlockTermsWriter.VERSION_CHECKSUM) {
// NOTE: data file is too costly to verify checksum against all the bytes on open,
// but for now we at least verify proper structure of the checksum footer: which looks
// for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
// such as file truncation.
CodecUtil.retrieveChecksum(in);
}
// Read per-field details // Read per-field details
seekDir(in, dirOffset); seekDir(in, dirOffset);

View File

@ -104,6 +104,13 @@ public final class OrdsBlockTreeTermsReader extends FieldsProducer {
// Have PostingsReader init itself // Have PostingsReader init itself
postingsReader.init(in); postingsReader.init(in);
// NOTE: data file is too costly to verify checksum against all the bytes on open,
// but for now we at least verify proper structure of the checksum footer: which looks
// for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
// such as file truncation.
CodecUtil.retrieveChecksum(in);
// Read per-field details // Read per-field details
seekDir(in, dirOffset); seekDir(in, dirOffset);
seekDir(indexIn, indexDirOffset); seekDir(indexIn, indexDirOffset);

View File

@ -118,6 +118,12 @@ class DirectDocValuesProducer extends DocValuesProducer {
throw new CorruptIndexException("Format versions mismatch"); throw new CorruptIndexException("Format versions mismatch");
} }
// NOTE: data file is too costly to verify checksum against all the bytes on open,
// but for now we at least verify proper structure of the checksum footer: which looks
// for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
// such as file truncation.
CodecUtil.retrieveChecksum(data);
success = true; success = true;
} finally { } finally {
if (!success) { if (!success) {

View File

@ -137,6 +137,12 @@ class MemoryDocValuesProducer extends DocValuesProducer {
throw new CorruptIndexException("Format versions mismatch"); throw new CorruptIndexException("Format versions mismatch");
} }
// NOTE: data file is too costly to verify checksum against all the bytes on open,
// but for now we at least verify proper structure of the checksum footer: which looks
// for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
// such as file truncation.
CodecUtil.retrieveChecksum(data);
success = true; success = true;
} finally { } finally {
if (!success) { if (!success) {

View File

@ -123,6 +123,15 @@ public final class BlockTreeTermsReader extends FieldsProducer {
// Have PostingsReader init itself // Have PostingsReader init itself
postingsReader.init(in); postingsReader.init(in);
// NOTE: data file is too costly to verify checksum against all the bytes on open,
// but for now we at least verify proper structure of the checksum footer: which looks
// for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
// such as file truncation.
if (version >= BlockTreeTermsWriter.VERSION_CHECKSUM) {
CodecUtil.retrieveChecksum(in);
}
// Read per-field details // Read per-field details
seekDir(in, dirOffset); seekDir(in, dirOffset);
seekDir(indexIn, indexDirOffset); seekDir(indexIn, indexDirOffset);

View File

@ -52,7 +52,6 @@ import org.apache.lucene.store.BufferedChecksumIndexInput;
import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ChecksumIndexInput; import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.DataInput; import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexInput;
@ -155,6 +154,14 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader {
decompressor = compressionMode.newDecompressor(); decompressor = compressionMode.newDecompressor();
this.bytes = new BytesRef(); this.bytes = new BytesRef();
if (version >= VERSION_CHECKSUM) {
// NOTE: data file is too costly to verify checksum against all the bytes on open,
// but for now we at least verify proper structure of the checksum footer: which looks
// for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
// such as file truncation.
CodecUtil.retrieveChecksum(fieldsStream);
}
success = true; success = true;
} finally { } finally {
if (!success) { if (!success) {

View File

@ -132,6 +132,16 @@ public final class CompressingTermVectorsReader extends TermVectorsReader implem
} }
assert CodecUtil.headerLength(codecNameDat) == vectorsStream.getFilePointer(); assert CodecUtil.headerLength(codecNameDat) == vectorsStream.getFilePointer();
long pos = vectorsStream.getFilePointer();
if (version >= VERSION_CHECKSUM) {
// NOTE: data file is too costly to verify checksum against all the bytes on open,
// but for now we at least verify proper structure of the checksum footer: which looks
// for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
// such as file truncation.
CodecUtil.retrieveChecksum(vectorsStream);
vectorsStream.seek(pos);
}
packedIntsVersion = vectorsStream.readVInt(); packedIntsVersion = vectorsStream.readVInt();
chunkSize = vectorsStream.readVInt(); chunkSize = vectorsStream.readVInt();
decompressor = compressionMode.newDecompressor(); decompressor = compressionMode.newDecompressor();

View File

@ -80,15 +80,39 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
Lucene41PostingsWriter.VERSION_CURRENT); Lucene41PostingsWriter.VERSION_CURRENT);
forUtil = new ForUtil(docIn); forUtil = new ForUtil(docIn);
if (version >= Lucene41PostingsWriter.VERSION_CHECKSUM) {
// NOTE: data file is too costly to verify checksum against all the bytes on open,
// but for now we at least verify proper structure of the checksum footer: which looks
// for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
// such as file truncation.
CodecUtil.retrieveChecksum(docIn);
}
if (fieldInfos.hasProx()) { if (fieldInfos.hasProx()) {
posIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, Lucene41PostingsFormat.POS_EXTENSION), posIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, Lucene41PostingsFormat.POS_EXTENSION),
ioContext); ioContext);
CodecUtil.checkHeader(posIn, Lucene41PostingsWriter.POS_CODEC, version, version); CodecUtil.checkHeader(posIn, Lucene41PostingsWriter.POS_CODEC, version, version);
if (version >= Lucene41PostingsWriter.VERSION_CHECKSUM) {
// NOTE: data file is too costly to verify checksum against all the bytes on open,
// but for now we at least verify proper structure of the checksum footer: which looks
// for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
// such as file truncation.
CodecUtil.retrieveChecksum(posIn);
}
if (fieldInfos.hasPayloads() || fieldInfos.hasOffsets()) { if (fieldInfos.hasPayloads() || fieldInfos.hasOffsets()) {
payIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, Lucene41PostingsFormat.PAY_EXTENSION), payIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, Lucene41PostingsFormat.PAY_EXTENSION),
ioContext); ioContext);
CodecUtil.checkHeader(payIn, Lucene41PostingsWriter.PAY_CODEC, version, version); CodecUtil.checkHeader(payIn, Lucene41PostingsWriter.PAY_CODEC, version, version);
if (version >= Lucene41PostingsWriter.VERSION_CHECKSUM) {
// NOTE: data file is too costly to verify checksum against all the bytes on open,
// but for now we at least verify proper structure of the checksum footer: which looks
// for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
// such as file truncation.
CodecUtil.retrieveChecksum(payIn);
}
} }
} }

View File

@ -139,6 +139,14 @@ class Lucene42DocValuesProducer extends DocValuesProducer {
throw new CorruptIndexException("Format versions mismatch"); throw new CorruptIndexException("Format versions mismatch");
} }
if (version >= VERSION_CHECKSUM) {
// NOTE: data file is too costly to verify checksum against all the bytes on open,
// but for now we at least verify proper structure of the checksum footer: which looks
// for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
// such as file truncation.
CodecUtil.retrieveChecksum(data);
}
success = true; success = true;
} finally { } finally {
if (!success) { if (!success) {

View File

@ -139,6 +139,14 @@ class Lucene45DocValuesProducer extends DocValuesProducer implements Closeable {
throw new CorruptIndexException("Format versions mismatch"); throw new CorruptIndexException("Format versions mismatch");
} }
if (version >= Lucene45DocValuesFormat.VERSION_CHECKSUM) {
// NOTE: data file is too costly to verify checksum against all the bytes on open,
// but for now we at least verify proper structure of the checksum footer: which looks
// for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
// such as file truncation.
CodecUtil.retrieveChecksum(data);
}
success = true; success = true;
} finally { } finally {
if (!success) { if (!success) {

View File

@ -119,6 +119,12 @@ class Lucene49DocValuesProducer extends DocValuesProducer implements Closeable {
throw new CorruptIndexException("Format versions mismatch"); throw new CorruptIndexException("Format versions mismatch");
} }
// NOTE: data file is too costly to verify checksum against all the bytes on open,
// but for now we at least verify proper structure of the checksum footer: which looks
// for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
// such as file truncation.
CodecUtil.retrieveChecksum(data);
success = true; success = true;
} finally { } finally {
if (!success) { if (!success) {

View File

@ -93,6 +93,12 @@ class Lucene49NormsProducer extends DocValuesProducer {
throw new CorruptIndexException("Format versions mismatch"); throw new CorruptIndexException("Format versions mismatch");
} }
// NOTE: data file is too costly to verify checksum against all the bytes on open,
// but for now we at least verify proper structure of the checksum footer: which looks
// for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
// such as file truncation.
CodecUtil.retrieveChecksum(data);
success = true; success = true;
} finally { } finally {
if (!success) { if (!success) {

View File

@ -104,6 +104,14 @@ public final class CompoundFileDirectory extends BaseDirectory {
handle = directory.openInput(fileName, context); handle = directory.openInput(fileName, context);
try { try {
this.entries = readEntries(directory, fileName); this.entries = readEntries(directory, fileName);
if (version >= CompoundFileWriter.VERSION_CHECKSUM) {
CodecUtil.checkHeader(handle, CompoundFileWriter.DATA_CODEC, version, version);
// NOTE: data file is too costly to verify checksum against all the bytes on open,
// but for now we at least verify proper structure of the checksum footer: which looks
// for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
// such as file truncation.
CodecUtil.retrieveChecksum(handle);
}
success = true; success = true;
} finally { } finally {
if (!success) { if (!success) {

View File

@ -97,6 +97,12 @@ public final class VersionBlockTreeTermsReader extends FieldsProducer {
// Have PostingsReader init itself // Have PostingsReader init itself
postingsReader.init(in); postingsReader.init(in);
// NOTE: data file is too costly to verify checksum against all the bytes on open,
// but for now we at least verify proper structure of the checksum footer: which looks
// for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
// such as file truncation.
CodecUtil.retrieveChecksum(in);
// Read per-field details // Read per-field details
seekDir(in, dirOffset); seekDir(in, dirOffset);
seekDir(indexIn, indexDirOffset); seekDir(indexIn, indexDirOffset);