LUCENE-9260: Verify checksums of CFS files. (#1311)

This commit is contained in:
Adrien Grand 2020-04-15 15:10:59 +02:00 committed by GitHub
parent aa605b3c70
commit 0aa4ba7ccb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 175 additions and 77 deletions

View File

@ -283,6 +283,9 @@ Optimizations
* LUCENE-9237: Faster UniformSplit intersect TermsEnum. (Bruno Roustant)
* LUCENE-9260: LeafReader#checkIntegrity verifies checksums of CFS files.
(Adrien Grand)
* LUCENE-9068: FuzzyQuery builds its Automaton up-front (Alan Woodward, Mike Drob)
* LUCENE-9113: Faster merging of SORTED/SORTED_SET doc values. (Adrien Grand)

View File

@ -24,11 +24,11 @@ import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.text.ParseException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Locale;
import java.util.Set;
import org.apache.lucene.codecs.CompoundDirectory;
import org.apache.lucene.codecs.CompoundFormat;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexFileNames;
@ -37,7 +37,6 @@ import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.Lock;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.StringHelper;
@ -55,7 +54,7 @@ public class SimpleTextCompoundFormat extends CompoundFormat {
}
@Override
public Directory getCompoundReader(Directory dir, SegmentInfo si, IOContext context) throws IOException {
public CompoundDirectory getCompoundReader(Directory dir, SegmentInfo si, IOContext context) throws IOException {
String dataFile = IndexFileNames.segmentFileName(si.name, "", DATA_EXTENSION);
final IndexInput in = dir.openInput(dataFile, context);
@ -103,7 +102,7 @@ public class SimpleTextCompoundFormat extends CompoundFormat {
endOffsets[i] = Long.parseLong(stripPrefix(scratch, TABLEEND));
}
return new Directory() {
return new CompoundDirectory() {
private int getIndex(String name) throws IOException {
int index = Arrays.binarySearch(fileNames, name);
@ -143,28 +142,10 @@ public class SimpleTextCompoundFormat extends CompoundFormat {
return Collections.emptySet();
}
// write methods: disabled
@Override
public IndexOutput createOutput(String name, IOContext context) { throw new UnsupportedOperationException(); }
@Override
public IndexOutput createTempOutput(String prefix, String suffix, IOContext context) { throw new UnsupportedOperationException(); }
@Override
public void sync(Collection<String> names) { throw new UnsupportedOperationException(); }
@Override
public void deleteFile(String name) { throw new UnsupportedOperationException(); }
@Override
public void rename(String source, String dest) { throw new UnsupportedOperationException(); }
@Override
public void syncMetaData() { throw new UnsupportedOperationException(); }
@Override
public Lock obtainLock(String name) { throw new UnsupportedOperationException(); }
public void checkIntegrity() throws IOException {
// Intentionally empty: no checksums for SimpleText, so there is nothing to verify here.
}
};
}

View File

@ -37,4 +37,9 @@ public class TestSimpleTextCompoundFormat extends BaseCompoundFormatTestCase {
public void testMissingCodecHeadersAreCaught() {
// SimpleText does not catch broken sub-files in CFS!
}
@Override
public void testCheckIntegrity() {
// Deliberately overridden with an empty body to disable the inherited test.
// SimpleText does not catch broken sub-files in CFS! Its compound reader's checkIntegrity()
// verifies no checksums, so the inherited check that every byte of the compound files
// has been read is not expected to hold for this format.
}
}

View File

@ -0,0 +1,83 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs;
import java.io.IOException;
import java.util.Collection;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.Lock;
/**
 * Read-only {@link Directory} that exposes a compound file as a directory view.
 * <p>
 * Every mutating {@link Directory} operation is overridden here as {@code final}: each one
 * throws {@link UnsupportedOperationException}, except {@link #syncMetaData()} which does
 * nothing. Concrete subclasses implement the remaining operations together with
 * {@link #checkIntegrity()}.
 * @see CompoundFormat
 * @lucene.experimental
 */
public abstract class CompoundDirectory extends Directory {

  /** Sole constructor, for invocation by subclasses. */
  protected CompoundDirectory() {}

  /**
   * Checks consistency of this directory.
   * <p>
   * Note that this may be costly in terms of I/O, e.g.
   * may involve computing a checksum value against large data files.
   * @throws IOException if the check fails or the underlying data cannot be read
   */
  public abstract void checkIntegrity() throws IOException;

  // ---- output creation: always rejected ----

  /** Not implemented
   * @throws UnsupportedOperationException always: not supported by CFS */
  @Override
  public final IndexOutput createOutput(String name, IOContext context) throws IOException {
    throw new UnsupportedOperationException();
  }

  /** Not implemented
   * @throws UnsupportedOperationException always: not supported by CFS */
  @Override
  public final IndexOutput createTempOutput(String prefix, String suffix, IOContext context) throws IOException {
    throw new UnsupportedOperationException();
  }

  // ---- file manipulation: always rejected ----

  /** Not implemented
   * @throws UnsupportedOperationException always: not supported by CFS */
  @Override
  public final void deleteFile(String name) {
    throw new UnsupportedOperationException();
  }

  /** Not implemented
   * @throws UnsupportedOperationException always: not supported by CFS */
  @Override
  public final void rename(String from, String to) {
    throw new UnsupportedOperationException();
  }

  // ---- durability and locking ----

  /** Not implemented
   * @throws UnsupportedOperationException always: not supported by CFS */
  @Override
  public final void sync(Collection<String> names) {
    throw new UnsupportedOperationException();
  }

  /** Does nothing; unlike the other write-side methods this one does not throw. */
  @Override
  public final void syncMetaData() {
  }

  /** Not implemented
   * @throws UnsupportedOperationException always: not supported by CFS */
  @Override
  public final Lock obtainLock(String name) {
    throw new UnsupportedOperationException();
  }
}

View File

@ -40,8 +40,8 @@ public abstract class CompoundFormat {
/**
* Returns a Directory view (read-only) for the compound files in this segment
*/
public abstract Directory getCompoundReader(Directory dir, SegmentInfo si, IOContext context) throws IOException;
public abstract CompoundDirectory getCompoundReader(Directory dir, SegmentInfo si, IOContext context) throws IOException;
/**
* Packs the provided segment's files into a compound format. All files referenced
* by the provided {@link SegmentInfo} must have {@link CodecUtil#writeIndexHeader}

View File

@ -20,6 +20,7 @@ package org.apache.lucene.codecs.lucene50;
import java.io.IOException;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.CompoundDirectory;
import org.apache.lucene.codecs.CompoundFormat;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
@ -66,7 +67,7 @@ public final class Lucene50CompoundFormat extends CompoundFormat {
}
@Override
public Directory getCompoundReader(Directory dir, SegmentInfo si, IOContext context) throws IOException {
public CompoundDirectory getCompoundReader(Directory dir, SegmentInfo si, IOContext context) throws IOException {
return new Lucene50CompoundReader(dir, si, context);
}

View File

@ -19,13 +19,13 @@ package org.apache.lucene.codecs.lucene50;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.CompoundDirectory;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
@ -33,8 +33,6 @@ import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.Lock;
import org.apache.lucene.util.IOUtils;
/**
@ -43,7 +41,7 @@ import org.apache.lucene.util.IOUtils;
* Directory methods that would normally modify data throw an exception.
* @lucene.experimental
*/
final class Lucene50CompoundReader extends Directory {
final class Lucene50CompoundReader extends CompoundDirectory {
/** Offset/Length for a slice inside of a compound file */
public static final class FileEntry {
@ -160,24 +158,6 @@ final class Lucene50CompoundReader extends Directory {
return res;
}
/** Not implemented
* @throws UnsupportedOperationException always: not supported by CFS */
@Override
public void deleteFile(String name) {
throw new UnsupportedOperationException();
}
/** Not implemented
* @throws UnsupportedOperationException always: not supported by CFS */
@Override
public void rename(String from, String to) {
throw new UnsupportedOperationException();
}
@Override
public void syncMetaData() {
}
/** Returns the length of a file in the directory.
* @throws IOException if the file does not exist */
@Override
@ -188,26 +168,6 @@ final class Lucene50CompoundReader extends Directory {
throw new FileNotFoundException(name);
return e.length;
}
@Override
public IndexOutput createOutput(String name, IOContext context) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public IndexOutput createTempOutput(String prefix, String suffix, IOContext context) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public void sync(Collection<String> names) {
throw new UnsupportedOperationException();
}
@Override
public Lock obtainLock(String name) {
throw new UnsupportedOperationException();
}
@Override
public String toString() {
@ -218,4 +178,9 @@ final class Lucene50CompoundReader extends Directory {
public Set<String> getPendingDeletions() {
return Collections.emptySet();
}
/**
 * Checks the integrity of this compound reader by handing {@code handle} to
 * {@link CodecUtil#checksumEntireFile}.
 * <p>
 * NOTE(review): {@code handle} is declared outside this view; presumably it is the
 * input opened on the compound data file. Confirm against the field declaration.
 * @throws IOException if {@link CodecUtil#checksumEntireFile} throws
 */
@Override
public void checkIntegrity() throws IOException {
CodecUtil.checksumEntireFile(handle);
}
}

View File

@ -28,6 +28,7 @@ import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.CompoundDirectory;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.codecs.PointsReader;
@ -60,7 +61,7 @@ final class SegmentCoreReaders {
final StoredFieldsReader fieldsReaderOrig;
final TermVectorsReader termVectorsReaderOrig;
final PointsReader pointsReader;
final Directory cfsReader;
final CompoundDirectory cfsReader;
final String segment;
/**
* fieldinfos for this core: means gen=-1.

View File

@ -366,4 +366,12 @@ public final class SegmentReader extends CodecReader {
public Bits getHardLiveDocs() {
return hardLiveDocs;
}
/**
 * Runs the superclass integrity checks, then also checks the compound-file reader
 * when this segment's core holds one.
 * @throws IOException if any of the delegated checks throws
 */
@Override
public void checkIntegrity() throws IOException {
  super.checkIntegrity();
  if (core.cfsReader == null) {
    // This core has no compound-file reader, so there is nothing further to check.
    return;
  }
  core.cfsReader.checkIntegrity();
}
}

View File

@ -19,6 +19,7 @@ package org.apache.lucene.codecs.cranky;
import java.io.IOException;
import java.util.Random;
import org.apache.lucene.codecs.CompoundDirectory;
import org.apache.lucene.codecs.CompoundFormat;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.Directory;
@ -34,7 +35,7 @@ class CrankyCompoundFormat extends CompoundFormat {
}
@Override
public Directory getCompoundReader(Directory dir, SegmentInfo si, IOContext context) throws IOException {
public CompoundDirectory getCompoundReader(Directory dir, SegmentInfo si, IOContext context) throws IOException {
return delegate.getCompoundReader(dir, si, context);
}

View File

@ -21,13 +21,17 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.CompoundDirectory;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FilterDirectory;
import org.apache.lucene.store.FlushInfo;
@ -36,6 +40,7 @@ import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.MockDirectoryWrapper;
import org.apache.lucene.store.NRTCachingDirectory;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.Version;
@ -821,4 +826,41 @@ public abstract class BaseCompoundFormatTestCase extends BaseIndexFileFormatTest
assertTrue(e.getMessage().contains("checksum failed (hardware problem?)"));
dir.close();
}
/**
 * Verifies that {@link CompoundDirectory#checkIntegrity()} reads every byte of every file
 * produced by the codec's compound format.
 * <p>
 * A single sub-file (index header, 1024 payload bytes, hand-written footer) is packed into a
 * compound file through a directory wrapper that records created files. The compound reader
 * is then opened through a wrapper that records read bytes, and after
 * {@code checkIntegrity()} the test asserts that the set of read files equals the set of
 * created files and that no byte of them was left unread.
 * @throws IOException if writing or reading the test files fails
 */
public void testCheckIntegrity() throws IOException {
  // try-with-resources: the directory is closed even if an assertion below fails.
  try (Directory dir = newDirectory()) {
    String subFile = "_123.xyz";
    SegmentInfo si = newSegmentInfo(dir, "_123");
    try (IndexOutput os = dir.createOutput(subFile, newIOContext(random()))) {
      CodecUtil.writeIndexHeader(os, "Foo", 0, si.getId(), "suffix");
      for (int i = 0; i < 1024; i++) {
        os.writeByte((byte) i);
      }
      // Hand-written footer: footer magic, a zero int, then the checksum of all bytes so far.
      os.writeInt(CodecUtil.FOOTER_MAGIC);
      os.writeInt(0);
      long checksum = os.getChecksum();
      os.writeLong(checksum);
    }
    si.setFiles(Collections.singletonList(subFile));

    // Record which files the compound format creates while packing the segment.
    FileTrackingDirectoryWrapper writeTrackingDir = new FileTrackingDirectoryWrapper(dir);
    si.getCodec().compoundFormat().write(writeTrackingDir, si, IOContext.DEFAULT);
    final Set<String> createdFiles = writeTrackingDir.getFiles();

    // Record which bytes get read while the compound reader is opened and checked.
    ReadBytesDirectoryWrapper readTrackingDir = new ReadBytesDirectoryWrapper(dir);
    // Inner resource closes first, so the compound reader is released before the directory.
    try (CompoundDirectory compoundDir =
             si.getCodec().compoundFormat().getCompoundReader(readTrackingDir, si, IOContext.READ)) {
      compoundDir.checkIntegrity();
      Map<String,FixedBitSet> readBytes = readTrackingDir.getReadBytes();
      assertEquals(createdFiles, readBytes.keySet());
      for (Map.Entry<String, FixedBitSet> entry : readBytes.entrySet()) {
        final String file = entry.getKey();
        // Flip a copy so that unread bytes become set bits; none may remain.
        final FixedBitSet set = entry.getValue().clone();
        set.flip(0, set.length());
        final int next = set.nextSetBit(0);
        assertEquals("Byte at offset " + next + " of " + file + " was not read", DocIdSetIterator.NO_MORE_DOCS, next);
      }
    }
  }
}
}

View File

@ -728,15 +728,20 @@ abstract class BaseIndexFileFormatTestCase extends LuceneTestCase {
return r;
}
private static class FileTrackingDirectoryWrapper extends FilterDirectory {
/**
* A directory that tracks created files that haven't been deleted.
*/
protected static class FileTrackingDirectoryWrapper extends FilterDirectory {
private final Set<String> files = Collections.newSetFromMap(new ConcurrentHashMap<String,Boolean>());
/** Sole constructor. */
FileTrackingDirectoryWrapper(Directory in) {
super(in);
}
Set<String> getFiles() {
/** Get the set of created files. */
public Set<String> getFiles() {
return Set.copyOf(files);
}
@ -820,15 +825,18 @@ abstract class BaseIndexFileFormatTestCase extends LuceneTestCase {
}
private static class ReadBytesDirectoryWrapper extends FilterDirectory {
/** A directory that tracks read bytes. */
protected static class ReadBytesDirectoryWrapper extends FilterDirectory {
ReadBytesDirectoryWrapper(Directory in) {
/** Sole constructor. */
public ReadBytesDirectoryWrapper(Directory in) {
super(in);
}
private final Map<String, FixedBitSet> readBytes = new ConcurrentHashMap<>();
Map<String, FixedBitSet> getReadBytes() {
/** Get information about which bytes have been read. */
public Map<String, FixedBitSet> getReadBytes() {
return Map.copyOf(readBytes);
}