LUCENE-7323: compound file writing now verifies checksum and segment ID for the incoming sub-files, to catch hardware issues or filesystem bugs earlier

Mike McCandless 2016-06-09 14:48:58 -04:00
parent 7e86ba8c73
commit 067fb25e43
19 changed files with 231 additions and 101 deletions

View File

@@ -12,6 +12,12 @@ Bug Fixes
* LUCENE-6662: Fixed potential resource leaks. (Rishabh Patel via Adrien Grand)
Improvements
* LUCENE-7323: Compound file writing now verifies the incoming
sub-files' checksums and segment IDs, to catch hardware issues or
filesystem bugs earlier. (Robert Muir, Mike McCandless)
Other
* LUCENE-4787: Fixed some highlighting javadocs. (Michael Dodsworth via Adrien

View File

@@ -122,7 +122,7 @@ import org.apache.lucene.index.SegmentWriteState;
* and saving the offset/etc for each field.
* @lucene.experimental
*/
public class SimpleTextDocValuesFormat extends DocValuesFormat {
class SimpleTextDocValuesFormat extends DocValuesFormat {
public SimpleTextDocValuesFormat() {
super("SimpleText");

View File

@@ -34,7 +34,7 @@ import org.apache.lucene.index.SegmentWriteState;
* any text editor, and even edit it to alter your index.
*
* @lucene.experimental */
public final class SimpleTextPostingsFormat extends PostingsFormat {
final class SimpleTextPostingsFormat extends PostingsFormat {
public SimpleTextPostingsFormat() {
super("SimpleText");

View File

@@ -15,4 +15,3 @@
org.apache.lucene.codecs.memory.MemoryDocValuesFormat
org.apache.lucene.codecs.memory.DirectDocValuesFormat
org.apache.lucene.codecs.simpletext.SimpleTextDocValuesFormat

View File

@@ -19,5 +19,4 @@ org.apache.lucene.codecs.memory.DirectPostingsFormat
org.apache.lucene.codecs.memory.FSTOrdPostingsFormat
org.apache.lucene.codecs.memory.FSTPostingsFormat
org.apache.lucene.codecs.memory.MemoryPostingsFormat
org.apache.lucene.codecs.simpletext.SimpleTextPostingsFormat
org.apache.lucene.codecs.autoprefix.AutoPrefixPostingsFormat

View File

@@ -27,4 +27,14 @@ public class TestSimpleTextCompoundFormat extends BaseCompoundFormatTestCase {
protected Codec getCodec() {
return codec;
}
@Override
public void testCorruptFilesAreCaught() {
// SimpleText does not catch broken sub-files in CFS!
}
@Override
public void testMissingCodecHeadersAreCaught() {
// SimpleText does not catch broken sub-files in CFS!
}
}

View File

@@ -258,6 +258,57 @@ public final class CodecUtil {
return version;
}
/**
* Expert: verifies the incoming {@link IndexInput} has an index header
* and that its segment ID matches the expected one, and then copies
* that index header into the provided {@link DataOutput}. This is
* useful when building compound files.
*
* @param in Input stream, positioned at the point where the
* index header was previously written. Typically this is located
* at the beginning of the file.
* @param out Output stream, where the header will be copied to.
* @param expectedID Expected segment ID
* @throws CorruptIndexException If the first four bytes are not
* {@link #CODEC_MAGIC}, or if the <code>expectedID</code>
* does not match.
* @throws IOException If there is an I/O error reading from the underlying medium.
*
* @lucene.internal
*/
public static void verifyAndCopyIndexHeader(IndexInput in, DataOutput out, byte[] expectedID) throws IOException {
// make sure it's large enough to have a header and footer
if (in.length() < footerLength() + headerLength("")) {
throw new CorruptIndexException("compound sub-files must have a valid codec header and footer: file is too small (" + in.length() + " bytes)", in);
}
int actualHeader = in.readInt();
if (actualHeader != CODEC_MAGIC) {
throw new CorruptIndexException("compound sub-files must have a valid codec header and footer: codec header mismatch: actual header=" + actualHeader + " vs expected header=" + CodecUtil.CODEC_MAGIC, in);
}
// we can't verify these, so we pass-through:
String codec = in.readString();
int version = in.readInt();
// verify id:
checkIndexHeaderID(in, expectedID);
// we can't verify extension either, so we pass-through:
int suffixLength = in.readByte() & 0xFF;
byte[] suffixBytes = new byte[suffixLength];
in.readBytes(suffixBytes, 0, suffixLength);
// now write the header we just verified
out.writeInt(CodecUtil.CODEC_MAGIC);
out.writeString(codec);
out.writeInt(version);
out.writeBytes(expectedID, 0, expectedID.length);
out.writeByte((byte) suffixLength);
out.writeBytes(suffixBytes, 0, suffixLength);
}
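For illustration, a minimal usage sketch (not part of this commit's diff; dir, data, si, and subFileName stand for the source Directory, the compound data output, the segment's SegmentInfo, and one sub-file name) pairing this helper with checkFooter, as the Lucene50CompoundFormat change below does:
try (ChecksumIndexInput in = dir.openChecksumInput(subFileName, IOContext.READONCE)) {
  // verify codec magic + segment ID, copying the index header through to the output
  CodecUtil.verifyAndCopyIndexHeader(in, data, si.getId());
  // copy the body, stopping short of the 16-byte footer
  data.copyBytes(in, in.length() - CodecUtil.footerLength() - in.getFilePointer());
  // verify the incoming file's CRC-32 footer, then re-write it verbatim
  long checksum = CodecUtil.checkFooter(in);
  data.writeInt(CodecUtil.FOOTER_MAGIC);
  data.writeInt(0);           // algorithm ID: 0 = CRC-32
  data.writeLong(checksum);
}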
/** Retrieves the full index header from the provided {@link IndexInput}.
* This throws {@link CorruptIndexException} if this file does
* not appear to be an index file. */
@@ -474,7 +525,7 @@ public final class CodecUtil {
* @throws CorruptIndexException if CRC is formatted incorrectly (wrong bits set)
* @throws IOException if an i/o error occurs
*/
public static long readCRC(IndexInput input) throws IOException {
static long readCRC(IndexInput input) throws IOException {
long value = input.readLong();
if ((value & 0xFFFFFFFF00000000L) != 0) {
throw new CorruptIndexException("Illegal CRC-32 checksum: " + value, input);
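(In other words, a valid CRC-32 occupies only the low 32 bits of the stored long; for example 0x00000000DEADBEEFL passes, while any value with a bit set in the upper 32 bits, such as 0x100000000L, means the stored checksum itself is corrupt.)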
@@ -487,7 +538,7 @@ public final class CodecUtil {
* @throws IllegalStateException if CRC is formatted incorrectly (wrong bits set)
* @throws IOException if an i/o error occurs
*/
public static void writeCRC(IndexOutput output) throws IOException {
static void writeCRC(IndexOutput output) throws IOException {
long value = output.getChecksum();
if ((value & 0xFFFFFFFF00000000L) != 0) {
throw new IllegalStateException("Illegal CRC-32 checksum: " + value + " (resource=" + output + ")");

View File

@@ -43,7 +43,9 @@ public abstract class CompoundFormat {
public abstract Directory getCompoundReader(Directory dir, SegmentInfo si, IOContext context) throws IOException;
/**
* Packs the provided segment's files into a compound format.
* Packs the provided segment's files into a compound format. All files referenced
* by the provided {@link SegmentInfo} must have been written with {@link CodecUtil#writeIndexHeader}
* and {@link CodecUtil#writeFooter}.
*/
public abstract void write(Directory dir, SegmentInfo si, IOContext context) throws IOException;
}
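Concretely, a sketch (mirroring the updated BaseCompoundFormatTestCase later in this commit; the codec name "Foo", version 0, and suffix "suffix" are placeholders) of a sub-file that satisfies this contract:
try (IndexOutput out = dir.createOutput(fileName, IOContext.DEFAULT)) {
  CodecUtil.writeIndexHeader(out, "Foo", 0, si.getId(), "suffix");
  out.writeInt(3);            // the sub-file's actual contents
  CodecUtil.writeFooter(out); // CRC-32 footer covering everything above
}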

View File

@@ -18,17 +18,17 @@ package org.apache.lucene.codecs.lucene50;
import java.io.IOException;
import java.util.Collection;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.CompoundFormat;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.StringHelper;
/**
* Lucene 5.0 compound file format
@@ -76,6 +76,9 @@ public final class Lucene50CompoundFormat extends CompoundFormat {
String dataFile = IndexFileNames.segmentFileName(si.name, "", DATA_EXTENSION);
String entriesFile = IndexFileNames.segmentFileName(si.name, "", ENTRIES_EXTENSION);
byte[] expectedID = si.getId();
byte[] id = new byte[StringHelper.ID_LENGTH];
try (IndexOutput data = dir.createOutput(dataFile, context);
IndexOutput entries = dir.createOutput(entriesFile, context)) {
CodecUtil.writeIndexHeader(data, DATA_CODEC, VERSION_CURRENT, si.getId(), "");
@@ -87,8 +90,23 @@ public final class Lucene50CompoundFormat extends CompoundFormat {
// write bytes for file
long startOffset = data.getFilePointer();
try (IndexInput in = dir.openInput(file, IOContext.READONCE)) {
data.copyBytes(in, in.length());
try (ChecksumIndexInput in = dir.openChecksumInput(file, IOContext.READONCE)) {
// just copies the index header, verifying that its id matches what we expect
CodecUtil.verifyAndCopyIndexHeader(in, data, si.getId());
// copy all bytes except the footer
long numBytesToCopy = in.length() - CodecUtil.footerLength() - in.getFilePointer();
data.copyBytes(in, numBytesToCopy);
// verify footer (checksum) matches for the incoming file we are copying
long checksum = CodecUtil.checkFooter(in);
// this is poached from CodecUtil.writeFooter, but we need to write the sub-file's own verified checksum,
// not data.getChecksum(); adding a public CodecUtil method to do that seems too dangerous:
data.writeInt(CodecUtil.FOOTER_MAGIC);
data.writeInt(0);
data.writeLong(checksum);
}
long endOffset = data.getFilePointer();

View File

@@ -100,7 +100,7 @@ final class Lucene50CompoundReader extends Directory {
}
/** Helper method that reads CFS entries from an input stream */
private final Map<String, FileEntry> readEntries(byte[] segmentID, Directory dir, String entriesFileName) throws IOException {
private Map<String, FileEntry> readEntries(byte[] segmentID, Directory dir, String entriesFileName) throws IOException {
Map<String,FileEntry> mapping = null;
try (ChecksumIndexInput entriesStream = dir.openChecksumInput(entriesFileName, IOContext.READONCE)) {
Throwable priorE = null;
@@ -140,7 +140,8 @@ final class Lucene50CompoundReader extends Directory {
final String id = IndexFileNames.stripSegmentName(name);
final FileEntry entry = entries.get(id);
if (entry == null) {
throw new FileNotFoundException("No sub-file with id " + id + " found (fileName=" + name + " files: " + entries.keySet() + ")");
String datFileName = IndexFileNames.segmentFileName(segmentName, "", Lucene50CompoundFormat.DATA_EXTENSION);
throw new FileNotFoundException("No sub-file with id " + id + " found in compound file \"" + datFileName + "\" (fileName=" + name + " files: " + entries.keySet() + ")");
}
return handle.slice(name, entry.offset, entry.length);
}

View File

@@ -14,13 +14,14 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.index;
package org.apache.lucene.codecs;
import java.io.IOException;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.store.BufferedChecksumIndexInput;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.IndexInput;

View File

@@ -79,7 +79,7 @@ public class TestPerFieldDocValuesFormat extends BaseDocValuesFormatTestCase {
// we don't use RandomIndexWriter because it might add more docvalues than we expect !!!!1
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
final DocValuesFormat fast = TestUtil.getDefaultDocValuesFormat();
final DocValuesFormat slow = DocValuesFormat.forName("SimpleText");
final DocValuesFormat slow = DocValuesFormat.forName("Memory");
iwc.setCodec(new AssertingCodec() {
@Override
public DocValuesFormat getDocValuesFormatForField(String field) {

View File

@@ -24,8 +24,8 @@ import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.asserting.AssertingCodec;
import org.apache.lucene.codecs.blockterms.LuceneVarGapFixedInterval;
import org.apache.lucene.codecs.memory.DirectPostingsFormat;
import org.apache.lucene.codecs.memory.MemoryPostingsFormat;
import org.apache.lucene.codecs.simpletext.SimpleTextPostingsFormat;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
@@ -33,8 +33,8 @@ import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LogDocMergePolicy;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
@@ -202,13 +202,13 @@ public class TestPerFieldPostingsFormat2 extends LuceneTestCase {
public static class MockCodec extends AssertingCodec {
final PostingsFormat luceneDefault = TestUtil.getDefaultPostingsFormat();
final PostingsFormat simpleText = new SimpleTextPostingsFormat();
final PostingsFormat direct = new DirectPostingsFormat();
final PostingsFormat memory = new MemoryPostingsFormat();
@Override
public PostingsFormat getPostingsFormatForField(String field) {
if (field.equals("id")) {
return simpleText;
return direct;
} else if (field.equals("content")) {
return memory;
} else {
@@ -219,12 +219,12 @@ public class TestPerFieldPostingsFormat2 extends LuceneTestCase {
public static class MockCodec2 extends AssertingCodec {
final PostingsFormat luceneDefault = TestUtil.getDefaultPostingsFormat();
final PostingsFormat simpleText = new SimpleTextPostingsFormat();
final PostingsFormat direct = new DirectPostingsFormat();
@Override
public PostingsFormat getPostingsFormatForField(String field) {
if (field.equals("id")) {
return simpleText;
return direct;
} else {
return luceneDefault;
}

View File

@@ -1086,14 +1086,14 @@ public class TestAddIndexes extends LuceneTestCase {
}
private static final class CustomPerFieldCodec extends AssertingCodec {
private final PostingsFormat simpleTextFormat = PostingsFormat.forName("SimpleText");
private final PostingsFormat directFormat = PostingsFormat.forName("Direct");
private final PostingsFormat defaultFormat = TestUtil.getDefaultPostingsFormat();
private final PostingsFormat memoryFormat = PostingsFormat.forName("Memory");
@Override
public PostingsFormat getPostingsFormatForField(String field) {
if (field.equals("id")) {
return simpleTextFormat;
return directFormat;
} else if (field.equals("content")) {
return memoryFormat;
} else {

View File

@@ -19,6 +19,7 @@ package org.apache.lucene.codecs.mockrandom;
import java.io.IOException;
import java.util.Random;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
@@ -47,6 +48,7 @@ import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
@@ -107,11 +109,10 @@ public final class MockRandomPostingsFormat extends PostingsFormat {
}
final String seedFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, SEED_EXT);
final IndexOutput out = state.directory.createOutput(seedFileName, state.context);
try {
try (IndexOutput out = state.directory.createOutput(seedFileName, state.context)) {
CodecUtil.writeIndexHeader(out, "MockRandomSeed", 0, state.segmentInfo.getId(), state.segmentSuffix);
out.writeLong(seed);
} finally {
out.close();
CodecUtil.writeFooter(out);
}
final Random random = new Random(seed);
@@ -267,8 +268,10 @@ public final class MockRandomPostingsFormat extends PostingsFormat {
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
final String seedFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, SEED_EXT);
final IndexInput in = state.directory.openInput(seedFileName, state.context);
final ChecksumIndexInput in = state.directory.openChecksumInput(seedFileName, state.context);
CodecUtil.checkIndexHeader(in, "MockRandomSeed", 0, 0, state.segmentInfo.getId(), state.segmentSuffix);
final long seed = in.readLong();
CodecUtil.checkFooter(in);
if (LuceneTestCase.VERBOSE) {
System.out.println("MockRandomCodec: reading from seg=" + state.segmentInfo.name + " formatID=" + state.segmentSuffix + " seed=" + seed);
}

View File

@@ -25,6 +25,7 @@ import java.util.List;
import java.util.Random;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StoredField;
@@ -72,9 +73,9 @@ public abstract class BaseCompoundFormatTestCase extends BaseIndexFileFormatTest
for (int i=0; i<data.length; i++) {
String testfile = "_" + i + ".test";
Directory dir = newDirectory();
createSequenceFile(dir, testfile, (byte) 0, data[i]);
SegmentInfo si = newSegmentInfo(dir, "_" + i);
createSequenceFile(dir, testfile, (byte) 0, data[i], si.getId(), "suffix");
si.setFiles(Collections.singleton(testfile));
si.getCodec().compoundFormat().write(dir, si, IOContext.DEFAULT);
Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si, IOContext.DEFAULT);
@@ -96,10 +97,10 @@ public abstract class BaseCompoundFormatTestCase extends BaseIndexFileFormatTest
public void testTwoFiles() throws IOException {
String files[] = { "_123.d1", "_123.d2" };
Directory dir = newDirectory();
createSequenceFile(dir, files[0], (byte) 0, 15);
createSequenceFile(dir, files[1], (byte) 0, 114);
SegmentInfo si = newSegmentInfo(dir, "_123");
createSequenceFile(dir, files[0], (byte) 0, 15, si.getId(), "suffix");
createSequenceFile(dir, files[1], (byte) 0, 114, si.getId(), "suffix");
si.setFiles(Arrays.asList(files));
si.getCodec().compoundFormat().write(dir, si, IOContext.DEFAULT);
Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si, IOContext.DEFAULT);
@@ -122,11 +123,13 @@ public abstract class BaseCompoundFormatTestCase extends BaseIndexFileFormatTest
final String testfile = "_123.test";
Directory dir = newDirectory();
IndexOutput out = dir.createOutput(testfile, IOContext.DEFAULT);
out.writeInt(3);
out.close();
SegmentInfo si = newSegmentInfo(dir, "_123");
try (IndexOutput out = dir.createOutput(testfile, IOContext.DEFAULT)) {
CodecUtil.writeIndexHeader(out, "Foo", 0, si.getId(), "suffix");
out.writeInt(3);
CodecUtil.writeFooter(out);
}
si.setFiles(Collections.singleton(testfile));
si.getCodec().compoundFormat().write(dir, si, IOContext.DEFAULT);
Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si, IOContext.DEFAULT);
@@ -148,11 +151,13 @@ public abstract class BaseCompoundFormatTestCase extends BaseIndexFileFormatTest
return super.createOutput(name, context);
}
};
IndexOutput out = dir.createOutput(testfile, myContext);
out.writeInt(3);
out.close();
SegmentInfo si = newSegmentInfo(dir, "_123");
try (IndexOutput out = dir.createOutput(testfile, myContext)) {
CodecUtil.writeIndexHeader(out, "Foo", 0, si.getId(), "suffix");
out.writeInt(3);
CodecUtil.writeFooter(out);
}
si.setFiles(Collections.singleton(testfile));
si.getCodec().compoundFormat().write(dir, si, myContext);
dir.close();
@@ -165,14 +170,16 @@ public abstract class BaseCompoundFormatTestCase extends BaseIndexFileFormatTest
Directory dir = new NRTCachingDirectory(newFSDirectory(createTempDir()), 2.0, 25.0);
IndexOutput out = dir.createOutput(testfile, context);
byte[] bytes = new byte[512];
for(int i=0;i<1024*1024;i++) {
out.writeBytes(bytes, 0, bytes.length);
}
out.close();
SegmentInfo si = newSegmentInfo(dir, "_123");
try (IndexOutput out = dir.createOutput(testfile, context)) {
CodecUtil.writeIndexHeader(out, "Foo", 0, si.getId(), "suffix");
byte[] bytes = new byte[512];
for(int i=0;i<1024*1024;i++) {
out.writeBytes(bytes, 0, bytes.length);
}
CodecUtil.writeFooter(out);
}
si.setFiles(Collections.singleton(testfile));
si.getCodec().compoundFormat().write(dir, si, context);
@@ -326,17 +333,19 @@ public abstract class BaseCompoundFormatTestCase extends BaseIndexFileFormatTest
// Setup the test segment
String segment = "_123";
int chunk = 1024; // internal buffer size used by the stream
createRandomFile(dir, segment + ".zero", 0);
createRandomFile(dir, segment + ".one", 1);
createRandomFile(dir, segment + ".ten", 10);
createRandomFile(dir, segment + ".hundred", 100);
createRandomFile(dir, segment + ".big1", chunk);
createRandomFile(dir, segment + ".big2", chunk - 1);
createRandomFile(dir, segment + ".big3", chunk + 1);
createRandomFile(dir, segment + ".big4", 3 * chunk);
createRandomFile(dir, segment + ".big5", 3 * chunk - 1);
createRandomFile(dir, segment + ".big6", 3 * chunk + 1);
createRandomFile(dir, segment + ".big7", 1000 * chunk);
SegmentInfo si = newSegmentInfo(dir, "_123");
byte[] segId = si.getId();
createRandomFile(dir, segment + ".zero", 0, segId);
createRandomFile(dir, segment + ".one", 1, segId);
createRandomFile(dir, segment + ".ten", 10, segId);
createRandomFile(dir, segment + ".hundred", 100, segId);
createRandomFile(dir, segment + ".big1", chunk, segId);
createRandomFile(dir, segment + ".big2", chunk - 1, segId);
createRandomFile(dir, segment + ".big3", chunk + 1, segId);
createRandomFile(dir, segment + ".big4", 3 * chunk, segId);
createRandomFile(dir, segment + ".big5", 3 * chunk - 1, segId);
createRandomFile(dir, segment + ".big6", 3 * chunk + 1, segId);
createRandomFile(dir, segment + ".big7", 1000 * chunk, segId);
List<String> files = new ArrayList<>();
for (String file : dir.listAll()) {
@@ -345,7 +354,6 @@ public abstract class BaseCompoundFormatTestCase extends BaseIndexFileFormatTest
}
}
SegmentInfo si = newSegmentInfo(dir, "_123");
si.setFiles(files);
si.getCodec().compoundFormat().write(dir, si, IOContext.DEFAULT);
Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si, IOContext.DEFAULT);
@@ -370,17 +378,19 @@ public abstract class BaseCompoundFormatTestCase extends BaseIndexFileFormatTest
final int FILE_COUNT = atLeast(500);
List<String> files = new ArrayList<>();
SegmentInfo si = newSegmentInfo(dir, "_123");
for (int fileIdx = 0; fileIdx < FILE_COUNT; fileIdx++) {
String file = "_123." + fileIdx;
files.add(file);
IndexOutput out = dir.createOutput(file, newIOContext(random()));
out.writeByte((byte) fileIdx);
out.close();
try (IndexOutput out = dir.createOutput(file, newIOContext(random()))) {
CodecUtil.writeIndexHeader(out, "Foo", 0, si.getId(), "suffix");
out.writeByte((byte) fileIdx);
CodecUtil.writeFooter(out);
}
}
assertEquals(0, dir.getFileHandleCount());
SegmentInfo si = newSegmentInfo(dir, "_123");
si.setFiles(files);
si.getCodec().compoundFormat().write(dir, si, IOContext.DEFAULT);
Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si, IOContext.DEFAULT);
@@ -388,6 +398,7 @@ public abstract class BaseCompoundFormatTestCase extends BaseIndexFileFormatTest
final IndexInput[] ins = new IndexInput[FILE_COUNT];
for (int fileIdx = 0; fileIdx < FILE_COUNT; fileIdx++) {
ins[fileIdx] = cfs.openInput("_123." + fileIdx, newIOContext(random()));
CodecUtil.checkIndexHeader(ins[fileIdx], "Foo", 0, 0, si.getId(), "suffix");
}
assertEquals(1, dir.getFileHandleCount());
@@ -631,27 +642,31 @@ public abstract class BaseCompoundFormatTestCase extends BaseIndexFileFormatTest
}
/** Creates a file of the specified size with random data. */
protected static void createRandomFile(Directory dir, String name, int size) throws IOException {
IndexOutput os = dir.createOutput(name, newIOContext(random()));
protected static void createRandomFile(Directory dir, String name, int size, byte[] segId) throws IOException {
Random rnd = random();
for (int i=0; i<size; i++) {
byte b = (byte) rnd.nextInt(256);
os.writeByte(b);
try (IndexOutput os = dir.createOutput(name, newIOContext(random()))) {
CodecUtil.writeIndexHeader(os, "Foo", 0, segId, "suffix");
for (int i=0; i<size; i++) {
byte b = (byte) rnd.nextInt(256);
os.writeByte(b);
}
CodecUtil.writeFooter(os);
}
os.close();
}
/** Creates a file of the specified size with sequential data. The first
* byte is written as the start byte provided. All subsequent bytes are
* computed as start + offset where offset is the number of the byte.
*/
protected static void createSequenceFile(Directory dir, String name, byte start, int size) throws IOException {
IndexOutput os = dir.createOutput(name, newIOContext(random()));
for (int i=0; i < size; i++) {
os.writeByte(start);
start ++;
protected static void createSequenceFile(Directory dir, String name, byte start, int size, byte[] segID, String segSuffix) throws IOException {
try (IndexOutput os = dir.createOutput(name, newIOContext(random()))) {
CodecUtil.writeIndexHeader(os, "Foo", 0, segID, segSuffix);
for (int i=0; i < size; i++) {
os.writeByte(start);
start ++;
}
CodecUtil.writeFooter(os);
}
os.close();
}
protected static void assertSameStreams(String msg, IndexInput expected, IndexInput test) throws IOException {
@@ -724,12 +739,12 @@ public abstract class BaseCompoundFormatTestCase extends BaseIndexFileFormatTest
*/
protected static Directory createLargeCFS(Directory dir) throws IOException {
List<String> files = new ArrayList<>();
SegmentInfo si = newSegmentInfo(dir, "_123");
for (int i = 0; i < 20; i++) {
createSequenceFile(dir, "_123.f" + i, (byte) 0, 2000);
createSequenceFile(dir, "_123.f" + i, (byte) 0, 2000, si.getId(), "suffix");
files.add("_123.f" + i);
}
SegmentInfo si = newSegmentInfo(dir, "_123");
si.setFiles(files);
si.getCodec().compoundFormat().write(dir, si, IOContext.DEFAULT);
Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si, IOContext.DEFAULT);
@@ -750,9 +765,9 @@ public abstract class BaseCompoundFormatTestCase extends BaseIndexFileFormatTest
public void testResourceNameInsideCompoundFile() throws Exception {
Directory dir = newDirectory();
String subFile = "_123.xyz";
createSequenceFile(dir, subFile, (byte) 0, 10);
SegmentInfo si = newSegmentInfo(dir, "_123");
createSequenceFile(dir, subFile, (byte) 0, 10, si.getId(), "suffix");
si.setFiles(Collections.singletonList(subFile));
si.getCodec().compoundFormat().write(dir, si, IOContext.DEFAULT);
Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si, IOContext.DEFAULT);
@@ -762,4 +777,48 @@ public abstract class BaseCompoundFormatTestCase extends BaseIndexFileFormatTest
cfs.close();
dir.close();
}
public void testMissingCodecHeadersAreCaught() throws Exception {
Directory dir = newDirectory();
String subFile = "_123.xyz";
// missing codec header
try (IndexOutput os = dir.createOutput(subFile, newIOContext(random()))) {
for (int i=0; i < 1024; i++) {
os.writeByte((byte) i);
}
}
SegmentInfo si = newSegmentInfo(dir, "_123");
si.setFiles(Collections.singletonList(subFile));
Exception e = expectThrows(CorruptIndexException.class, () -> si.getCodec().compoundFormat().write(dir, si, IOContext.DEFAULT));
assertTrue(e.getMessage().contains("codec header mismatch"));
dir.close();
}
public void testCorruptFilesAreCaught() throws Exception {
Directory dir = newDirectory();
String subFile = "_123.xyz";
// wrong checksum
SegmentInfo si = newSegmentInfo(dir, "_123");
try (IndexOutput os = dir.createOutput(subFile, newIOContext(random()))) {
CodecUtil.writeIndexHeader(os, "Foo", 0, si.getId(), "suffix");
for (int i=0; i < 1024; i++) {
os.writeByte((byte) i);
}
// write footer w/ wrong checksum
os.writeInt(CodecUtil.FOOTER_MAGIC);
os.writeInt(0);
long checksum = os.getChecksum();
os.writeLong(checksum+1);
}
si.setFiles(Collections.singletonList(subFile));
Exception e = expectThrows(CorruptIndexException.class, () -> si.getCodec().compoundFormat().write(dir, si, IOContext.DEFAULT));
assertTrue(e.getMessage().contains("checksum failed (hardware problem?)"));
dir.close();
}
}

View File

@@ -22,7 +22,6 @@ import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Random;
import java.util.Set;
@@ -50,8 +49,6 @@ import org.apache.lucene.codecs.memory.FSTPostingsFormat;
import org.apache.lucene.codecs.memory.MemoryDocValuesFormat;
import org.apache.lucene.codecs.memory.MemoryPostingsFormat;
import org.apache.lucene.codecs.mockrandom.MockRandomPostingsFormat;
import org.apache.lucene.codecs.simpletext.SimpleTextDocValuesFormat;
import org.apache.lucene.codecs.simpletext.SimpleTextPostingsFormat;
import org.apache.lucene.index.PointValues.IntersectVisitor;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
@@ -159,10 +156,6 @@ public class RandomCodec extends AssertingCodec {
PostingsFormat codec = previousMappings.get(name);
if (codec == null) {
codec = formats.get(Math.abs(perFieldSeed ^ name.hashCode()) % formats.size());
if (codec instanceof SimpleTextPostingsFormat && perFieldSeed % 5 != 0) {
// make simpletext rarer, choose again
codec = formats.get(Math.abs(perFieldSeed ^ name.toUpperCase(Locale.ROOT).hashCode()) % formats.size());
}
previousMappings.put(name, codec);
// Safety:
assert previousMappings.size() < 10000: "test went insane";
@@ -175,10 +168,6 @@ public class RandomCodec extends AssertingCodec {
DocValuesFormat codec = previousDVMappings.get(name);
if (codec == null) {
codec = dvFormats.get(Math.abs(perFieldSeed ^ name.hashCode()) % dvFormats.size());
if (codec instanceof SimpleTextDocValuesFormat && perFieldSeed % 5 != 0) {
// make simpletext rarer, choose again
codec = dvFormats.get(Math.abs(perFieldSeed ^ name.toUpperCase(Locale.ROOT).hashCode()) % dvFormats.size());
}
previousDVMappings.put(name, codec);
// Safety:
assert previousDVMappings.size() < 10000: "test went insane";
@@ -214,7 +203,7 @@ public class RandomCodec extends AssertingCodec {
new LuceneFixedGap(TestUtil.nextInt(random, 1, 1000)),
new LuceneVarGapFixedInterval(TestUtil.nextInt(random, 1, 1000)),
new LuceneVarGapDocFreqInterval(TestUtil.nextInt(random, 1, 100), TestUtil.nextInt(random, 1, 1000)),
random.nextInt(10) == 0 ? new SimpleTextPostingsFormat() : TestUtil.getDefaultPostingsFormat(),
TestUtil.getDefaultPostingsFormat(),
new AssertingPostingsFormat(),
new MemoryPostingsFormat(true, random.nextFloat()),
new MemoryPostingsFormat(false, random.nextFloat()));
@@ -223,7 +212,7 @@ public class RandomCodec extends AssertingCodec {
TestUtil.getDefaultDocValuesFormat(),
new DirectDocValuesFormat(), // maybe not a great idea...
new MemoryDocValuesFormat(),
random.nextInt(10) == 0 ? new SimpleTextDocValuesFormat() : TestUtil.getDefaultDocValuesFormat(),
TestUtil.getDefaultDocValuesFormat(),
new AssertingDocValuesFormat());
Collections.shuffle(formats, random);

View File

@@ -17,7 +17,6 @@
-->
<schema name="codec" version="1.2">
<fieldType name="string_direct" class="solr.StrField" postingsFormat="Direct"/>
<fieldType name="string_simpletext" class="solr.StrField" postingsFormat="SimpleText"/>
<fieldType name="string_standard" class="solr.StrField" postingsFormat="Lucene50"/>
<fieldType name="string_disk" class="solr.StrField" docValuesFormat="Lucene54"/>
@@ -37,7 +36,6 @@
</fieldType>
<field name="string_direct_f" type="string_direct" indexed="true" stored="true"/>
<field name="string_simpletext_f" type="string_simpletext" indexed="true" stored="true"/>
<field name="string_standard_f" type="string_standard" indexed="true" stored="true"/>
<field name="string_disk_f" type="string_disk" indexed="false" stored="false" docValues="true" default=""/>
@@ -46,7 +44,6 @@
<field name="string_f" type="string" indexed="true" stored="true" docValues="true" required="true"/>
<field name="text" type="text_general" indexed="true" stored="true"/>
<dynamicField name="*_simple" type="string_simpletext" indexed="true" stored="true"/>
<dynamicField name="*_direct" type="string_direct" indexed="true" stored="true"/>
<dynamicField name="*_standard" type="string_standard" indexed="true" stored="true"/>

View File

@@ -51,9 +51,6 @@ public class TestCodecSupport extends SolrTestCaseJ4 {
SchemaField schemaField = fields.get("string_direct_f");
PerFieldPostingsFormat format = (PerFieldPostingsFormat) codec.postingsFormat();
assertEquals("Direct", format.getPostingsFormatForField(schemaField.getName()).getName());
schemaField = fields.get("string_simpletext_f");
assertEquals("SimpleText",
format.getPostingsFormatForField(schemaField.getName()).getName());
schemaField = fields.get("string_standard_f");
assertEquals(TestUtil.getDefaultPostingsFormat().getName(), format.getPostingsFormatForField(schemaField.getName()).getName());
schemaField = fields.get("string_f");
@@ -78,8 +75,6 @@ Codec codec = h.getCore().getCodec();
Codec codec = h.getCore().getCodec();
PerFieldPostingsFormat format = (PerFieldPostingsFormat) codec.postingsFormat();
assertEquals("SimpleText", format.getPostingsFormatForField("foo_simple").getName());
assertEquals("SimpleText", format.getPostingsFormatForField("bar_simple").getName());
assertEquals("Direct", format.getPostingsFormatForField("foo_direct").getName());
assertEquals("Direct", format.getPostingsFormatForField("bar_direct").getName());
assertEquals(TestUtil.getDefaultPostingsFormat().getName(), format.getPostingsFormatForField("foo_standard").getName());