Improve translog corruption detection (#42980)

Today we test for translog corruption by incrementing a byte by 1 somewhere in
a file, and verify that this leads to a `TranslogCorruptionException`.
However, we rely on _all_ corruptions leading to this exception in the
`RemoveCorruptedShardDataCommand`: this command fails if a translog file
corruption leads to a different kind of exception, and `EOFException` and
`NegativeArraySizeException` are both possible. This commit strengthens the
translog corruption detection tests by simulating the following:

- a random value is written
- the file is truncated

It also makes sure that we return a `TranslogCorruptionException` in all such
cases.

Fixes #42661
Backport of #42744
This commit is contained in:
David Turner 2019-06-07 20:28:02 +01:00 committed by GitHub
parent 479a1eeff6
commit 5bc0dfce94
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 130 additions and 96 deletions

View File

@ -148,7 +148,7 @@ public abstract class BaseTranslogReader implements Comparable<BaseTranslogReade
} }
/** /**
* Reads a single opertation from the given location. * Reads a single operation from the given location.
*/ */
Translog.Operation read(Translog.Location location) throws IOException { Translog.Operation read(Translog.Location location) throws IOException {
assert location.generation == this.generation : "generation mismatch expected: " + generation + " got: " + location.generation; assert location.generation == this.generation : "generation mismatch expected: " + generation + " got: " + location.generation;

View File

@ -30,6 +30,7 @@ import org.elasticsearch.common.io.Channels;
import org.elasticsearch.common.io.stream.InputStreamStreamInput; import org.elasticsearch.common.io.stream.InputStreamStreamInput;
import org.elasticsearch.common.io.stream.OutputStreamStreamOutput; import org.elasticsearch.common.io.stream.OutputStreamStreamOutput;
import java.io.EOFException;
import java.io.IOException; import java.io.IOException;
import java.nio.channels.FileChannel; import java.nio.channels.FileChannel;
import java.nio.file.Path; import java.nio.file.Path;
@ -108,6 +109,7 @@ final class TranslogHeader {
* Read a translog header from the given path and file channel * Read a translog header from the given path and file channel
*/ */
static TranslogHeader read(final String translogUUID, final Path path, final FileChannel channel) throws IOException { static TranslogHeader read(final String translogUUID, final Path path, final FileChannel channel) throws IOException {
try {
// This input is intentionally not closed because closing it will close the FileChannel. // This input is intentionally not closed because closing it will close the FileChannel.
final BufferedChecksumStreamInput in = final BufferedChecksumStreamInput in =
new BufferedChecksumStreamInput( new BufferedChecksumStreamInput(
@ -126,9 +128,10 @@ final class TranslogHeader {
// Read the translogUUID // Read the translogUUID
final int uuidLen = in.readInt(); final int uuidLen = in.readInt();
if (uuidLen > channel.size()) { if (uuidLen > channel.size()) {
throw new TranslogCorruptedException( throw new TranslogCorruptedException(path.toString(), "UUID length can't be larger than the translog");
path.toString(), }
"UUID length can't be larger than the translog"); if (uuidLen <= 0) {
throw new TranslogCorruptedException(path.toString(), "UUID length must be positive");
} }
final BytesRef uuid = new BytesRef(uuidLen); final BytesRef uuid = new BytesRef(uuidLen);
uuid.length = uuidLen; uuid.length = uuidLen;
@ -158,6 +161,9 @@ final class TranslogHeader {
assert channel.position() == headerSizeInBytes : assert channel.position() == headerSizeInBytes :
"Header is not fully read; header size [" + headerSizeInBytes + "], position [" + channel.position() + "]"; "Header is not fully read; header size [" + headerSizeInBytes + "], position [" + channel.position() + "]";
return new TranslogHeader(translogUUID, primaryTerm, headerSizeInBytes); return new TranslogHeader(translogUUID, primaryTerm, headerSizeInBytes);
} catch (EOFException e) {
throw new TranslogCorruptedException(path.toString(), "translog header truncated", e);
}
} }
private static void tryReportOldVersionError(final Path path, final FileChannel channel) throws IOException { private static void tryReportOldVersionError(final Path path, final FileChannel channel) throws IOException {

View File

@ -76,7 +76,7 @@ final class TranslogSnapshot extends BaseTranslogReader {
return null; return null;
} }
protected Translog.Operation readOperation() throws IOException { private Translog.Operation readOperation() throws IOException {
final int opSize = readSize(reusableBuffer, position); final int opSize = readSize(reusableBuffer, position);
reuse = checksummedStream(reusableBuffer, position, opSize, reuse); reuse = checksummedStream(reusableBuffer, position, opSize, reuse);
Translog.Operation op = read(reuse); Translog.Operation op = read(reuse);
@ -93,6 +93,7 @@ final class TranslogSnapshot extends BaseTranslogReader {
* reads an operation at the given position into the given buffer. * reads an operation at the given position into the given buffer.
*/ */
protected void readBytes(ByteBuffer buffer, long position) throws IOException { protected void readBytes(ByteBuffer buffer, long position) throws IOException {
try {
if (position >= length) { if (position >= length) {
throw new EOFException("read requested past EOF. pos [" + position + "] end: [" + length + "], generation: [" + throw new EOFException("read requested past EOF. pos [" + position + "] end: [" + length + "], generation: [" +
getGeneration() + "], path: [" + path + "]"); getGeneration() + "], path: [" + path + "]");
@ -102,6 +103,9 @@ final class TranslogSnapshot extends BaseTranslogReader {
getFirstOperationOffset() + "], generation: [" + getGeneration() + "], path: [" + path + "]"); getFirstOperationOffset() + "], generation: [" + getGeneration() + "], path: [" + path + "]");
} }
Channels.readFromFileChannelWithEofException(channel, position, buffer); Channels.readFromFileChannelWithEofException(channel, position, buffer);
} catch (EOFException e) {
throw new TranslogCorruptedException(path.toString(), "translog truncated", e);
}
} }
@Override @Override

View File

@ -168,7 +168,6 @@ public class TruncateTranslogAction {
private boolean isTranslogClean(ShardPath shardPath, String translogUUID) throws IOException { private boolean isTranslogClean(ShardPath shardPath, String translogUUID) throws IOException {
// perform clean check of translog instead of corrupted marker file // perform clean check of translog instead of corrupted marker file
boolean clean = true;
try { try {
final Path translogPath = shardPath.resolveTranslog(); final Path translogPath = shardPath.resolveTranslog();
final long translogGlobalCheckpoint = Translog.readGlobalCheckpoint(translogPath, translogUUID); final long translogGlobalCheckpoint = Translog.readGlobalCheckpoint(translogPath, translogUUID);
@ -184,18 +183,19 @@ public class TruncateTranslogAction {
try (Translog translog = new Translog(translogConfig, translogUUID, try (Translog translog = new Translog(translogConfig, translogUUID,
translogDeletionPolicy, () -> translogGlobalCheckpoint, () -> primaryTerm); translogDeletionPolicy, () -> translogGlobalCheckpoint, () -> primaryTerm);
Translog.Snapshot snapshot = translog.newSnapshot()) { Translog.Snapshot snapshot = translog.newSnapshot()) {
//noinspection StatementWithEmptyBody we are just checking that we can iterate through the whole snapshot
while (snapshot.next() != null) { while (snapshot.next() != null) {
// just iterate over snapshot
} }
} }
return true;
} catch (TranslogCorruptedException e) { } catch (TranslogCorruptedException e) {
clean = false; return false;
} }
return clean;
} }
/** Write a checkpoint file to the given location with the given generation */ /** Write a checkpoint file to the given location with the given generation */
static void writeEmptyCheckpoint(Path filename, int translogLength, long translogGeneration, long globalCheckpoint) throws IOException { private static void writeEmptyCheckpoint(Path filename, int translogLength, long translogGeneration, long globalCheckpoint)
throws IOException {
Checkpoint emptyCheckpoint = Checkpoint.emptyTranslogCheckpoint(translogLength, translogGeneration, Checkpoint emptyCheckpoint = Checkpoint.emptyTranslogCheckpoint(translogLength, translogGeneration,
globalCheckpoint, translogGeneration); globalCheckpoint, translogGeneration);
Checkpoint.write(FileChannel::open, filename, emptyCheckpoint, Checkpoint.write(FileChannel::open, filename, emptyCheckpoint,
@ -234,7 +234,7 @@ public class TruncateTranslogAction {
} }
/** Return a Set of all files in a given directory */ /** Return a Set of all files in a given directory */
public static Set<Path> filesInDirectory(Path directory) throws IOException { private static Set<Path> filesInDirectory(Path directory) throws IOException {
Set<Path> files = new TreeSet<>(); Set<Path> files = new TreeSet<>();
try (DirectoryStream<Path> stream = Files.newDirectoryStream(directory)) { try (DirectoryStream<Path> stream = Files.newDirectoryStream(directory)) {
for (Path file : stream) { for (Path file : stream) {

View File

@ -52,6 +52,7 @@ import java.io.IOException;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collections;
import java.util.Set; import java.util.Set;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
@ -67,9 +68,7 @@ public class RemoveCorruptedShardDataCommandTests extends IndexShardTestCase {
private ShardId shardId; private ShardId shardId;
private ShardRouting routing; private ShardRouting routing;
private Path dataDir;
private Environment environment; private Environment environment;
private Settings settings;
private ShardPath shardPath; private ShardPath shardPath;
private IndexMetaData indexMetaData; private IndexMetaData indexMetaData;
private IndexShard indexShard; private IndexShard indexShard;
@ -86,7 +85,7 @@ public class RemoveCorruptedShardDataCommandTests extends IndexShardTestCase {
routing = TestShardRouting.newShardRouting(shardId, nodeId, true, ShardRoutingState.INITIALIZING, routing = TestShardRouting.newShardRouting(shardId, nodeId, true, ShardRoutingState.INITIALIZING,
RecoverySource.EmptyStoreRecoverySource.INSTANCE); RecoverySource.EmptyStoreRecoverySource.INSTANCE);
dataDir = createTempDir(); final Path dataDir = createTempDir();
environment = environment =
TestEnvironment.newEnvironment(Settings.builder() TestEnvironment.newEnvironment(Settings.builder()
@ -96,7 +95,7 @@ public class RemoveCorruptedShardDataCommandTests extends IndexShardTestCase {
// create same directory structure as prod does // create same directory structure as prod does
final Path path = NodeEnvironment.resolveNodePath(dataDir, 0); final Path path = NodeEnvironment.resolveNodePath(dataDir, 0);
Files.createDirectories(path); Files.createDirectories(path);
settings = Settings.builder() final Settings settings = Settings.builder()
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
.put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1) .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
.put(MergePolicyConfig.INDEX_MERGE_ENABLED, false) .put(MergePolicyConfig.INDEX_MERGE_ENABLED, false)
@ -217,7 +216,7 @@ public class RemoveCorruptedShardDataCommandTests extends IndexShardTestCase {
// close shard // close shard
closeShards(indexShard); closeShards(indexShard);
TestTranslog.corruptRandomTranslogFile(logger, random(), Arrays.asList(translogPath)); TestTranslog.corruptRandomTranslogFile(logger, random(), Collections.singletonList(translogPath));
// test corrupted shard // test corrupted shard
final IndexShard corruptedShard = reopenIndexShard(true); final IndexShard corruptedShard = reopenIndexShard(true);
@ -283,7 +282,7 @@ public class RemoveCorruptedShardDataCommandTests extends IndexShardTestCase {
expectThrows(IndexShardRecoveryException.class, () -> newStartedShard(p -> corruptedShard, true)); expectThrows(IndexShardRecoveryException.class, () -> newStartedShard(p -> corruptedShard, true));
closeShards(corruptedShard); closeShards(corruptedShard);
} }
TestTranslog.corruptRandomTranslogFile(logger, random(), Arrays.asList(translogPath)); TestTranslog.corruptRandomTranslogFile(logger, random(), Collections.singletonList(translogPath));
final RemoveCorruptedShardDataCommand command = new RemoveCorruptedShardDataCommand(); final RemoveCorruptedShardDataCommand command = new RemoveCorruptedShardDataCommand();
final MockTerminal t = new MockTerminal(); final MockTerminal t = new MockTerminal();

View File

@ -47,6 +47,8 @@ import java.util.regex.Pattern;
import static org.hamcrest.MatcherAssert.assertThat; import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.empty; import static org.hamcrest.Matchers.empty;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.greaterThan;
import static org.hamcrest.core.Is.is; import static org.hamcrest.core.Is.is;
import static org.hamcrest.core.IsNot.not; import static org.hamcrest.core.IsNot.not;
@ -54,7 +56,7 @@ import static org.hamcrest.core.IsNot.not;
* Helpers for testing translog. * Helpers for testing translog.
*/ */
public class TestTranslog { public class TestTranslog {
static final Pattern TRANSLOG_FILE_PATTERN = Pattern.compile("translog-(\\d+)\\.tlog"); private static final Pattern TRANSLOG_FILE_PATTERN = Pattern.compile("translog-(\\d+)\\.tlog");
public static void corruptRandomTranslogFile(Logger logger, Random random, Collection<Path> translogDirs) throws IOException { public static void corruptRandomTranslogFile(Logger logger, Random random, Collection<Path> translogDirs) throws IOException {
for (Path translogDir : translogDirs) { for (Path translogDir : translogDirs) {
@ -65,12 +67,11 @@ public class TestTranslog {
/** /**
* Corrupts random translog file (translog-N.tlog) from the given translog directory. * Corrupts random translog file (translog-N.tlog) from the given translog directory.
*
* @return a translog file which has been corrupted.
*/ */
public static Path corruptRandomTranslogFile(Logger logger, Random random, Path translogDir, long minGeneration) throws IOException { public static void corruptRandomTranslogFile(Logger logger, Random random, Path translogDir, long minGeneration)
throws IOException {
Set<Path> candidates = new TreeSet<>(); // TreeSet makes sure iteration order is deterministic Set<Path> candidates = new TreeSet<>(); // TreeSet makes sure iteration order is deterministic
logger.info("--> Translog dir [{}], minUsedTranslogGen [{}]", translogDir, minGeneration); logger.info("--> corruptRandomTranslogFile: translogDir [{}], minUsedTranslogGen [{}]", translogDir, minGeneration);
try (DirectoryStream<Path> stream = Files.newDirectoryStream(translogDir)) { try (DirectoryStream<Path> stream = Files.newDirectoryStream(translogDir)) {
for (Path item : stream) { for (Path item : stream) {
if (Files.isRegularFile(item)) { if (Files.isRegularFile(item)) {
@ -81,41 +82,51 @@ public class TestTranslog {
} }
} }
} }
assertThat(candidates, is(not(empty()))); assertThat("no translog files found in " + translogDir, candidates, is(not(empty())));
Path corruptedFile = RandomPicks.randomFrom(random, candidates); Path corruptedFile = RandomPicks.randomFrom(random, candidates);
corruptFile(logger, random, corruptedFile); corruptFile(logger, random, corruptedFile);
return corruptedFile;
} }
static void corruptFile(Logger logger, Random random, Path fileToCorrupt) throws IOException { static void corruptFile(Logger logger, Random random, Path fileToCorrupt) throws IOException {
try (FileChannel raf = FileChannel.open(fileToCorrupt, StandardOpenOption.READ, StandardOpenOption.WRITE)) { final long fileSize = Files.size(fileToCorrupt);
assertThat("cannot corrupt empty file " + fileToCorrupt, fileSize, greaterThan(0L));
try (FileChannel fileChannel = FileChannel.open(fileToCorrupt, StandardOpenOption.READ, StandardOpenOption.WRITE)) {
final long corruptPosition = RandomNumbers.randomLongBetween(random, 0, fileSize - 1);
if (random.nextBoolean()) {
// read // read
raf.position(RandomNumbers.randomLongBetween(random, 0, raf.size() - 1)); fileChannel.position(corruptPosition);
long filePointer = raf.position(); assertThat(fileChannel.position(), equalTo(corruptPosition));
ByteBuffer bb = ByteBuffer.wrap(new byte[1]); ByteBuffer bb = ByteBuffer.wrap(new byte[1]);
raf.read(bb); fileChannel.read(bb);
bb.flip(); bb.flip();
// corrupt // corrupt
byte oldValue = bb.get(0); byte oldValue = bb.get(0);
byte newValue = (byte) (oldValue + 1); byte newValue;
do {
newValue = (byte) random.nextInt(0x100);
} while (newValue == oldValue);
bb.put(0, newValue); bb.put(0, newValue);
// rewrite // rewrite
raf.position(filePointer); fileChannel.position(corruptPosition);
raf.write(bb); fileChannel.write(bb);
logger.info("--> corrupting file {} -- flipping at position {} from {} to {} file: {}", logger.info("--> corrupting file {} at position {} turning 0x{} into 0x{}", fileToCorrupt, corruptPosition,
fileToCorrupt, filePointer, Integer.toHexString(oldValue), Integer.toHexString(oldValue & 0xff), Integer.toHexString(newValue & 0xff));
Integer.toHexString(newValue), fileToCorrupt); } else {
logger.info("--> truncating file {} from length {} to length {}", fileToCorrupt, fileSize, corruptPosition);
fileChannel.truncate(corruptPosition);
}
} }
} }
/** /**
* Lists all existing commits in a given index path, then read the minimum translog generation that will be used in recoverFromTranslog. * Lists all existing commits in a given index path, then read the minimum translog generation that will be used in recoverFromTranslog.
*/ */
public static long minTranslogGenUsedInRecovery(Path translogPath) throws IOException { private static long minTranslogGenUsedInRecovery(Path translogPath) throws IOException {
try (NIOFSDirectory directory = new NIOFSDirectory(translogPath.getParent().resolve("index"))) { try (NIOFSDirectory directory = new NIOFSDirectory(translogPath.getParent().resolve("index"))) {
List<IndexCommit> commits = DirectoryReader.listCommits(directory); List<IndexCommit> commits = DirectoryReader.listCommits(directory);
final String translogUUID = commits.get(commits.size() - 1).getUserData().get(Translog.TRANSLOG_UUID_KEY); final String translogUUID = commits.get(commits.size() - 1).getUserData().get(Translog.TRANSLOG_UUID_KEY);

View File

@ -34,6 +34,7 @@ import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.nio.file.StandardOpenOption; import java.nio.file.StandardOpenOption;
import static org.hamcrest.Matchers.anyOf;
import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.lessThan; import static org.hamcrest.Matchers.lessThan;
@ -62,12 +63,23 @@ public class TranslogHeaderTests extends ESTestCase {
}); });
assertThat(mismatchUUID.getMessage(), containsString("this translog file belongs to a different translog")); assertThat(mismatchUUID.getMessage(), containsString("this translog file belongs to a different translog"));
int corruptions = between(1, 10); int corruptions = between(1, 10);
for (int i = 0; i < corruptions; i++) { for (int i = 0; i < corruptions && Files.size(translogFile) > 0; i++) {
TestTranslog.corruptFile(logger, random(), translogFile); TestTranslog.corruptFile(logger, random(), translogFile);
} }
expectThrows(TranslogCorruptedException.class, () -> { expectThrows(TranslogCorruptedException.class, () -> {
try (FileChannel channel = FileChannel.open(translogFile, StandardOpenOption.READ)) { try (FileChannel channel = FileChannel.open(translogFile, StandardOpenOption.READ)) {
TranslogHeader.read(outHeader.getTranslogUUID(), translogFile, channel); final TranslogHeader translogHeader = TranslogHeader.read(outHeader.getTranslogUUID(), translogFile, channel);
// succeeds if the corruption corrupted the version byte making this look like a v2 translog, because we don't check the
// checksum on this version
assertThat("version " + TranslogHeader.VERSION_CHECKPOINTS + " translog",
translogHeader.getPrimaryTerm(), equalTo(SequenceNumbers.UNASSIGNED_PRIMARY_TERM));
throw new TranslogCorruptedException(translogFile.toString(), "adjusted translog version");
} catch (IllegalStateException e) {
// corruption corrupted the version byte making this look like a v2, v1 or v0 translog
assertThat("version " + TranslogHeader.VERSION_CHECKPOINTS + "-or-earlier translog",
e.getMessage(), anyOf(containsString("pre-2.0 translog found"), containsString("pre-1.4 translog found"),
containsString("pre-6.3 translog found")));
throw new TranslogCorruptedException(translogFile.toString(), "adjusted translog version", e);
} }
}); });
} }

View File

@ -128,6 +128,7 @@ import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.greaterThan; import static org.hamcrest.Matchers.greaterThan;
import static org.hamcrest.Matchers.greaterThanOrEqualTo; import static org.hamcrest.Matchers.greaterThanOrEqualTo;
import static org.hamcrest.Matchers.hasToString; import static org.hamcrest.Matchers.hasToString;
import static org.hamcrest.Matchers.instanceOf;
import static org.hamcrest.Matchers.is; import static org.hamcrest.Matchers.is;
import static org.hamcrest.Matchers.isIn; import static org.hamcrest.Matchers.isIn;
import static org.hamcrest.Matchers.lessThanOrEqualTo; import static org.hamcrest.Matchers.lessThanOrEqualTo;
@ -882,7 +883,8 @@ public class TranslogTests extends ESTestCase {
for (int i = 0; i < locations.size(); i++) { for (int i = 0; i < locations.size(); i++) {
try { try {
assertNotNull(snap.next()); assertNotNull(snap.next());
} catch (EOFException e) { } catch (TranslogCorruptedException e) {
assertThat(e.getCause(), instanceOf(EOFException.class));
truncations.incrementAndGet(); truncations.incrementAndGet();
} }
} }