[TEST] Test that translog can recover after random IOException

This commit adds a new test that can throw an IOException at any point in time
and ensures that all previously synced documents can be successfully recovered after hitting
an excepiton.

Relates to #15788
This commit is contained in:
Simon Willnauer 2016-01-07 10:05:53 +01:00
parent 132df10342
commit e7f9d685f1
3 changed files with 179 additions and 19 deletions

View File

@ -51,6 +51,7 @@ import org.elasticsearch.index.shard.IndexShardComponent;
import java.io.Closeable; import java.io.Closeable;
import java.io.EOFException; import java.io.EOFException;
import java.io.IOException; import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel; import java.nio.channels.FileChannel;
import java.nio.file.DirectoryStream; import java.nio.file.DirectoryStream;
import java.nio.file.Files; import java.nio.file.Files;
@ -440,7 +441,7 @@ public class Translog extends AbstractIndexShardComponent implements IndexShardC
if (config.isSyncOnEachOperation()) { if (config.isSyncOnEachOperation()) {
current.sync(); current.sync();
} }
assert current.assertBytesAtLocation(location, bytes); assert assertBytesAtLocation(location, bytes);
return location; return location;
} }
} catch (AlreadyClosedException | IOException ex) { } catch (AlreadyClosedException | IOException ex) {
@ -454,6 +455,13 @@ public class Translog extends AbstractIndexShardComponent implements IndexShardC
} }
} }
boolean assertBytesAtLocation(Translog.Location location, BytesReference expectedBytes) throws IOException {
// tests can override this
ByteBuffer buffer = ByteBuffer.allocate(location.size);
current.readBytes(buffer, location.translogLocation);
return new BytesArray(buffer.array()).equals(expectedBytes);
}
/** /**
* Snapshots the current transaction log allowing to safely iterate over the snapshot. * Snapshots the current transaction log allowing to safely iterate over the snapshot.
* Snapshots are fixed in time and will not be updated with future operations. * Snapshots are fixed in time and will not be updated with future operations.

View File

@ -218,11 +218,6 @@ public class TranslogWriter extends TranslogReader {
} }
} }
boolean assertBytesAtLocation(Translog.Location location, BytesReference expectedBytes) throws IOException {
ByteBuffer buffer = ByteBuffer.allocate(location.size);
readBytes(buffer, location.translogLocation);
return new BytesArray(buffer.array()).equals(expectedBytes);
}
private long getWrittenOffset() throws IOException { private long getWrittenOffset() throws IOException {
return channelReference.getChannel().position(); return channelReference.getChannel().position();

View File

@ -32,6 +32,7 @@ import org.apache.lucene.util.LuceneTestCase;
import org.elasticsearch.ElasticsearchException; import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.cluster.metadata.IndexMetaData; import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.bytes.BytesArray; import org.elasticsearch.common.bytes.BytesArray;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.io.FileSystemUtils; import org.elasticsearch.common.io.FileSystemUtils;
import org.elasticsearch.common.io.stream.BytesStreamOutput; import org.elasticsearch.common.io.stream.BytesStreamOutput;
import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamInput;
@ -1316,7 +1317,7 @@ public class TranslogTests extends ESTestCase {
public void testFailFlush() throws IOException { public void testFailFlush() throws IOException {
Path tempDir = createTempDir(); Path tempDir = createTempDir();
final AtomicBoolean fail = new AtomicBoolean(); final FailSwitch fail = new FailSwitch();
TranslogConfig config = getTranslogConfig(tempDir); TranslogConfig config = getTranslogConfig(tempDir);
Translog translog = getFailableTranslog(fail, config); Translog translog = getFailableTranslog(fail, config);
@ -1336,9 +1337,13 @@ public class TranslogTests extends ESTestCase {
assertFalse(translog.isOpen()); assertFalse(translog.isOpen());
assertEquals("__FAKE__ no space left on device", ex.getMessage()); assertEquals("__FAKE__ no space left on device", ex.getMessage());
} }
fail.set(randomBoolean()); if (randomBoolean()) {
fail.failAlways();
} else {
fail.failNever();
}
} }
fail.set(false); fail.failNever();
if (randomBoolean()) { if (randomBoolean()) {
try { try {
locations.add(translog.add(new Translog.Index("test", "" + opsSynced, Integer.toString(opsSynced).getBytes(Charset.forName("UTF-8"))))); locations.add(translog.add(new Translog.Index("test", "" + opsSynced, Integer.toString(opsSynced).getBytes(Charset.forName("UTF-8")))));
@ -1409,13 +1414,13 @@ public class TranslogTests extends ESTestCase {
public void testTragicEventCanBeAnyException() throws IOException { public void testTragicEventCanBeAnyException() throws IOException {
Path tempDir = createTempDir(); Path tempDir = createTempDir();
final AtomicBoolean fail = new AtomicBoolean(); final FailSwitch fail = new FailSwitch();
TranslogConfig config = getTranslogConfig(tempDir); TranslogConfig config = getTranslogConfig(tempDir);
assumeFalse("this won't work if we sync on any op", config.isSyncOnEachOperation()); assumeFalse("this won't work if we sync on any op", config.isSyncOnEachOperation());
Translog translog = getFailableTranslog(fail, config, false, true); Translog translog = getFailableTranslog(fail, config, false, true);
LineFileDocs lineFileDocs = new LineFileDocs(random()); // writes pretty big docs so we cross buffer boarders regularly LineFileDocs lineFileDocs = new LineFileDocs(random()); // writes pretty big docs so we cross buffer boarders regularly
translog.add(new Translog.Index("test", "1", lineFileDocs.nextDoc().toString().getBytes(Charset.forName("UTF-8")))); translog.add(new Translog.Index("test", "1", lineFileDocs.nextDoc().toString().getBytes(Charset.forName("UTF-8"))));
fail.set(true); fail.failAlways();
try { try {
Translog.Location location = translog.add(new Translog.Index("test", "2", lineFileDocs.nextDoc().toString().getBytes(Charset.forName("UTF-8")))); Translog.Location location = translog.add(new Translog.Index("test", "2", lineFileDocs.nextDoc().toString().getBytes(Charset.forName("UTF-8"))));
if (randomBoolean()) { if (randomBoolean()) {
@ -1436,7 +1441,7 @@ public class TranslogTests extends ESTestCase {
public void testFatalIOExceptionsWhileWritingConcurrently() throws IOException, InterruptedException { public void testFatalIOExceptionsWhileWritingConcurrently() throws IOException, InterruptedException {
Path tempDir = createTempDir(); Path tempDir = createTempDir();
final AtomicBoolean fail = new AtomicBoolean(false); final FailSwitch fail = new FailSwitch();
TranslogConfig config = getTranslogConfig(tempDir); TranslogConfig config = getTranslogConfig(tempDir);
Translog translog = getFailableTranslog(fail, config); Translog translog = getFailableTranslog(fail, config);
@ -1473,7 +1478,7 @@ public class TranslogTests extends ESTestCase {
// this holds a reference to the current tlog channel such that it's not closed // this holds a reference to the current tlog channel such that it's not closed
// if we hit a tragic event. this is important to ensure that asserts inside the Translog#add doesn't trip // if we hit a tragic event. this is important to ensure that asserts inside the Translog#add doesn't trip
// otherwise our assertions here are off by one sometimes. // otherwise our assertions here are off by one sometimes.
fail.set(true); fail.failAlways();
for (int i = 0; i < threadCount; i++) { for (int i = 0; i < threadCount; i++) {
threads[i].join(); threads[i].join();
} }
@ -1525,11 +1530,40 @@ public class TranslogTests extends ESTestCase {
} }
} }
private Translog getFailableTranslog(final AtomicBoolean fail, final TranslogConfig config) throws IOException { private Translog getFailableTranslog(FailSwitch fail, final TranslogConfig config) throws IOException {
return getFailableTranslog(fail, config, randomBoolean(), false); return getFailableTranslog(fail, config, randomBoolean(), false);
} }
private Translog getFailableTranslog(final AtomicBoolean fail, final TranslogConfig config, final boolean paritalWrites, final boolean throwUnknownException) throws IOException { private static class FailSwitch {
private volatile int failRate;
private volatile boolean onceFailedFailAlways = false;
public boolean fail() {
boolean fail = randomIntBetween(1, 100) <= failRate;
if (fail && onceFailedFailAlways) {
failAlways();
}
return fail;
}
public void failNever() {
failRate = 0;
}
public void failAlways() {
failRate = 100;
}
public void failRandomly() {
failRate = randomIntBetween(1, 100);
}
public void onceFailedFailAlways() {
onceFailedFailAlways = true;
}
}
private Translog getFailableTranslog(final FailSwitch fail, final TranslogConfig config, final boolean paritalWrites, final boolean throwUnknownException) throws IOException {
return new Translog(config) { return new Translog(config) {
@Override @Override
TranslogWriter.ChannelFactory getChannelFactory() { TranslogWriter.ChannelFactory getChannelFactory() {
@ -1539,23 +1573,56 @@ public class TranslogTests extends ESTestCase {
@Override @Override
public FileChannel open(Path file) throws IOException { public FileChannel open(Path file) throws IOException {
FileChannel channel = factory.open(file); FileChannel channel = factory.open(file);
return new ThrowingFileChannel(fail, paritalWrites, throwUnknownException, channel); boolean success = false;
try {
ThrowingFileChannel throwingFileChannel = new ThrowingFileChannel(fail, paritalWrites, throwUnknownException, channel);
success = true;
return throwingFileChannel;
} finally {
if (success == false) {
IOUtils.closeWhileHandlingException(channel);
}
}
} }
}; };
} }
@Override
protected boolean assertBytesAtLocation(Location location, BytesReference expectedBytes) throws IOException {
return true; // we don't wanna fail in the assert
}
}; };
} }
public static class ThrowingFileChannel extends FilterFileChannel { public static class ThrowingFileChannel extends FilterFileChannel {
private final AtomicBoolean fail; private final FailSwitch fail;
private final boolean partialWrite; private final boolean partialWrite;
private final boolean throwUnknownException; private final boolean throwUnknownException;
public ThrowingFileChannel(AtomicBoolean fail, boolean partialWrite, boolean throwUnknownException, FileChannel delegate) { public ThrowingFileChannel(FailSwitch fail, boolean partialWrite, boolean throwUnknownException, FileChannel delegate) throws MockDirectoryWrapper.FakeIOException {
super(delegate); super(delegate);
this.fail = fail; this.fail = fail;
this.partialWrite = partialWrite; this.partialWrite = partialWrite;
this.throwUnknownException = throwUnknownException; this.throwUnknownException = throwUnknownException;
if (fail.fail()) {
throw new MockDirectoryWrapper.FakeIOException();
}
}
@Override
public int read(ByteBuffer dst) throws IOException {
if (fail.fail()) {
throw new MockDirectoryWrapper.FakeIOException();
}
return super.read(dst);
}
@Override
public long read(ByteBuffer[] dsts, int offset, int length) throws IOException {
if (fail.fail()) {
throw new MockDirectoryWrapper.FakeIOException();
}
return super.read(dsts, offset, length);
} }
@Override @Override
@ -1570,7 +1637,7 @@ public class TranslogTests extends ESTestCase {
public int write(ByteBuffer src) throws IOException { public int write(ByteBuffer src) throws IOException {
if (fail.get()) { if (fail.fail()) {
if (partialWrite) { if (partialWrite) {
if (src.hasRemaining()) { if (src.hasRemaining()) {
final int pos = src.position(); final int pos = src.position();
@ -1590,6 +1657,22 @@ public class TranslogTests extends ESTestCase {
} }
return super.write(src); return super.write(src);
} }
@Override
public void force(boolean metaData) throws IOException {
if (fail.fail()) {
throw new MockDirectoryWrapper.FakeIOException();
}
super.force(metaData);
}
@Override
public long position() throws IOException {
if (fail.fail()) {
throw new MockDirectoryWrapper.FakeIOException();
}
return super.position();
}
} }
private static final class UnknownException extends RuntimeException { private static final class UnknownException extends RuntimeException {
@ -1711,4 +1794,78 @@ public class TranslogTests extends ESTestCase {
} }
} }
/**
* This test adds operations to the translog which might randomly throw an IOException. The only thing this test verifies is
* that we can, after we hit an exception, open and recover the translog successfully and retrieve all successfully synced operations
* from the transaction log.
*/
public void testWithRandomException() throws IOException {
final int runs = randomIntBetween(5, 10);
for (int run = 0; run < runs; run++) {
Path tempDir = createTempDir();
final FailSwitch fail = new FailSwitch();
fail.failRandomly();
TranslogConfig config = getTranslogConfig(tempDir);
final int numOps = randomIntBetween(100, 200);
List<String> syncedDocs = new ArrayList<>();
List<String> unsynced = new ArrayList<>();
if (randomBoolean()) {
fail.onceFailedFailAlways();
}
try {
final Translog failableTLog = getFailableTranslog(fail, config, randomBoolean(), false);
try {
LineFileDocs lineFileDocs = new LineFileDocs(random()); //writes pretty big docs so we cross buffer boarders regularly
for (int opsAdded = 0; opsAdded < numOps; opsAdded++) {
String doc = lineFileDocs.nextDoc().toString();
failableTLog.add(new Translog.Index("test", "" + opsAdded, doc.getBytes(Charset.forName("UTF-8"))));
unsynced.add(doc);
if (randomBoolean()) {
failableTLog.sync();
syncedDocs.addAll(unsynced);
unsynced.clear();
}
if (randomFloat() < 0.1) {
failableTLog.sync(); // we have to sync here first otherwise we don't know if the sync succeeded if the commit fails
syncedDocs.addAll(unsynced);
unsynced.clear();
if (randomBoolean()) {
failableTLog.prepareCommit();
}
failableTLog.commit();
syncedDocs.clear();
}
}
} catch (TranslogException | MockDirectoryWrapper.FakeIOException ex) {
// fair enough
} catch (IOException ex) {
assertEquals(ex.getMessage(), "__FAKE__ no space left on device");
} finally {
config.setTranslogGeneration(failableTLog.getGeneration());
IOUtils.closeWhileHandlingException(failableTLog);
}
} catch (TranslogException | MockDirectoryWrapper.FakeIOException ex) {
// failed - that's ok, we didn't even create it
}
// now randomly open this failing tlog again just to make sure we can also recover from failing during recovery
if (randomBoolean()) {
try {
IOUtils.close(getFailableTranslog(fail, config, randomBoolean(), false));
} catch (TranslogException | MockDirectoryWrapper.FakeIOException ex) {
// failed - that's ok, we didn't even create it
}
}
try (Translog translog = new Translog(config)) {
try (Translog.Snapshot snapshot = translog.newSnapshot()) {
assertEquals(syncedDocs.size(), snapshot.estimatedTotalOperations());
for (int i = 0; i < syncedDocs.size(); i++) {
Translog.Operation next = snapshot.next();
assertEquals(syncedDocs.get(i), next.getSource().source.toUtf8());
assertNotNull("operation " + i + " must be non-null", next);
}
}
}
}
}
} }