HDFS-12070. Failed block recovery leaves files open indefinitely and at risk for data loss. Contributed by Kihwal Lee.

Kihwal Lee 2018-02-26 10:28:04 -06:00
parent 2fa7963c3d
commit 451265a83d
2 changed files with 46 additions and 4 deletions

@@ -307,10 +307,8 @@ void syncBlock(List<BlockRecord> syncList) throws IOException {
         }
       }
 
-      // If any of the data-nodes failed, the recovery fails, because
-      // we never know the actual state of the replica on failed data-nodes.
-      // The recovery should be started over.
-      if (!failedList.isEmpty()) {
+      // Abort if all failed.
+      if (successList.isEmpty()) {
         throw new IOException("Cannot recover " + block
             + ", the following datanodes failed: " + failedList);
       }
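The hunk above relaxes the abort condition in syncBlock(): previously recovery was aborted whenever any datanode in the sync stage failed, so a single permanently bad replica caused every recovery attempt to fail and left the file open; now recovery aborts only when no datanode succeeded. A minimal standalone sketch of the new check, assuming successList and failedList are the per-datanode outcomes of the sync stage (this is an illustration, not the actual BlockRecoveryWorker code):

import java.io.IOException;
import java.util.List;

class RecoveryAbortCheck {
  // Abort only when every datanode failed; with at least one success,
  // recovery proceeds using the successful replicas.
  static void checkOutcome(String block, List<String> successList,
      List<String> failedList) throws IOException {
    // Old behavior: if (!failedList.isEmpty()) { throw ... }
    if (successList.isEmpty()) {
      throw new IOException("Cannot recover " + block
          + ", the following datanodes failed: " + failedList);
    }
  }
}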

@@ -227,6 +227,50 @@ public void testBlockRecoveryWithLessMetafile() throws Exception {
     assertEquals(newFileLen, expectedNewFileLen);
   }
 
+  /**
+   * Block/lease recovery should be retried with failed nodes from the second
+   * stage removed to avoid perpetual recovery failures.
+   */
+  @Test
+  public void testBlockRecoveryRetryAfterFailedRecovery() throws Exception {
+    Configuration conf = new Configuration();
+    cluster = new MiniDFSCluster.Builder(conf).numDataNodes(3).build();
+    Path file = new Path("/testBlockRecoveryRetryAfterFailedRecovery");
+    DistributedFileSystem dfs = cluster.getFileSystem();
+
+    // Create a file.
+    FSDataOutputStream out = dfs.create(file);
+    final int FILE_SIZE = 128 * 1024;
+    int count = 0;
+    while (count < FILE_SIZE) {
+      out.writeBytes("DE K9SUL");
+      count += 8;
+    }
+    out.hsync();
+
+    // Abort the original stream.
+    ((DFSOutputStream) out.getWrappedStream()).abort();
+
+    LocatedBlocks locations = cluster.getNameNodeRpc().getBlockLocations(
+        file.toString(), 0, count);
+    ExtendedBlock block = locations.get(0).getBlock();
+
+    // Finalize one replica to simulate a partial close failure.
+    cluster.getDataNodes().get(0).getFSDataset().finalizeBlock(block, false);
+    // Delete the meta file to simulate a rename/move failure.
+    cluster.deleteMeta(0, block);
+
+    // Try to recover the lease.
+    DistributedFileSystem newDfs = (DistributedFileSystem) FileSystem
+        .newInstance(cluster.getConfiguration(0));
+    count = 0;
+    while (count++ < 15 && !newDfs.recoverLease(file)) {
+      Thread.sleep(1000);
+    }
+    // The lease should have been recovered.
+    assertTrue("File should be closed", newDfs.recoverLease(file));
+  }
+
   /**
    * Recover the lease on a file and append file from another client.
    */
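
The new test drives recovery from the client side through DistributedFileSystem#recoverLease, which is also how an application or operator would close a file that a failed recovery left open. A hedged usage sketch, assuming fs.defaultFS points at the target HDFS cluster; the retry budget and sleep interval mirror the test and are illustrative:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DistributedFileSystem;

public class RecoverLeaseClient {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Path of the file stuck in an open state (passed on the command line).
    Path file = new Path(args[0]);
    // Assumes fs.defaultFS is an hdfs:// URI, so the cast is safe.
    DistributedFileSystem dfs = (DistributedFileSystem) FileSystem.get(conf);

    // recoverLease() returns true once the file has been closed. Recovery is
    // asynchronous on the NameNode, so poll with a short back-off.
    boolean closed = dfs.recoverLease(file);
    for (int i = 0; i < 15 && !closed; i++) {
      Thread.sleep(1000L);
      closed = dfs.recoverLease(file);
    }
    System.out.println(file + (closed ? " is closed" : " is still open"));
  }
}

recoverLease() triggers block recovery for the file's last block when needed; with this fix, a datanode that failed an earlier recovery attempt no longer causes every subsequent attempt to fail.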