HDFS-12070. Failed block recovery leaves files open indefinitely and at risk for data loss. Contributed by Kihwal Lee.

(cherry picked from commit 451265a83d)

Conflicts:
	hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java
This commit is contained in:
Kihwal Lee 2018-02-26 10:58:07 -06:00
parent 79af42f095
commit 4b43f2aa56
2 changed files with 46 additions and 4 deletions

View File

@ -293,10 +293,8 @@ public class BlockRecoveryWorker {
}
}
// If any of the data-nodes failed, the recovery fails, because
// we never know the actual state of the replica on failed data-nodes.
// The recovery should be started over.
if (!failedList.isEmpty()) {
// Abort if all failed.
if (successList.isEmpty()) {
StringBuilder b = new StringBuilder();
for(DatanodeID id : failedList) {
b.append("\n " + id);

View File

@ -227,6 +227,50 @@ public class TestLeaseRecovery {
assertEquals(newFileLen, expectedNewFileLen);
}
/**
 * Block/lease recovery should be retried with failed nodes from the second
 * stage removed to avoid perpetual recovery failures.
 *
 * Scenario: one replica is finalized but its meta file is deleted, so that
 * replica's datanode always fails the second stage of block recovery. The
 * namenode must retry recovery without the failed node instead of looping
 * forever, and the lease must eventually be recovered.
 */
@Test
public void testBlockRecoveryRetryAfterFailedRecovery() throws Exception {
  Configuration conf = new Configuration();
  cluster = new MiniDFSCluster.Builder(conf).numDataNodes(3).build();
  Path file = new Path("/testBlockRecoveryRetryAfterFailedRecovery");
  DistributedFileSystem dfs = cluster.getFileSystem();

  // Create a file and write 128 KB in 8-byte chunks.
  FSDataOutputStream out = dfs.create(file);
  final int FILE_SIZE = 128 * 1024;
  int count = 0;
  while (count < FILE_SIZE) {
    out.writeBytes("DE K9SUL");
    count += 8;
  }
  out.hsync();

  // Abort the original stream so the file is left open and under-recovery.
  ((DFSOutputStream) out.getWrappedStream()).abort();

  LocatedBlocks locations = cluster.getNameNodeRpc().getBlockLocations(
      file.toString(), 0, count);
  ExtendedBlock block = locations.get(0).getBlock();

  // Finalize one replica to simulate a partial close failure.
  cluster.getDataNodes().get(0).getFSDataset().finalizeBlock(block, false);
  // Delete the meta file to simulate a rename/move failure, making that
  // replica's datanode fail every subsequent recovery attempt.
  cluster.deleteMeta(0, block);

  // Use a fresh client to recover the lease. FileSystem.newInstance() returns
  // an un-cached instance owned by this test, so it must be closed explicitly
  // to avoid leaking client resources.
  DistributedFileSystem newDfs = (DistributedFileSystem) FileSystem
      .newInstance(cluster.getConfiguration(0));
  try {
    // Poll recoverLease() for up to ~15 seconds; retries on the namenode
    // should drop the failed node and let recovery complete.
    int tries = 0;
    while (tries++ < 15 && !newDfs.recoverLease(file)) {
      Thread.sleep(1000);
    }
    // The lease should have been recovered.
    assertTrue("File should be closed", newDfs.recoverLease(file));
  } finally {
    newDfs.close();
  }
}
/**
* Recover the lease on a file and append file from another client.
*/