HDFS-11499. Decommissioning stuck because of failing recovery. Contributed by Lukas Majercak and Manoj Govindassamy.

(cherry picked from commit 385d2cb777a0272ac20c62336c944fad295d5d12)

 Conflicts:
    hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java

(cherry picked from commit 60be2e5d8a1a6a8921c68f8b0f428b55152d05db)

 Conflicts:
    hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
This commit is contained in:
Wei-Chiu Chuang 2017-03-13 13:45:12 -07:00
parent ef99e5ed89
commit 830a602375
3 changed files with 57 additions and 1 deletion

View File

@ -229,6 +229,9 @@ Release 2.7.4 - UNRELEASED
HDFS-11379. DFSInputStream may infinite loop requesting block locations. Contributed by Daryn Sharp.
HDFS-11499. Decommissioning stuck because of failing recovery.
Contributed by Lukas Majercak and Manoj Govindassamy.
Release 2.7.3 - 2016-08-25
INCOMPATIBLE CHANGES

View File

@ -640,7 +640,12 @@ public boolean commitOrCompleteLastBlock(BlockCollection bc,
final boolean b = commitBlock(
(BlockInfoContiguousUnderConstruction) lastBlock, commitBlock);
if(countNodes(lastBlock).liveReplicas() >= minReplication)
// Count replicas on decommissioning nodes, as these will not be
// decommissioned unless recovery/completing last block has finished
NumberReplicas numReplicas = countNodes(lastBlock);
if(numReplicas.liveReplicas() + numReplicas.decommissioning() >=
minReplication)
completeBlock(bc, bc.numBlocks()-1, iip, false);
return b;
}

View File

@ -870,6 +870,54 @@ public void testDecommissionWithOpenfile() throws IOException, InterruptedExcept
fdos.close();
}
/**
 * Regression test for HDFS-11499: decommissioning must not get stuck when
 * the last block of an open file needs lease/replica recovery. Writes to a
 * file without closing it, decommissions every node holding the last block,
 * forces hard-lease expiry to trigger block recovery, and verifies the nodes
 * still reach DECOMMISSIONED and no written data is lost.
 */
@Test(timeout = 360000)
public void testDecommissionWithOpenFileAndBlockRecovery()
    throws IOException, InterruptedException {
  startCluster(1, 6, conf);
  cluster.waitActive();

  Path file = new Path("/testRecoveryDecommission");

  // Create a file and never close the output stream to trigger recovery
  DistributedFileSystem dfs = cluster.getFileSystem();
  FSNamesystem ns = cluster.getNamesystem(0);
  FSDataOutputStream out = dfs.create(file, true,
      conf.getInt(CommonConfigurationKeys.IO_FILE_BUFFER_SIZE_KEY, 4096),
      (short) 3, blockSize);

  // Write data to the file; hsync (not close) so the last block stays
  // under construction and will require recovery later.
  long writtenBytes = 0;
  while (writtenBytes < fileSize) {
    out.writeLong(writtenBytes);
    writtenBytes += 8;
  }
  out.hsync();

  // Reuse the Path rather than duplicating the string literal, so the
  // lookup cannot drift from the file actually written above.
  DatanodeInfo[] lastBlockLocations = NameNodeAdapter.getBlockLocations(
      cluster.getNameNode(), file.toString(), 0, fileSize)
      .getLastLocatedBlock().getLocations();

  // Decommission all nodes of the last block
  ArrayList<String> toDecom = new ArrayList<>();
  for (DatanodeInfo dnDecom : lastBlockLocations) {
    toDecom.add(dnDecom.getXferAddr());
  }
  writeConfigFile(excludeFile, toDecom);
  refreshNodes(ns, conf);

  // Make sure hard lease expires to trigger replica recovery
  cluster.setLeasePeriod(300L, 300L);
  Thread.sleep(2 * BLOCKREPORT_INTERVAL_MSEC);

  for (DatanodeInfo dnDecom : lastBlockLocations) {
    DatanodeInfo datanode = NameNodeAdapter.getDatanode(
        cluster.getNamesystem(), dnDecom);
    waitNodeState(datanode, AdminStates.DECOMMISSIONED);
  }

  // JUnit's assertEquals takes (expected, actual): the bytes we wrote are
  // the expectation, the on-disk length is the observed value. The original
  // had these reversed, which produces a misleading failure message.
  assertEquals(writtenBytes, dfs.getFileStatus(file).getLen());
}
/**
* Tests restart of namenode while datanode hosts are added to exclude file
**/