From 069366e1beaf57c26e19eb63eb5bc08e8d24562f Mon Sep 17 00:00:00 2001 From: Andrew Wang Date: Wed, 8 Apr 2015 16:09:17 -0700 Subject: [PATCH] HDFS-8025. Addendum fix for HDFS-3087 Decomissioning on NN restart can complete without blocks being replicated. Contributed by Ming Ma. --- hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt | 3 ++ .../server/blockmanagement/BlockManager.java | 5 +++ .../apache/hadoop/hdfs/TestDecommission.java | 32 +++++++------------ 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt index 26117e9859c..d10123f6e5a 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt @@ -84,6 +84,9 @@ Release 2.8.0 - UNRELEASED HDFS-8076. Code cleanup for DFSInputStream: use offset instead of LocatedBlock when possible. (Zhe Zhang via wang) + HDFS-8025. Addendum fix for HDFS-3087 Decomissioning on NN restart can + complete without blocks being replicated. (Ming Ma via wang) + OPTIMIZATIONS HDFS-8026. Trace FSOutputSummer#writeChecksumChunks rather than diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java index b2babf9a757..fd0db8cacf2 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java @@ -3308,6 +3308,11 @@ public class BlockManager { * liveness. Dead nodes cannot always be safely decommissioned. */ boolean isNodeHealthyForDecommission(DatanodeDescriptor node) { + if (!node.checkBlockReportReceived()) { + LOG.info("Node {} hasn't sent its first block report.", node); + return false; + } + if (node.isAlive) { return true; } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java index 081e40f6278..1ab7427f97a 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java @@ -882,9 +882,12 @@ public class TestDecommission { int numNamenodes = 1; int numDatanodes = 1; int replicas = 1; - + conf.setLong(DFSConfigKeys.DFS_BLOCKREPORT_INTERVAL_MSEC_KEY, + DFSConfigKeys.DFS_BLOCKREPORT_INTERVAL_MSEC_DEFAULT); + conf.setLong(DFSConfigKeys.DFS_BLOCKREPORT_INITIAL_DELAY_KEY, 5); + startCluster(numNamenodes, numDatanodes, conf); - Path file1 = new Path("testDecommission.dat"); + Path file1 = new Path("testDecommissionWithNamenodeRestart.dat"); FileSystem fileSys = cluster.getFileSystem(); writeFile(fileSys, file1, replicas); @@ -894,37 +897,26 @@ public class TestDecommission { String excludedDatanodeName = info[0].getXferAddr(); writeConfigFile(excludeFile, new ArrayList(Arrays.asList(excludedDatanodeName))); - + //Add a new datanode to cluster cluster.startDataNodes(conf, 1, true, null, null, null, null); numDatanodes+=1; - + assertEquals("Number of datanodes should be 2 ", 2, cluster.getDataNodes().size()); //Restart the namenode cluster.restartNameNode(); DatanodeInfo datanodeInfo = NameNodeAdapter.getDatanode( cluster.getNamesystem(), excludedDatanodeID); waitNodeState(datanodeInfo, AdminStates.DECOMMISSIONED); - + // Ensure decommissioned datanode is not automatically shutdown assertEquals("All datanodes must be alive", numDatanodes, client.datanodeReport(DatanodeReportType.LIVE).length); - // wait for the block to be replicated - int tries = 0; - while (tries++ < 20) { - try { - Thread.sleep(1000); - if (checkFile(fileSys, file1, replicas, datanodeInfo.getXferAddr(), - numDatanodes) == null) { - break; - } - } catch (InterruptedException ie) { - } - } - assertTrue("Checked if block was replicated after decommission, tried " - + tries + " times.", tries < 20); - cleanupFile(fileSys, file1); + assertTrue("Checked if block was replicated after decommission.", + checkFile(fileSys, file1, replicas, datanodeInfo.getXferAddr(), + numDatanodes) == null); + cleanupFile(fileSys, file1); // Restart the cluster and ensure recommissioned datanodes // are allowed to register with the namenode cluster.shutdown();