From ce5de93a5837e115e1f0b7d3c5a67ace25385a63 Mon Sep 17 00:00:00 2001
From: Konstantin V Shvachko
Date: Mon, 16 Mar 2015 12:54:04 -0700
Subject: [PATCH] HDFS-7886. Fix TestFileTruncate failures. Contributed by
 Plamen Jeliazkov and Konstantin Shvachko.

---
 hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt |  2 +
 .../apache/hadoop/hdfs/MiniDFSCluster.java  | 44 +++++++++++++++++--
 .../server/namenode/TestFileTruncate.java   | 18 ++++----
 3 files changed, 51 insertions(+), 13 deletions(-)

diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
index 93237af6086..d313b6c1a53 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
+++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
@@ -1157,6 +1157,8 @@ Release 2.7.0 - UNRELEASED
     HDFS-7915. The DataNode can sometimes allocate a ShortCircuitShm slot and
     fail to tell the DFSClient about it because of a network error (cmccabe)
 
+    HDFS-7886. Fix TestFileTruncate failures. (Plamen Jeliazkov and shv)
+
   BREAKDOWN OF HDFS-7584 SUBTASKS AND RELATED JIRAS
 
     HDFS-7720. Quota by Storage Type API, tools and ClientNameNode
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java
index 834eb325fb5..9208ed2d0c8 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java
@@ -77,9 +77,12 @@
 import org.apache.hadoop.hdfs.protocol.Block;
 import org.apache.hadoop.hdfs.protocol.BlockListAsLongs;
 import org.apache.hadoop.hdfs.protocol.ClientProtocol;
+import org.apache.hadoop.hdfs.protocol.DatanodeID;
 import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
 import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
 import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType;
+import org.apache.hadoop.hdfs.server.blockmanagement.BlockManagerTestUtil;
+import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor;
 import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
 import org.apache.hadoop.hdfs.server.common.Storage;
 import org.apache.hadoop.hdfs.server.common.Util;
@@ -1343,7 +1346,6 @@ public synchronized void startDataNodes(Configuration conf, int numDataNodes,
     }
 
     int curDatanodesNum = dataNodes.size();
-    final int curDatanodesNumSaved = curDatanodesNum;
     // for mincluster's the default initialDelay for BRs is 0
     if (conf.get(DFS_BLOCKREPORT_INITIAL_DELAY_KEY) == null) {
       conf.setLong(DFS_BLOCKREPORT_INITIAL_DELAY_KEY, 0);
@@ -2022,7 +2024,23 @@ public boolean restartDataNode(int i) throws IOException {
    */
   public synchronized boolean restartDataNode(int i, boolean keepPort)
       throws IOException {
-    DataNodeProperties dnprop = stopDataNode(i);
+    return restartDataNode(i, keepPort, false);
+  }
+
+  /**
+   * Restart a particular DataNode.
+   * @param idn index of the DataNode
+   * @param keepPort true if the DataNode should restart on the same port
+   * @param expireOnNN true if the NameNode should expire the DataNode heartbeat
+   * @return true if the DataNode was successfully restarted
+   * @throws IOException
+   */
+  public synchronized boolean restartDataNode(
+      int idn, boolean keepPort, boolean expireOnNN) throws IOException {
+    DataNodeProperties dnprop = stopDataNode(idn);
+    if (expireOnNN && dnprop != null) {
+      setDataNodeDead(dnprop.datanode.getDatanodeId());
+    }
     if (dnprop == null) {
       return false;
     } else {
@@ -2030,6 +2048,24 @@ public synchronized boolean restartDataNode(int i, boolean keepPort)
     }
   }
 
+  /**
+   * Expire a DataNode heartbeat on the NameNode.
+   * @param dnId the DatanodeID of the DataNode whose heartbeat should expire
+   * @throws IOException
+   */
+  public void setDataNodeDead(DatanodeID dnId) throws IOException {
+    DatanodeDescriptor dnd =
+        NameNodeAdapter.getDatanode(getNamesystem(), dnId);
+    dnd.setLastUpdate(0L);
+    BlockManagerTestUtil.checkHeartbeat(getNamesystem().getBlockManager());
+  }
+
+  public void setDataNodesDead() throws IOException {
+    for (DataNodeProperties dnp : dataNodes) {
+      setDataNodeDead(dnp.datanode.getDatanodeId());
+    }
+  }
+
   /*
    * Restart all datanodes, on the same ports if keepPort is true
    */
@@ -2255,8 +2291,8 @@ private synchronized boolean shouldWait(DatanodeInfo[] dnInfo,
     // make sure all datanodes have sent first heartbeat to namenode,
     // using (capacity == 0) as proxy.
     for (DatanodeInfo dn : dnInfo) {
-      if (dn.getCapacity() == 0) {
-        LOG.info("dn.getCapacity() == 0");
+      if (dn.getCapacity() == 0 || dn.getLastUpdate() <= 0) {
+        LOG.info("No heartbeat from DataNode: " + dn.toString());
         return true;
       }
     }
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFileTruncate.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFileTruncate.java
index 3b6e1079cff..8d447eeb769 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFileTruncate.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFileTruncate.java
@@ -679,10 +679,10 @@ public void testTruncateWithDataNodesRestart() throws Exception {
       boolean isReady = fs.truncate(p, newLength);
       assertFalse(isReady);
     } finally {
-      cluster.restartDataNode(dn);
+      cluster.restartDataNode(dn, true, true);
       cluster.waitActive();
-      cluster.triggerBlockReports();
     }
+    checkBlockRecovery(p);
 
     LocatedBlock newBlock = getLocatedBlocks(p).getLastLocatedBlock();
     /*
@@ -699,7 +699,6 @@ public void testTruncateWithDataNodesRestart() throws Exception {
     assertEquals(newBlock.getBlock().getGenerationStamp(),
         oldBlock.getBlock().getGenerationStamp() + 1);
 
-    checkBlockRecovery(p);
     // Wait replicas come to 3
     DFSTestUtil.waitReplication(fs, p, REPLICATION);
     // Old replica is disregarded and replaced with the truncated one
@@ -741,10 +740,10 @@ public void testCopyOnTruncateWithDataNodesRestart() throws Exception {
       boolean isReady = fs.truncate(p, newLength);
       assertFalse(isReady);
     } finally {
-      cluster.restartDataNode(dn);
+      cluster.restartDataNode(dn, true, true);
       cluster.waitActive();
-      cluster.triggerBlockReports();
     }
+    checkBlockRecovery(p);
 
     LocatedBlock newBlock = getLocatedBlocks(p).getLastLocatedBlock();
     /*
@@ -757,7 +756,6 @@ public void testCopyOnTruncateWithDataNodesRestart() throws Exception {
     assertEquals(newBlock.getBlock().getGenerationStamp(),
         oldBlock.getBlock().getGenerationStamp() + 1);
 
-    checkBlockRecovery(p);
     // Wait replicas come to 3
     DFSTestUtil.waitReplication(fs, p, REPLICATION);
     // New block is replicated to dn1
@@ -800,9 +798,10 @@ public void testTruncateWithDataNodesRestartImmediately() throws Exception {
     boolean isReady = fs.truncate(p, newLength);
     assertFalse(isReady);
 
-    cluster.restartDataNode(dn0);
-    cluster.restartDataNode(dn1);
+    cluster.restartDataNode(dn0, true, true);
+    cluster.restartDataNode(dn1, true, true);
     cluster.waitActive();
+    checkBlockRecovery(p);
     cluster.triggerBlockReports();
 
     LocatedBlock newBlock = getLocatedBlocks(p).getLastLocatedBlock();
@@ -815,7 +814,6 @@ public void testTruncateWithDataNodesRestartImmediately() throws Exception {
     assertEquals(newBlock.getBlock().getGenerationStamp(),
         oldBlock.getBlock().getGenerationStamp() + 1);
 
-    checkBlockRecovery(p);
     // Wait replicas come to 3
     DFSTestUtil.waitReplication(fs, p, REPLICATION);
     // Old replica is disregarded and replaced with the truncated one on dn0
@@ -859,6 +857,7 @@ public void testTruncateWithDataNodesShutdownImmediately() throws Exception {
     assertFalse(isReady);
 
     cluster.shutdownDataNodes();
+    cluster.setDataNodesDead();
     try {
       for(int i = 0; i < SUCCESS_ATTEMPTS && cluster.isDataNodeUp(); i++) {
         Thread.sleep(SLEEP);
@@ -871,6 +870,7 @@ public void testTruncateWithDataNodesShutdownImmediately() throws Exception {
           StartupOption.REGULAR, null);
       cluster.waitActive();
     }
+    checkBlockRecovery(p);
 
     fs.delete(parent, true);
   }
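
Usage note: the new three-argument restartDataNode(idn, keepPort, expireOnNN), together
with setDataNodeDead() and the stricter shouldWait() check on dn.getLastUpdate(), lets a
test expire a DataNode on the NameNode immediately instead of sleeping out the
heartbeat-expiry interval. Below is a minimal sketch of how a test might use the
overload; it is not part of this patch, the class and method names are hypothetical,
and it assumes the hadoop-hdfs test classpath.

import static org.junit.Assert.assertTrue;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.junit.Test;

// Hypothetical test class illustrating the new overload.
public class TestDataNodeRestartWithExpiry {

  @Test
  public void testRestartExpiresHeartbeat() throws Exception {
    Configuration conf = new HdfsConfiguration();
    MiniDFSCluster cluster =
        new MiniDFSCluster.Builder(conf).numDataNodes(3).build();
    try {
      cluster.waitActive();
      // Restart DataNode 0 on the same port (keepPort == true) and mark it
      // dead on the NameNode (expireOnNN == true). Per this patch,
      // setDataNodeDead() zeroes the descriptor's lastUpdate and re-runs the
      // heartbeat check, so the NameNode stops targeting the node until it
      // re-registers.
      assertTrue(cluster.restartDataNode(0, true, true));
      // waitActive() now blocks until the restarted node heartbeats again,
      // because shouldWait() also treats dn.getLastUpdate() <= 0 as
      // "no heartbeat yet".
      cluster.waitActive();
    } finally {
      cluster.shutdown();
    }
  }
}

Expiring the heartbeat explicitly, rather than waiting for the NameNode to notice the
dead node on its own, is what makes the truncate-recovery tests deterministic.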