From 1354ec1c74423048bee04ea2472e481f5e4f8095 Mon Sep 17 00:00:00 2001 From: Kihwal Lee Date: Wed, 29 Oct 2014 17:25:51 -0500 Subject: [PATCH] HDFS-7300. The getMaxNodesPerRack() method in BlockPlacementPolicyDefault is flawed. contributed by Kihwal Lee (cherry picked from commit 3ae84e1ba8928879b3eda90e79667ba5a45d60f8) --- hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt | 4 ++ .../BlockPlacementPolicyDefault.java | 42 ++++++++++++++-- .../hadoop/hdfs/TestFileAppendRestart.java | 2 +- .../blockmanagement/TestBlockManager.java | 3 +- .../TestReplicationPolicy.java | 49 +++++++++++++++++++ 5 files changed, 95 insertions(+), 5 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt index 7e465ae35a8..bc396b84786 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt @@ -853,6 +853,10 @@ Release 2.6.0 - UNRELEASED HDFS-7287. The OfflineImageViewer (OIV) can output invalid XML depending on the filename (Ravi Prakash via Colin P. McCabe) + HDFS-7300. The getMaxNodesPerRack() method in BlockPlacementPolicyDefault + is flawed (kihwal) + + BREAKDOWN OF HDFS-6584 ARCHIVAL STORAGE HDFS-6677. Change INodeFile and FSImage to support storage policy ID. diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockPlacementPolicyDefault.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockPlacementPolicyDefault.java index 99f509e8d04..5b02384737b 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockPlacementPolicyDefault.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockPlacementPolicyDefault.java @@ -139,13 +139,17 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy { List results = new ArrayList(); boolean avoidStaleNodes = stats != null && stats.isAvoidingStaleDataNodesForWrite(); + + int maxNodesAndReplicas[] = getMaxNodesPerRack(0, numOfReplicas); + numOfReplicas = maxNodesAndReplicas[0]; + int maxNodesPerRack = maxNodesAndReplicas[1]; + for (int i = 0; i < favoredNodes.size() && results.size() < numOfReplicas; i++) { DatanodeDescriptor favoredNode = favoredNodes.get(i); // Choose a single node which is local to favoredNode. // 'results' is updated within chooseLocalNode final DatanodeStorageInfo target = chooseLocalStorage(favoredNode, - favoriteAndExcludedNodes, blocksize, - getMaxNodesPerRack(results.size(), numOfReplicas)[1], + favoriteAndExcludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes, storageTypes, false); if (target == null) { LOG.warn("Could not find a target for file " + src @@ -221,6 +225,19 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy { results.toArray(new DatanodeStorageInfo[results.size()])); } + /** + * Calculate the maximum number of replicas to allocate per rack. It also + * limits the total number of replicas to the total number of nodes in the + * cluster. Caller should adjust the replica count to the return value. + * + * @param numOfChosen The number of already chosen nodes. + * @param numOfReplicas The number of additional nodes to allocate. + * @return integer array. Index 0: The number of nodes allowed to allocate + * in addition to already chosen nodes. + * Index 1: The maximum allowed number of nodes per rack. This + * is independent of the number of chosen nodes, as it is calculated + * using the target number of replicas. + */ private int[] getMaxNodesPerRack(int numOfChosen, int numOfReplicas) { int clusterSize = clusterMap.getNumOfLeaves(); int totalNumOfReplicas = numOfChosen + numOfReplicas; @@ -228,7 +245,26 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy { numOfReplicas -= (totalNumOfReplicas-clusterSize); totalNumOfReplicas = clusterSize; } - int maxNodesPerRack = (totalNumOfReplicas-1)/clusterMap.getNumOfRacks()+2; + // No calculation needed when there is only one rack or picking one node. + int numOfRacks = clusterMap.getNumOfRacks(); + if (numOfRacks == 1 || totalNumOfReplicas <= 1) { + return new int[] {numOfReplicas, totalNumOfReplicas}; + } + + int maxNodesPerRack = (totalNumOfReplicas-1)/numOfRacks + 2; + // At this point, there are more than one racks and more than one replicas + // to store. Avoid all replicas being in the same rack. + // + // maxNodesPerRack has the following properties at this stage. + // 1) maxNodesPerRack >= 2 + // 2) (maxNodesPerRack-1) * numOfRacks > totalNumOfReplicas + // when numOfRacks > 1 + // + // Thus, the following adjustment will still result in a value that forces + // multi-rack allocation and gives enough number of total nodes. + if (maxNodesPerRack == totalNumOfReplicas) { + maxNodesPerRack--; + } return new int[] {numOfReplicas, maxNodesPerRack}; } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileAppendRestart.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileAppendRestart.java index f557fd5aec4..0bca23d1edc 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileAppendRestart.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileAppendRestart.java @@ -188,7 +188,7 @@ public class TestFileAppendRestart { try { cluster = new MiniDFSCluster.Builder(conf).manageDataDfsDirs(true) .manageNameDfsDirs(true).numDataNodes(4) - .racks(new String[] { "/rack1", "/rack1", "/rack1", "/rack2" }) + .racks(new String[] { "/rack1", "/rack1", "/rack2", "/rack2" }) .build(); cluster.waitActive(); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java index 7c0623cd468..b444ccc9134 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java @@ -281,7 +281,8 @@ public class TestBlockManager { assertTrue("Source of replication should be one of the nodes the block " + "was on. Was: " + pipeline[0], origStorages.contains(pipeline[0])); - assertEquals("Should have three targets", 3, pipeline.length); + // Only up to two nodes can be picked per rack when there are two racks. + assertEquals("Should have two targets", 2, pipeline.length); boolean foundOneOnRackB = false; for (int i = 1; i < pipeline.length; i++) { diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestReplicationPolicy.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestReplicationPolicy.java index b7ffe74e03f..1e514af7e16 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestReplicationPolicy.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestReplicationPolicy.java @@ -429,6 +429,55 @@ public class TestReplicationPolicy { assertFalse(isOnSameRack(targets[0], targets[1])); } + /** + * In this testcase, there are enough total number of nodes, but only + * one rack is actually available. + * @throws Exception + */ + @Test + public void testChooseTarget6() throws Exception { + DatanodeStorageInfo storage = DFSTestUtil.createDatanodeStorageInfo( + "DS-xxxx", "7.7.7.7", "/d2/r3", "host7"); + DatanodeDescriptor newDn = storage.getDatanodeDescriptor(); + Set excludedNodes; + List chosenNodes = new ArrayList(); + + excludedNodes = new HashSet(); + excludedNodes.add(dataNodes[0]); + excludedNodes.add(dataNodes[1]); + excludedNodes.add(dataNodes[2]); + excludedNodes.add(dataNodes[3]); + + DatanodeStorageInfo[] targets; + // Only two nodes available in a rack. Try picking two nodes. Only one + // should return. + targets = chooseTarget(2, chosenNodes, excludedNodes); + assertEquals(1, targets.length); + + // Make three nodes available in a rack. + final BlockManager bm = namenode.getNamesystem().getBlockManager(); + bm.getDatanodeManager().getNetworkTopology().add(newDn); + bm.getDatanodeManager().getHeartbeatManager().addDatanode(newDn); + updateHeartbeatWithUsage(newDn, + 2*HdfsConstants.MIN_BLOCKS_FOR_WRITE*BLOCK_SIZE, 0L, + 2*HdfsConstants.MIN_BLOCKS_FOR_WRITE*BLOCK_SIZE, 0L, 0L, 0L, 0, 0); + + // Try picking three nodes. Only two should return. + excludedNodes.clear(); + excludedNodes.add(dataNodes[0]); + excludedNodes.add(dataNodes[1]); + excludedNodes.add(dataNodes[2]); + excludedNodes.add(dataNodes[3]); + chosenNodes.clear(); + try { + targets = chooseTarget(3, chosenNodes, excludedNodes); + assertEquals(2, targets.length); + } finally { + bm.getDatanodeManager().getNetworkTopology().remove(newDn); + } + } + + /** * In this testcase, it tries to choose more targets than available nodes and * check the result, with stale node avoidance on the write path enabled.