From 1354ec1c74423048bee04ea2472e481f5e4f8095 Mon Sep 17 00:00:00 2001
From: Kihwal Lee <kihwal@apache.org>
Date: Wed, 29 Oct 2014 17:25:51 -0500
Subject: [PATCH] HDFS-7300. The getMaxNodesPerRack() method in
 BlockPlacementPolicyDefault is flawed. contributed by Kihwal Lee (cherry
 picked from commit 3ae84e1ba8928879b3eda90e79667ba5a45d60f8)

---
 hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt   |  4 ++
 .../BlockPlacementPolicyDefault.java          | 42 ++++++++++++++--
 .../hadoop/hdfs/TestFileAppendRestart.java    |  2 +-
 .../blockmanagement/TestBlockManager.java     |  3 +-
 .../TestReplicationPolicy.java                | 49 +++++++++++++++++++
 5 files changed, 95 insertions(+), 5 deletions(-)

diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
index 7e465ae35a8..bc396b84786 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
+++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
@@ -853,6 +853,10 @@ Release 2.6.0 - UNRELEASED
     HDFS-7287. The OfflineImageViewer (OIV) can output invalid XML depending on
     the filename (Ravi Prakash via Colin P. McCabe)
 
+    HDFS-7300. The getMaxNodesPerRack() method in BlockPlacementPolicyDefault
+    is flawed (kihwal)
+
+
   BREAKDOWN OF HDFS-6584 ARCHIVAL STORAGE
 
     HDFS-6677. Change INodeFile and FSImage to support storage policy ID.
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockPlacementPolicyDefault.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockPlacementPolicyDefault.java
index 99f509e8d04..5b02384737b 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockPlacementPolicyDefault.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockPlacementPolicyDefault.java
@@ -139,13 +139,17 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
       List<DatanodeStorageInfo> results = new ArrayList<DatanodeStorageInfo>();
       boolean avoidStaleNodes = stats != null
           && stats.isAvoidingStaleDataNodesForWrite();
+
+      int maxNodesAndReplicas[] = getMaxNodesPerRack(0, numOfReplicas);
+      numOfReplicas = maxNodesAndReplicas[0];
+      int maxNodesPerRack = maxNodesAndReplicas[1];
+
       for (int i = 0; i < favoredNodes.size() && results.size() < numOfReplicas; i++) {
         DatanodeDescriptor favoredNode = favoredNodes.get(i);
         // Choose a single node which is local to favoredNode.
         // 'results' is updated within chooseLocalNode
         final DatanodeStorageInfo target = chooseLocalStorage(favoredNode,
-            favoriteAndExcludedNodes, blocksize, 
-            getMaxNodesPerRack(results.size(), numOfReplicas)[1],
+            favoriteAndExcludedNodes, blocksize, maxNodesPerRack,
             results, avoidStaleNodes, storageTypes, false);
         if (target == null) {
           LOG.warn("Could not find a target for file " + src
@@ -221,6 +225,19 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
         results.toArray(new DatanodeStorageInfo[results.size()]));
   }
 
+  /**
+   * Calculate the maximum number of replicas to allocate per rack. It also
+   * limits the total number of replicas to the total number of nodes in the
+   * cluster. Caller should adjust the replica count to the return value.
+   *
+   * @param numOfChosen The number of already chosen nodes.
+   * @param numOfReplicas The number of additional nodes to allocate.
+   * @return integer array. Index 0: The number of nodes allowed to allocate
+   *         in addition to already chosen nodes.
+   *         Index 1: The maximum allowed number of nodes per rack. This
+   *         is independent of the number of chosen nodes, as it is calculated
+   *         using the target number of replicas.
+   */
   private int[] getMaxNodesPerRack(int numOfChosen, int numOfReplicas) {
     int clusterSize = clusterMap.getNumOfLeaves();
     int totalNumOfReplicas = numOfChosen + numOfReplicas;
@@ -228,7 +245,26 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
       numOfReplicas -= (totalNumOfReplicas-clusterSize);
       totalNumOfReplicas = clusterSize;
     }
-    int maxNodesPerRack = (totalNumOfReplicas-1)/clusterMap.getNumOfRacks()+2;
+    // No calculation needed when there is only one rack or picking one node.
+    int numOfRacks = clusterMap.getNumOfRacks();
+    if (numOfRacks == 1 || totalNumOfReplicas <= 1) {
+      return new int[] {numOfReplicas, totalNumOfReplicas};
+    }
+
+    int maxNodesPerRack = (totalNumOfReplicas-1)/numOfRacks + 2;
+    // At this point, there are more than one racks and more than one replicas
+    // to store. Avoid all replicas being in the same rack.
+    //
+    // maxNodesPerRack has the following properties at this stage.
+    //   1) maxNodesPerRack >= 2
+    //   2) (maxNodesPerRack-1) * numOfRacks > totalNumOfReplicas
+    //          when numOfRacks > 1
+    //
+    // Thus, the following adjustment will still result in a value that forces
+    // multi-rack allocation and gives enough number of total nodes.
+    if (maxNodesPerRack == totalNumOfReplicas) {
+      maxNodesPerRack--;
+    }
     return new int[] {numOfReplicas, maxNodesPerRack};
   }
 
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileAppendRestart.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileAppendRestart.java
index f557fd5aec4..0bca23d1edc 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileAppendRestart.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileAppendRestart.java
@@ -188,7 +188,7 @@ public class TestFileAppendRestart {
     try {
       cluster = new MiniDFSCluster.Builder(conf).manageDataDfsDirs(true)
           .manageNameDfsDirs(true).numDataNodes(4)
-          .racks(new String[] { "/rack1", "/rack1", "/rack1", "/rack2" })
+          .racks(new String[] { "/rack1", "/rack1", "/rack2", "/rack2" })
           .build();
       cluster.waitActive();
 
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java
index 7c0623cd468..b444ccc9134 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java
@@ -281,7 +281,8 @@ public class TestBlockManager {
     assertTrue("Source of replication should be one of the nodes the block " +
         "was on. Was: " + pipeline[0],
         origStorages.contains(pipeline[0]));
-    assertEquals("Should have three targets", 3, pipeline.length);
+    // Only up to two nodes can be picked per rack when there are two racks.
+    assertEquals("Should have two targets", 2, pipeline.length);
     
     boolean foundOneOnRackB = false;
     for (int i = 1; i < pipeline.length; i++) {
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestReplicationPolicy.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestReplicationPolicy.java
index b7ffe74e03f..1e514af7e16 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestReplicationPolicy.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestReplicationPolicy.java
@@ -429,6 +429,55 @@ public class TestReplicationPolicy {
     assertFalse(isOnSameRack(targets[0], targets[1]));
   }
 
+  /**
+   * In this testcase, there are enough total number of nodes, but only
+   * one rack is actually available.
+   * @throws Exception
+   */
+  @Test
+  public void testChooseTarget6() throws Exception {
+    DatanodeStorageInfo storage = DFSTestUtil.createDatanodeStorageInfo(
+        "DS-xxxx", "7.7.7.7", "/d2/r3", "host7");
+    DatanodeDescriptor newDn = storage.getDatanodeDescriptor();
+    Set<Node> excludedNodes;
+    List<DatanodeStorageInfo> chosenNodes = new ArrayList<DatanodeStorageInfo>();
+
+    excludedNodes = new HashSet<Node>();
+    excludedNodes.add(dataNodes[0]);
+    excludedNodes.add(dataNodes[1]);
+    excludedNodes.add(dataNodes[2]);
+    excludedNodes.add(dataNodes[3]);
+
+    DatanodeStorageInfo[] targets;
+    // Only two nodes available in a rack. Try picking two nodes. Only one
+    // should return.
+    targets = chooseTarget(2, chosenNodes, excludedNodes);
+    assertEquals(1, targets.length);
+
+    // Make three nodes available in a rack.
+    final BlockManager bm = namenode.getNamesystem().getBlockManager();
+    bm.getDatanodeManager().getNetworkTopology().add(newDn);
+    bm.getDatanodeManager().getHeartbeatManager().addDatanode(newDn);
+    updateHeartbeatWithUsage(newDn,
+        2*HdfsConstants.MIN_BLOCKS_FOR_WRITE*BLOCK_SIZE, 0L,
+        2*HdfsConstants.MIN_BLOCKS_FOR_WRITE*BLOCK_SIZE, 0L, 0L, 0L, 0, 0);
+
+    // Try picking three nodes. Only two should return.
+    excludedNodes.clear();
+    excludedNodes.add(dataNodes[0]);
+    excludedNodes.add(dataNodes[1]);
+    excludedNodes.add(dataNodes[2]);
+    excludedNodes.add(dataNodes[3]);
+    chosenNodes.clear();
+    try {
+      targets = chooseTarget(3, chosenNodes, excludedNodes);
+      assertEquals(2, targets.length);
+    } finally {
+      bm.getDatanodeManager().getNetworkTopology().remove(newDn);
+    }
+  }
+
+
   /**
    * In this testcase, it tries to choose more targets than available nodes and
    * check the result, with stale node avoidance on the write path enabled.