From 616ed9084be4dc337c2ececa7aecc4ab7899a75a Mon Sep 17 00:00:00 2001 From: Kihwal Lee Date: Thu, 5 Nov 2015 09:27:36 -0600 Subject: [PATCH] HDFS-4937. ReplicationMonitor can infinite-loop in BlockPlacementPolicyDefault#chooseRandom(). Contributed by Kihwal Lee. (cherry picked from commit ff47f35deed14ba6463cba76f0e6a6c15abb3eca) --- hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt | 3 +++ .../blockmanagement/BlockPlacementPolicyDefault.java | 9 +++++++++ 2 files changed, 12 insertions(+) diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt index 45ce3107a03..3e62c5d0567 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt @@ -15,6 +15,9 @@ Release 2.7.3 - UNRELEASED HDFS-9289. Make DataStreamer#block thread safe and verify genStamp in commitBlock. (Chang Li via kihwal) + HDFS-4937. ReplicationMonitor can infinite-loop in + BlockPlacementPolicyDefault#chooseRandom(). (kihwal) + Release 2.7.2 - UNRELEASED INCOMPATIBLE CHANGES diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockPlacementPolicyDefault.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockPlacementPolicyDefault.java index 97ea7829c72..93056e94084 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockPlacementPolicyDefault.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockPlacementPolicyDefault.java @@ -622,6 +622,7 @@ protected DatanodeStorageInfo chooseRandom(int numOfReplicas, int numOfAvailableNodes = clusterMap.countNumOfAvailableNodes( scope, excludedNodes); + int refreshCounter = numOfAvailableNodes; StringBuilder builder = null; if (LOG.isDebugEnabled()) { builder = debugLoggingBuilder.get(); @@ -675,6 +676,14 @@ protected DatanodeStorageInfo chooseRandom(int numOfReplicas, // If no candidate storage was found on this DN then set badTarget. badTarget = (i == storages.length); } + // Refresh the node count. If the live node count became smaller, + // but it is not reflected in this loop, it may loop forever in case + // the replicas/rack cannot be satisfied. + if (--refreshCounter == 0) { + numOfAvailableNodes = clusterMap.countNumOfAvailableNodes(scope, + excludedNodes); + refreshCounter = numOfAvailableNodes; + } } if (numOfReplicas>0) {