HDFS-10477. Stopping the decommission of a rack of DataNodes caused the NameNode to fail over to standby. Contributed by yunjiong zhao and Wei-Chiu Chuang.

(cherry picked from commit be488b6070) (cherry picked from commit c8703dda07)
2019-04-03 11:00:12 -07:00 · 2019-04-03 11:00:12 -07:00 · 084fb9de22
parent 875435dc7d
commit 084fb9de22
1 changed files with 32 additions and 12 deletions
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
@@ -4186,8 +4186,15 @@ public class BlockManager implements BlockStatsMXBean {
    if (!isPopulatingReplQueues()) {
      return;
    }
-    final Iterator<BlockInfo> it = srcNode.getBlockIterator();
+
    int numExtraRedundancy = 0;
+    for (DatanodeStorageInfo datanodeStorageInfo : srcNode.getStorageInfos()) {
+      // the namesystem lock is released between iterations. Make sure the
+      // storage is not removed before continuing.
+      if (srcNode.getStorageInfo(datanodeStorageInfo.getStorageID()) == null) {
+        continue;
+      }
+      final Iterator<BlockInfo> it = datanodeStorageInfo.getBlockIterator();
      while(it.hasNext()) {
        final BlockInfo block = it.next();
        if (block.isDeleted()) {
@@ -4203,6 +4210,19 @@ public class BlockManager implements BlockStatsMXBean {
          numExtraRedundancy++;
        }
      }
+      // When called by tests like TestDefaultBlockPlacementPolicy.
+      // testPlacementWithLocalRackNodesDecommissioned, it is not protected by
+      // lock; the writeLock is held only via DatanodeManager.refreshNodes.
+      if (namesystem.hasWriteLock()) {
+        namesystem.writeUnlock();
+        try {
+          Thread.sleep(1);
+        } catch (InterruptedException e) {
+          Thread.currentThread().interrupt();
+        }
+        namesystem.writeLock();
+      }
+    }
    LOG.info("Invalidated {} extra redundancy blocks on {} after "
             + "it is in service", numExtraRedundancy, srcNode);
  }