From 1dd48b1aee2378c02ee7e78864a757cff3607274 Mon Sep 17 00:00:00 2001 From: Tsz-wo Sze Date: Thu, 11 Aug 2011 22:16:16 +0000 Subject: [PATCH] HDFS-2229. Fix a deadlock in namenode by enforcing lock acquisition ordering. git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1156847 13f79535-47bb-0310-9956-ffa450edef68 --- hdfs/CHANGES.txt | 3 + .../server/blockmanagement/BlockManager.java | 56 +++++++++---------- .../hdfs/server/namenode/FSNamesystem.java | 19 ++++--- 3 files changed, 42 insertions(+), 36 deletions(-) diff --git a/hdfs/CHANGES.txt b/hdfs/CHANGES.txt index ff0ba15b050..fc2d3e5a1bd 100644 --- a/hdfs/CHANGES.txt +++ b/hdfs/CHANGES.txt @@ -954,6 +954,9 @@ Trunk (unreleased changes) HDFS-2245. Fix a NullPointerException in BlockManager.chooseTarget(..). (szetszwo) + HDFS-2229. Fix a deadlock in namenode by enforcing lock acquisition + ordering. (szetszwo) + BREAKDOWN OF HDFS-1073 SUBTASKS HDFS-1521. Persist transaction ID on disk between NN restarts. diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java index f60530b4cc4..19e604c3a73 100644 --- a/hdfs/src/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java +++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java @@ -1829,39 +1829,37 @@ public class BlockManager { * over or under replicated. Place it into the respective queue. */ public void processMisReplicatedBlocks() { - long nrInvalid = 0, nrOverReplicated = 0, nrUnderReplicated = 0; - namesystem.writeLock(); - try { - neededReplications.clear(); - for (BlockInfo block : blocksMap.getBlocks()) { - INodeFile fileINode = block.getINode(); - if (fileINode == null) { - // block does not belong to any file - nrInvalid++; - addToInvalidates(block); - continue; - } - // calculate current replication - short expectedReplication = fileINode.getReplication(); - NumberReplicas num = countNodes(block); - int numCurrentReplica = num.liveReplicas(); - // add to under-replicated queue if need to be - if (isNeededReplication(block, expectedReplication, numCurrentReplica)) { - if (neededReplications.add(block, numCurrentReplica, num - .decommissionedReplicas(), expectedReplication)) { - nrUnderReplicated++; - } - } + assert namesystem.hasWriteLock(); - if (numCurrentReplica > expectedReplication) { - // over-replicated block - nrOverReplicated++; - processOverReplicatedBlock(block, expectedReplication, null, null); + long nrInvalid = 0, nrOverReplicated = 0, nrUnderReplicated = 0; + neededReplications.clear(); + for (BlockInfo block : blocksMap.getBlocks()) { + INodeFile fileINode = block.getINode(); + if (fileINode == null) { + // block does not belong to any file + nrInvalid++; + addToInvalidates(block); + continue; + } + // calculate current replication + short expectedReplication = fileINode.getReplication(); + NumberReplicas num = countNodes(block); + int numCurrentReplica = num.liveReplicas(); + // add to under-replicated queue if need to be + if (isNeededReplication(block, expectedReplication, numCurrentReplica)) { + if (neededReplications.add(block, numCurrentReplica, num + .decommissionedReplicas(), expectedReplication)) { + nrUnderReplicated++; } } - } finally { - namesystem.writeUnlock(); + + if (numCurrentReplica > expectedReplication) { + // over-replicated block + nrOverReplicated++; + processOverReplicatedBlock(block, expectedReplication, null, null); + } } + LOG.info("Total number of blocks = " + blocksMap.size()); LOG.info("Number of invalid blocks = " + nrInvalid); LOG.info("Number of under-replicated blocks = " + nrUnderReplicated); diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java index 5ddd92b4058..d4cb0f8ae51 100644 --- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java +++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java @@ -313,14 +313,19 @@ public class FSNamesystem implements RwLock, FSClusterStats, * Activate FSNamesystem daemons. */ void activate(Configuration conf) throws IOException { - setBlockTotal(); - blockManager.activate(conf); - this.lmthread = new Daemon(leaseManager.new Monitor()); - lmthread.start(); - - this.nnrmthread = new Daemon(new NameNodeResourceMonitor()); - nnrmthread.start(); + writeLock(); + try { + setBlockTotal(); + blockManager.activate(conf); + this.lmthread = new Daemon(leaseManager.new Monitor()); + lmthread.start(); + this.nnrmthread = new Daemon(new NameNodeResourceMonitor()); + nnrmthread.start(); + } finally { + writeUnlock(); + } + registerMXBean(); DefaultMetricsSystem.instance().register(this); }