diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt index c192a10ee02..ff245673cff 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt @@ -181,6 +181,9 @@ Release 2.1.1-beta - UNRELEASED HDFS-5077. NPE in FSNamesystem.commitBlockSynchronization(). (Plamen Jeliazkov via shv) + HDFS-5140. Too many safemode monitor threads being created in the standby + namenode causing it to fail with out of memory error. (jing9) + Release 2.1.0-beta - 2013-08-22 INCOMPATIBLE CHANGES diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java index c9b58bf83e2..b0aed580551 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java @@ -351,7 +351,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats, final LeaseManager leaseManager = new LeaseManager(this); - Daemon smmthread = null; // SafeModeMonitor thread + volatile Daemon smmthread = null; // SafeModeMonitor thread Daemon nnrmthread = null; // NamenodeResourceMonitor thread @@ -4538,7 +4538,9 @@ public class FSNamesystem implements Namesystem, FSClusterStats, // Have to have write-lock since leaving safemode initializes // repl queues, which requires write lock assert hasWriteLock(); - if (needEnter()) { + // if smmthread is already running, the block threshold must have been + // reached before, there is no need to enter the safe mode again + if (smmthread == null && needEnter()) { enter(); // check if we are ready to initialize replication queues if (canInitializeReplQueues() && !isPopulatingReplQueues()) { @@ -4547,7 +4549,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats, reportStatus("STATE* Safe mode ON.", false); return; } - // the threshold is reached + // the threshold is reached or was reached before if (!isOn() || // safe mode is off extension <= 0 || threshold <= 0) { // don't need to wait this.leave(); // leave safe mode @@ -4559,9 +4561,11 @@ public class FSNamesystem implements Namesystem, FSClusterStats, } // start monitor reached = now(); - smmthread = new Daemon(new SafeModeMonitor()); - smmthread.start(); - reportStatus("STATE* Safe mode extension entered.", true); + if (smmthread == null) { + smmthread = new Daemon(new SafeModeMonitor()); + smmthread.start(); + reportStatus("STATE* Safe mode extension entered.", true); + } // check if we are ready to initialize replication queues if (canInitializeReplQueues() && !isPopulatingReplQueues()) { @@ -4797,6 +4801,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats, if (safeMode.canLeave()) { // Leave safe mode. safeMode.leave(); + smmthread = null; break; } } finally { @@ -4812,7 +4817,6 @@ public class FSNamesystem implements Namesystem, FSClusterStats, if (!fsRunning) { LOG.info("NameNode is being shutdown, exit SafeModeMonitor thread"); } - smmthread = null; } }