HDFS-5140. Too many safemode monitor threads are created in the standby namenode, causing it to fail with an out-of-memory error. Contributed by Jing Zhao.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1518899 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Jing Zhao 2013-08-30 07:36:45 +00:00
parent daa0713eb0
commit 25d4c2fd53
2 changed files with 14 additions and 7 deletions

View File

@ -407,6 +407,9 @@ Release 2.1.1-beta - UNRELEASED
HDFS-5077. NPE in FSNamesystem.commitBlockSynchronization().
(Plamen Jeliazkov via shv)
HDFS-5140. Too many safemode monitor threads being created in the standby
namenode, causing it to fail with an out-of-memory error. (jing9)
Release 2.1.0-beta - 2013-08-22
INCOMPATIBLE CHANGES

View File

@ -366,7 +366,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
final LeaseManager leaseManager = new LeaseManager(this); final LeaseManager leaseManager = new LeaseManager(this);
Daemon smmthread = null; // SafeModeMonitor thread volatile Daemon smmthread = null; // SafeModeMonitor thread
Daemon nnrmthread = null; // NamenodeResourceMonitor thread Daemon nnrmthread = null; // NamenodeResourceMonitor thread
@ -4555,7 +4555,9 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
// Have to have write-lock since leaving safemode initializes // Have to have write-lock since leaving safemode initializes
// repl queues, which requires write lock // repl queues, which requires write lock
assert hasWriteLock(); assert hasWriteLock();
if (needEnter()) { // if smmthread is already running, the block threshold must have been
// reached before, there is no need to enter the safe mode again
if (smmthread == null && needEnter()) {
enter(); enter();
// check if we are ready to initialize replication queues // check if we are ready to initialize replication queues
if (canInitializeReplQueues() && !isPopulatingReplQueues()) { if (canInitializeReplQueues() && !isPopulatingReplQueues()) {
@ -4564,7 +4566,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
reportStatus("STATE* Safe mode ON.", false); reportStatus("STATE* Safe mode ON.", false);
return; return;
} }
// the threshold is reached // the threshold is reached or was reached before
if (!isOn() || // safe mode is off if (!isOn() || // safe mode is off
extension <= 0 || threshold <= 0) { // don't need to wait extension <= 0 || threshold <= 0) { // don't need to wait
this.leave(); // leave safe mode this.leave(); // leave safe mode
@ -4576,9 +4578,11 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
} }
// start monitor // start monitor
reached = now(); reached = now();
smmthread = new Daemon(new SafeModeMonitor()); if (smmthread == null) {
smmthread.start(); smmthread = new Daemon(new SafeModeMonitor());
reportStatus("STATE* Safe mode extension entered.", true); smmthread.start();
reportStatus("STATE* Safe mode extension entered.", true);
}
// check if we are ready to initialize replication queues // check if we are ready to initialize replication queues
if (canInitializeReplQueues() && !isPopulatingReplQueues()) { if (canInitializeReplQueues() && !isPopulatingReplQueues()) {
@ -4814,6 +4818,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
if (safeMode.canLeave()) { if (safeMode.canLeave()) {
// Leave safe mode. // Leave safe mode.
safeMode.leave(); safeMode.leave();
smmthread = null;
break; break;
} }
} finally { } finally {
@ -4829,7 +4834,6 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
if (!fsRunning) { if (!fsRunning) {
LOG.info("NameNode is being shutdown, exit SafeModeMonitor thread"); LOG.info("NameNode is being shutdown, exit SafeModeMonitor thread");
} }
smmthread = null;
} }
} }