HDFS-2914. HA: Standby should not enter safemode when resources are low. Contributed by Vinay.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-2@1347898 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Aaron Myers 2012-06-08 07:04:58 +00:00
parent fb52b5ec9e
commit 4757836b75
3 changed files with 29 additions and 5 deletions

View File

@ -165,6 +165,8 @@ Release 2.0.1-alpha - UNRELEASED
HDFS-3485. DataTransferThrottler will over-throttle when currentTimeMillis
jumps (Andy Isaacson via todd)
HDFS-2914. HA: Standby should not enter safemode when resources are low. (Vinay via atm)
BREAKDOWN OF HDFS-3042 SUBTASKS
HDFS-2185. HDFS portion of ZK-based FailoverController (todd)

View File

@ -557,8 +557,6 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
!safeMode.isPopulatingReplQueues(); !safeMode.isPopulatingReplQueues();
setBlockTotal(); setBlockTotal();
blockManager.activate(conf); blockManager.activate(conf);
this.nnrmthread = new Daemon(new NameNodeResourceMonitor());
nnrmthread.start();
} finally { } finally {
writeUnlock(); writeUnlock();
} }
@ -575,7 +573,6 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
writeLock(); writeLock();
try { try {
if (blockManager != null) blockManager.close(); if (blockManager != null) blockManager.close();
if (nnrmthread != null) nnrmthread.interrupt();
} finally { } finally {
writeUnlock(); writeUnlock();
} }
@ -629,6 +626,10 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
} }
leaseManager.startMonitor(); leaseManager.startMonitor();
startSecretManagerIfNecessary(); startSecretManagerIfNecessary();
//ResourceMonitor required only at ActiveNN. See HDFS-2914
this.nnrmthread = new Daemon(new NameNodeResourceMonitor());
nnrmthread.start();
} finally { } finally {
writeUnlock(); writeUnlock();
} }
@ -651,6 +652,10 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
if (leaseManager != null) { if (leaseManager != null) {
leaseManager.stopMonitor(); leaseManager.stopMonitor();
} }
if (nnrmthread != null) {
((NameNodeResourceMonitor) nnrmthread.getRunnable()).stopMonitor();
nnrmthread.interrupt();
}
if (dir != null && dir.fsImage != null) { if (dir != null && dir.fsImage != null) {
if (dir.fsImage.editLog != null) { if (dir.fsImage.editLog != null) {
dir.fsImage.editLog.close(); dir.fsImage.editLog.close();
@ -3178,10 +3183,11 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
* acceptable levels, this daemon will cause the NN to exit safe mode. * acceptable levels, this daemon will cause the NN to exit safe mode.
*/ */
class NameNodeResourceMonitor implements Runnable { class NameNodeResourceMonitor implements Runnable {
boolean shouldNNRmRun = true;
@Override @Override
public void run () { public void run () {
try { try {
while (fsRunning) { while (fsRunning && shouldNNRmRun) {
checkAvailableResources(); checkAvailableResources();
if(!nameNodeHasResourcesAvailable()) { if(!nameNodeHasResourcesAvailable()) {
String lowResourcesMsg = "NameNode low on available disk space. "; String lowResourcesMsg = "NameNode low on available disk space. ";
@ -3202,7 +3208,11 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
FSNamesystem.LOG.error("Exception in NameNodeResourceMonitor: ", e); FSNamesystem.LOG.error("Exception in NameNodeResourceMonitor: ", e);
} }
} }
}
/**
 * Requests that this resource monitor stop by clearing shouldNNRmRun, the
 * flag that the run() loop tests together with fsRunning; the loop therefore
 * ends at its next condition check.
 *
 * NOTE(review): shouldNNRmRun is declared as a plain (non-volatile) boolean,
 * and the visible caller interrupts the monitor thread immediately after
 * calling this method - confirm that this is sufficient for the monitor
 * thread to observe the update promptly.
 */
public void stopMonitor() {
shouldNNRmRun = false;
}
}
public FSImage getFSImage() { public FSImage getFSImage() {
return dir.fsImage; return dir.fsImage;

View File

@ -17,6 +17,8 @@
*/ */
package org.apache.hadoop.hdfs.server.namenode.ha; package org.apache.hadoop.hdfs.server.namenode.ha;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_KEY;
import static org.junit.Assert.*; import static org.junit.Assert.*;
import java.io.File; import java.io.File;
@ -127,6 +129,7 @@ public class TestFailureOfSharedDir {
@Test @Test
public void testFailureOfSharedDir() throws Exception { public void testFailureOfSharedDir() throws Exception {
Configuration conf = new Configuration(); Configuration conf = new Configuration();
conf.setLong(DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_KEY, 2000);
// The shared edits dir will automatically be marked required. // The shared edits dir will automatically be marked required.
MiniDFSCluster cluster = null; MiniDFSCluster cluster = null;
@ -151,6 +154,15 @@ public class TestFailureOfSharedDir {
assertEquals(0, FileUtil.chmod(sharedEditsDir.getAbsolutePath(), "-w", assertEquals(0, FileUtil.chmod(sharedEditsDir.getAbsolutePath(), "-w",
true)); true));
Thread.sleep(conf.getLong(DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_KEY,
DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_DEFAULT) * 2);
NameNode nn1 = cluster.getNameNode(1);
assertTrue(nn1.isStandbyState());
assertFalse(
"StandBy NameNode should not go to SafeMode on resource unavailability",
nn1.isInSafeMode());
NameNode nn0 = cluster.getNameNode(0); NameNode nn0 = cluster.getNameNode(0);
nn0.getNamesystem().getFSImage().getEditLog().getJournalSet() nn0.getNamesystem().getFSImage().getEditLog().getJournalSet()
.setRuntimeForTesting(mockRuntime); .setRuntimeForTesting(mockRuntime);