From 27d3a51b56f49cdc2ba4ed0703b1a99751bd3ebd Mon Sep 17 00:00:00 2001 From: Kihwal Lee Date: Fri, 7 Jun 2013 20:05:17 +0000 Subject: [PATCH] svn merge -c 1490803 Merging from trunk to branch-2 to fix HDFS-4832. git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-2@1490805 13f79535-47bb-0310-9956-ffa450edef68 --- hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt | 3 ++ .../server/blockmanagement/BlockManager.java | 2 +- .../blockmanagement/DatanodeManager.java | 8 +++- .../blockmanagement/HeartbeatManager.java | 4 +- .../hdfs/server/namenode/FSNamesystem.java | 9 ++++- .../server/namenode/TestFSNamesystem.java | 38 +++++++++++++++++++ 6 files changed, 58 insertions(+), 6 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt index 5cef345c174..9132a7ef810 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt @@ -2940,6 +2940,9 @@ Release 0.23.9 - UNRELEASED HDFS-4862. SafeModeInfo.isManual() returns true when resources are low even if it wasn't entered into manually (Ravi Prakash via kihwal) + HDFS-4832. 
Namenode doesn't change the number of missing blocks in + safemode when DNs rejoin or leave (Ravi Prakash via kihwal) + Release 0.23.8 - 2013-06-05 INCOMPATIBLE CHANGES diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java index 132ca68e03d..4e082b702bc 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java @@ -2156,7 +2156,7 @@ assert storedBlock.findDatanode(dn) < 0 : "Block " + block return storedBlock; } - // do not try to handle over/under-replicated blocks during safe mode + // do not try to handle over/under-replicated blocks during first safe mode if (!namesystem.isPopulatingReplQueues()) { return storedBlock; } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java index d20c50b0e18..99054b3aae8 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java @@ -1190,7 +1190,13 @@ public class DatanodeManager { heartbeatManager.updateHeartbeat(nodeinfo, capacity, dfsUsed, remaining, blockPoolUsed, xceiverCount, failedVolumes); - + + // If we are in safemode, do not send back any recovery / replication + // requests. Don't even drain the existing queue of work. 
+ if(namesystem.isInSafeMode()) { + return new DatanodeCommand[0]; + } + //check lease recovery BlockInfoUnderConstruction[] blocks = nodeinfo .getLeaseRecoveryCommand(Integer.MAX_VALUE); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java index a033da36fbc..0bff1bf52f7 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java @@ -223,7 +223,7 @@ class HeartbeatManager implements DatanodeStatistics { final DatanodeManager dm = blockManager.getDatanodeManager(); // It's OK to check safe mode w/o taking the lock here, we re-check // for safe mode after taking the lock before removing a datanode. - if (namesystem.isInSafeMode()) { + if (namesystem.isInStartupSafeMode()) { return; } boolean allAlive = false; @@ -252,7 +252,7 @@ class HeartbeatManager implements DatanodeStatistics { // acquire the fsnamesystem lock, and then remove the dead node. 
namesystem.writeLock(); try { - if (namesystem.isInSafeMode()) { + if (namesystem.isInStartupSafeMode()) { return; } synchronized(this) { diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java index 453453539ee..907a41db3c1 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java @@ -4076,7 +4076,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats, * * @see SafeModeInfo */ - private SafeModeInfo(boolean resourcesLow) { + private SafeModeInfo(boolean resourcesLow, boolean isReplQueuesInited) { this.threshold = 1.5f; // this threshold can never be reached this.datanodeThreshold = Integer.MAX_VALUE; this.extension = Integer.MAX_VALUE; @@ -4085,6 +4085,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats, this.blockTotal = -1; this.blockSafe = -1; this.resourcesLow = resourcesLow; + this.initializedReplQueues = isReplQueuesInited; enter(); reportStatus("STATE* Safe mode is ON.", true); } @@ -4510,6 +4511,10 @@ public class FSNamesystem implements Namesystem, FSClusterStats, && safeMode.isOn(); } + /** + * Check if replication queues are to be populated + * @return true when node is HAState.Active and not in the very first safemode + */ @Override public boolean isPopulatingReplQueues() { if (!shouldPopulateReplQueues()) { @@ -4640,7 +4645,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats, getEditLog().logSyncAll(); } if (!isInSafeMode()) { - safeMode = new SafeModeInfo(resourcesLow); + safeMode = new SafeModeInfo(resourcesLow, isPopulatingReplQueues()); return; } if (resourcesLow) { diff --git 
a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFSNamesystem.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFSNamesystem.java index fcb2086886b..01ea90a32dd 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFSNamesystem.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFSNamesystem.java @@ -34,9 +34,12 @@ import org.apache.hadoop.hdfs.DFSTestUtil; import org.apache.hadoop.hdfs.HdfsConfiguration; import org.apache.hadoop.hdfs.MiniDFSCluster; import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.NamenodeRole; +import org.apache.hadoop.hdfs.server.namenode.ha.HAContext; +import org.apache.hadoop.hdfs.server.namenode.ha.HAState; import org.junit.After; import org.junit.Test; import org.mockito.Mockito; +import org.mockito.internal.util.reflection.Whitebox; public class TestFSNamesystem { @@ -104,4 +107,39 @@ public class TestFSNamesystem { assertTrue("After entering safemode due to low resources FSNamesystem." 
+ "isInSafeMode still returned false", fsn.isInSafeMode()); } + + @Test + public void testReplQueuesActiveAfterStartupSafemode() throws IOException, InterruptedException{ + Configuration conf = new Configuration(); + + FSEditLog fsEditLog = Mockito.mock(FSEditLog.class); + FSImage fsImage = Mockito.mock(FSImage.class); + Mockito.when(fsImage.getEditLog()).thenReturn(fsEditLog); + + FSNamesystem fsNamesystem = new FSNamesystem(conf, fsImage); + FSNamesystem fsn = Mockito.spy(fsNamesystem); + + //Make shouldPopulateReplQueues return true + HAContext haContext = Mockito.mock(HAContext.class); + HAState haState = Mockito.mock(HAState.class); + Mockito.when(haContext.getState()).thenReturn(haState); + Mockito.when(haState.shouldPopulateReplQueues()).thenReturn(true); + Whitebox.setInternalState(fsn, "haContext", haContext); + + //Make NameNode.getNameNodeMetrics() not return null + NameNode.initMetrics(conf, NamenodeRole.NAMENODE); + + fsn.enterSafeMode(false); + assertTrue("FSNamesystem didn't enter safemode", fsn.isInSafeMode()); + assertTrue("Replication queues were being populated during very first " + + "safemode", !fsn.isPopulatingReplQueues()); + fsn.leaveSafeMode(); + assertTrue("FSNamesystem didn't leave safemode", !fsn.isInSafeMode()); + assertTrue("Replication queues weren't being populated even after leaving " + + "safemode", fsn.isPopulatingReplQueues()); + fsn.enterSafeMode(false); + assertTrue("FSNamesystem didn't enter safemode", fsn.isInSafeMode()); + assertTrue("Replication queues weren't being populated after entering " + + "safemode 2nd time", fsn.isPopulatingReplQueues()); + } }