From 465663afbf0463f8e91d978993f7d46ea9c48dc2 Mon Sep 17 00:00:00 2001
From: Todd Lipcon
Date: Mon, 23 Jan 2012 22:22:46 +0000
Subject: [PATCH] HDFS-2804. Should not mark blocks under-replicated when exiting safemode. Contributed by Todd Lipcon.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/HDFS-1623@1235033 13f79535-47bb-0310-9956-ffa450edef68
---
Reviewer notes (placed after the "---" separator, so they are not part of
the commit message):

Restoration
* The patch had been collapsed onto a few long lines. It is laid out again
  with one header field, diff header, hunk header and hunk line per line.
  Hunk line counts were re-checked against every "@@" header and against
  the diffstat below.
* Indentation inside the hunks is reconstructed and is best effort. If an
  exact-match apply fails, retry with whitespace tolerance
  (git apply --ignore-whitespace, or patch -l).
* "new Supplier() {" is restored to "new Supplier<Boolean>() {"; the type
  argument agrees with the declared "public Boolean get()".
* The From: line has no e-mail address. It was absent from the text being
  restored and has not been guessed; add it before using git am.

What the change does
* FSNamesystem gains a private isInStandbyState(). While haContext or its
  state is still null it returns haEnabled; otherwise it returns whether
  the current state is a StandbyState.
* The guard around initializeReplQueues() (hunk at -3345) and
  canInitializeReplQueues() now both require !isInStandbyState().
* isPopulatingReplQueues() returns false whenever isInStandbyState() is
  true, replacing the previous haContext/shouldPopulateReplQueues() check.
* TestHASafeMode sets DFS_HA_TAILEDITS_PERIOD_KEY to 1 and gains a
  regression test: it restarts the standby (nn1) with one datanode
  stopped, waits for it to leave safe mode, and asserts zero
  under-replicated and zero pending-replication blocks.

Possible follow-ups (not applied, to keep hunk counts and blob ids intact)
* "import static org.junit.Assert.*;" overlaps the existing assertTrue
  import.
* The new test carries two commented-out sbConf lines.
* HAUtil is imported but not referenced by any line this patch adds.
  TODO confirm whether it is used elsewhere in the file.
* isPopulatingReplQueues() no longer calls shouldPopulateReplQueues().
  TODO confirm whether that method still has callers.

 .../hadoop-hdfs/CHANGES.HDFS-1623.txt         |  2 +
 .../hdfs/server/namenode/FSNamesystem.java    | 21 +++++++--
 .../server/namenode/ha/TestHASafeMode.java    | 46 +++++++++++++++++++
 3 files changed, 64 insertions(+), 5 deletions(-)

diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.HDFS-1623.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.HDFS-1623.txt
index b002f411614..b908da82b43 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.HDFS-1623.txt
+++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.HDFS-1623.txt
@@ -125,3 +125,5 @@ HDFS-2737. Automatically trigger log rolls periodically on the active NN. (todd
 HDFS-2820. Add a simple sanity check for HA config (todd)
 
 HDFS-2688. Add tests for quota tracking in an HA cluster. (todd)
+
+HDFS-2804. Should not mark blocks under-replicated when exiting safemode (todd)
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
index 3aa9e60bed3..80b05d66c7f 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
@@ -848,6 +848,17 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
     return fsRunning;
   }
 
+  private boolean isInStandbyState() {
+    if (haContext == null || haContext.getState() == null) {
+      // We're still starting up. In this case, if HA is
+      // on for the cluster, we always start in standby. Otherwise
+      // start in active.
+      return haEnabled;
+    }
+
+    return haContext.getState() instanceof StandbyState;
+  }
+
   /**
    * Dump all metadata into specified file
    */
@@ -3345,8 +3356,9 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
           return;
         }
       }
-      // if not done yet, initialize replication queues
-      if (!isPopulatingReplQueues()) {
+      // if not done yet, initialize replication queues.
+      // In the standby, do not populate repl queues
+      if (!isPopulatingReplQueues() && !isInStandbyState()) {
         initializeReplQueues();
       }
       long timeInSafemode = now() - systemStart;
@@ -3389,7 +3401,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
      * initializing replication queues.
      */
     private synchronized boolean canInitializeReplQueues() {
-      return blockSafe >= blockReplQueueThreshold;
+      return !isInStandbyState() && blockSafe >= blockReplQueueThreshold;
     }
 
     /**
@@ -3705,8 +3717,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
 
   @Override
   public boolean isPopulatingReplQueues() {
-    if (haContext != null && // null during startup!
-        !haContext.getState().shouldPopulateReplQueues()) {
+    if (isInStandbyState()) {
       return false;
     }
     // safeMode is volatile, and may be set to null at any time
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHASafeMode.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHASafeMode.java
index 0703f8c8a02..d423ce26617 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHASafeMode.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHASafeMode.java
@@ -17,6 +17,7 @@
  */
 package org.apache.hadoop.hdfs.server.namenode.ha;
 
+import static org.junit.Assert.*;
 import static org.junit.Assert.assertTrue;
 import static org.mockito.Matchers.anyInt;
 import static org.mockito.Mockito.mock;
@@ -32,16 +33,21 @@ import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hdfs.DFSConfigKeys;
 import org.apache.hadoop.hdfs.DFSTestUtil;
+import org.apache.hadoop.hdfs.HAUtil;
 import org.apache.hadoop.hdfs.MiniDFSCluster;
 import org.apache.hadoop.hdfs.MiniDFSNNTopology;
+import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction;
 import org.apache.hadoop.hdfs.server.blockmanagement.BlockManagerTestUtil;
 import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
 import org.apache.hadoop.hdfs.server.namenode.NameNode;
 import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
+import org.apache.hadoop.test.GenericTestUtils;
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
 
+import com.google.common.base.Supplier;
+
 /**
  * Tests that exercise safemode in an HA cluster.
  */
@@ -59,6 +65,8 @@ public class TestHASafeMode {
     Configuration conf = new Configuration();
     conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, BLOCK_SIZE);
     conf.setInt(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY, 1);
+    conf.setInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 1);
+
     cluster = new MiniDFSCluster.Builder(conf)
       .nnTopology(MiniDFSNNTopology.simpleHATopology())
       .numDataNodes(3)
@@ -425,6 +433,44 @@ public class TestHASafeMode {
             "total blocks 6. Safe mode will be turned off automatically"));
   }
 
+  /**
+   * Regression test for HDFS-2804: standby should not populate replication
+   * queues when exiting safe mode.
+   */
+  @Test
+  public void testNoPopulatingReplQueuesWhenExitingSafemode() throws Exception {
+    DFSTestUtil.createFile(fs, new Path("/test"), 15*BLOCK_SIZE, (short)3, 1L);
+
+    HATestUtil.waitForStandbyToCatchUp(nn0, nn1);
+
+    // get some blocks in the SBN's image
+    nn1.getRpcServer().setSafeMode(SafeModeAction.SAFEMODE_ENTER);
+    NameNodeAdapter.saveNamespace(nn1);
+    nn1.getRpcServer().setSafeMode(SafeModeAction.SAFEMODE_LEAVE);
+
+    // and some blocks in the edit logs
+    DFSTestUtil.createFile(fs, new Path("/test2"), 15*BLOCK_SIZE, (short)3, 1L);
+    nn0.getRpcServer().rollEditLog();
+
+    cluster.stopDataNode(1);
+    cluster.shutdownNameNode(1);
+
+    //Configuration sbConf = cluster.getConfiguration(1);
+    //sbConf.setInt(DFSConfigKeys.DFS_NAMENODE_SAFEMODE_EXTENSION_KEY, 1);
+    cluster.restartNameNode(1, false);
+    nn1 = cluster.getNameNode(1);
+    GenericTestUtils.waitFor(new Supplier<Boolean>() {
+      @Override
+      public Boolean get() {
+        return !nn1.isInSafeMode();
+      }
+    }, 100, 10000);
+
+    BlockManagerTestUtil.updateState(nn1.getNamesystem().getBlockManager());
+    assertEquals(0L, nn1.getNamesystem().getUnderReplicatedBlocks());
+    assertEquals(0L, nn1.getNamesystem().getPendingReplicationBlocks());
+  }
+
   /**
    * Print a big banner in the test log to make debug easier.
    */