From ec9c6aaac8ebbc765010ee2d80153ec02af3763e Mon Sep 17 00:00:00 2001 From: Andrew Wang Date: Thu, 30 Jan 2014 00:00:03 +0000 Subject: [PATCH] HDFS-5845. SecondaryNameNode dies when checkpointing with cache pools. git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-2@1562645 13f79535-47bb-0310-9956-ffa450edef68 --- hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt | 3 + .../hdfs/server/namenode/CacheManager.java | 11 ++ .../hdfs/server/namenode/FSNamesystem.java | 1 + .../server/namenode/SecondaryNameNode.java | 7 +- .../server/namenode/TestCacheDirectives.java | 173 +++++++++++------- .../hdfs/server/namenode/TestCheckpoint.java | 2 +- 6 files changed, 126 insertions(+), 71 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt index ecc58863b02..4316fde5aca 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt @@ -540,6 +540,9 @@ Release 2.3.0 - UNRELEASED HDFS-5721. sharedEditsImage in Namenode#initializeSharedEdits() should be closed before method returns (Ted Yu via todd) + HDFS-5845. SecondaryNameNode dies when checkpointing with cache pools. + (wang) + BREAKDOWN OF HDFS-2832 SUBTASKS AND RELATED JIRAS HDFS-4985. Add storage type to the protocol and expose it in block report diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/CacheManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/CacheManager.java index b3ff8dfef59..56b62cc50b7 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/CacheManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/CacheManager.java @@ -193,6 +193,17 @@ public final class CacheManager { } + /** + * Resets all tracked directives and pools. Called during 2NN checkpointing to + * reset FSNamesystem state. 
See {@link FSNamesystem#clear()}. */ + void clear() { + directivesById.clear(); + directivesByPath.clear(); + cachePools.clear(); + nextDirectiveId = 1; + } + public void startMonitorThread() { crmLock.lock(); try { diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java index 40e926024d9..d94cb10e9c2 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java @@ -521,6 +521,7 @@ void clear() { leaseManager.removeAllLeases(); inodeId.setCurrentValue(INodeId.LAST_RESERVED_ID); snapshotManager.clearSnapshottableDirs(); + cacheManager.clear(); } @VisibleForTesting diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/SecondaryNameNode.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/SecondaryNameNode.java index f7592a8975b..9e83f14eba5 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/SecondaryNameNode.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/SecondaryNameNode.java @@ -1001,7 +1001,12 @@ static void doMerge( sig.mostRecentCheckpointTxId + " even though it should have " + "just been downloaded"); } - dstImage.reloadFromImageFile(file, dstNamesystem); + dstNamesystem.writeLock(); + try { + dstImage.reloadFromImageFile(file, dstNamesystem); + } finally { + dstNamesystem.writeUnlock(); + } dstNamesystem.dir.imageLoadComplete(); } // error simulation code for junit test diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCacheDirectives.java 
b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCacheDirectives.java index d47c275771f..482408bb056 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCacheDirectives.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCacheDirectives.java @@ -69,6 +69,7 @@ import org.apache.hadoop.hdfs.protocol.CachePoolStats; import org.apache.hadoop.hdfs.protocol.DatanodeInfo; import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType; +import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction; import org.apache.hadoop.hdfs.server.blockmanagement.CacheReplicationMonitor; import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor.CachedBlocksList.Type; import org.apache.hadoop.hdfs.server.datanode.DataNode; @@ -528,77 +529,111 @@ public void testAddRemoveDirectives() throws Exception { @Test(timeout=60000) public void testCacheManagerRestart() throws Exception { - // Create and validate a pool - final String pool = "poolparty"; - String groupName = "partygroup"; - FsPermission mode = new FsPermission((short)0777); - long limit = 747; - dfs.addCachePool(new CachePoolInfo(pool) - .setGroupName(groupName) - .setMode(mode) - .setLimit(limit)); - RemoteIterator pit = dfs.listCachePools(); - assertTrue("No cache pools found", pit.hasNext()); - CachePoolInfo info = pit.next().getInfo(); - assertEquals(pool, info.getPoolName()); - assertEquals(groupName, info.getGroupName()); - assertEquals(mode, info.getMode()); - assertEquals(limit, (long)info.getLimit()); - assertFalse("Unexpected # of cache pools found", pit.hasNext()); + SecondaryNameNode secondary = null; + try { + // Start a secondary namenode + conf.set(DFSConfigKeys.DFS_NAMENODE_SECONDARY_HTTP_ADDRESS_KEY, + "0.0.0.0:0"); + secondary = new SecondaryNameNode(conf); - // Create some cache entries - int numEntries = 10; - String entryPrefix = "/party-"; - 
long prevId = -1; - final Date expiry = new Date(); - for (int i=0; i dit - = dfs.listCacheDirectives(null); - for (int i=0; i pit = dfs.listCachePools(); + assertTrue("No cache pools found", pit.hasNext()); + CachePoolInfo info = pit.next().getInfo(); + assertEquals(pool, info.getPoolName()); + assertEquals(groupName, info.getGroupName()); + assertEquals(mode, info.getMode()); + assertEquals(limit, (long)info.getLimit()); + assertFalse("Unexpected # of cache pools found", pit.hasNext()); + + // Create some cache entries + int numEntries = 10; + String entryPrefix = "/party-"; + long prevId = -1; + final Date expiry = new Date(); + for (int i=0; i dit + = dfs.listCacheDirectives(null); + for (int i=0; i