HDFS-5845. SecondaryNameNode dies when checkpointing with cache pools.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-2@1562645 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Andrew Wang 2014-01-30 00:00:03 +00:00
parent 3e0bbcb688
commit ec9c6aaac8
6 changed files with 126 additions and 71 deletions

View File

@ -540,6 +540,9 @@ Release 2.3.0 - UNRELEASED
HDFS-5721. sharedEditsImage in Namenode#initializeSharedEdits() should be
closed before method returns (Ted Yu via todd)
HDFS-5845. SecondaryNameNode dies when checkpointing with cache pools.
(wang)
BREAKDOWN OF HDFS-2832 SUBTASKS AND RELATED JIRAS
HDFS-4985. Add storage type to the protocol and expose it in block report

View File

@ -193,6 +193,17 @@ public final class CacheManager {
} }
/**
 * Resets all tracked directives and pools and restores the directive ID
 * counter to its initial value. Called during 2NN checkpointing to reset
 * FSNamesystem state before reloading a freshly downloaded fsimage; see
 * {@link FSNamesystem#clear()}.
 */
void clear() {
directivesById.clear();
directivesByPath.clear();
cachePools.clear();
// Directive IDs are assigned sequentially starting from 1; reset the
// counter so IDs loaded from the new image line up from scratch.
nextDirectiveId = 1;
}
public void startMonitorThread() { public void startMonitorThread() {
crmLock.lock(); crmLock.lock();
try { try {

View File

@ -521,6 +521,7 @@ void clear() {
leaseManager.removeAllLeases(); leaseManager.removeAllLeases();
inodeId.setCurrentValue(INodeId.LAST_RESERVED_ID); inodeId.setCurrentValue(INodeId.LAST_RESERVED_ID);
snapshotManager.clearSnapshottableDirs(); snapshotManager.clearSnapshottableDirs();
cacheManager.clear();
} }
@VisibleForTesting @VisibleForTesting

View File

@ -1001,7 +1001,12 @@ static void doMerge(
sig.mostRecentCheckpointTxId + " even though it should have " + sig.mostRecentCheckpointTxId + " even though it should have " +
"just been downloaded"); "just been downloaded");
} }
dstImage.reloadFromImageFile(file, dstNamesystem); dstNamesystem.writeLock();
try {
dstImage.reloadFromImageFile(file, dstNamesystem);
} finally {
dstNamesystem.writeUnlock();
}
dstNamesystem.dir.imageLoadComplete(); dstNamesystem.dir.imageLoadComplete();
} }
// error simulation code for junit test // error simulation code for junit test

View File

@ -69,6 +69,7 @@
import org.apache.hadoop.hdfs.protocol.CachePoolStats; import org.apache.hadoop.hdfs.protocol.CachePoolStats;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo; import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType; import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType;
import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction;
import org.apache.hadoop.hdfs.server.blockmanagement.CacheReplicationMonitor; import org.apache.hadoop.hdfs.server.blockmanagement.CacheReplicationMonitor;
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor.CachedBlocksList.Type; import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor.CachedBlocksList.Type;
import org.apache.hadoop.hdfs.server.datanode.DataNode; import org.apache.hadoop.hdfs.server.datanode.DataNode;
@ -528,77 +529,111 @@ public void testAddRemoveDirectives() throws Exception {
@Test(timeout=60000) @Test(timeout=60000)
public void testCacheManagerRestart() throws Exception { public void testCacheManagerRestart() throws Exception {
// Create and validate a pool SecondaryNameNode secondary = null;
final String pool = "poolparty"; try {
String groupName = "partygroup"; // Start a secondary namenode
FsPermission mode = new FsPermission((short)0777); conf.set(DFSConfigKeys.DFS_NAMENODE_SECONDARY_HTTP_ADDRESS_KEY,
long limit = 747; "0.0.0.0:0");
dfs.addCachePool(new CachePoolInfo(pool) secondary = new SecondaryNameNode(conf);
.setGroupName(groupName)
.setMode(mode)
.setLimit(limit));
RemoteIterator<CachePoolEntry> pit = dfs.listCachePools();
assertTrue("No cache pools found", pit.hasNext());
CachePoolInfo info = pit.next().getInfo();
assertEquals(pool, info.getPoolName());
assertEquals(groupName, info.getGroupName());
assertEquals(mode, info.getMode());
assertEquals(limit, (long)info.getLimit());
assertFalse("Unexpected # of cache pools found", pit.hasNext());
// Create some cache entries // Create and validate a pool
int numEntries = 10; final String pool = "poolparty";
String entryPrefix = "/party-"; String groupName = "partygroup";
long prevId = -1; FsPermission mode = new FsPermission((short)0777);
final Date expiry = new Date(); long limit = 747;
for (int i=0; i<numEntries; i++) { dfs.addCachePool(new CachePoolInfo(pool)
prevId = dfs.addCacheDirective( .setGroupName(groupName)
new CacheDirectiveInfo.Builder(). .setMode(mode)
setPath(new Path(entryPrefix + i)).setPool(pool). .setLimit(limit));
setExpiration( RemoteIterator<CachePoolEntry> pit = dfs.listCachePools();
CacheDirectiveInfo.Expiration.newAbsolute(expiry.getTime())). assertTrue("No cache pools found", pit.hasNext());
build()); CachePoolInfo info = pit.next().getInfo();
} assertEquals(pool, info.getPoolName());
RemoteIterator<CacheDirectiveEntry> dit assertEquals(groupName, info.getGroupName());
= dfs.listCacheDirectives(null); assertEquals(mode, info.getMode());
for (int i=0; i<numEntries; i++) { assertEquals(limit, (long)info.getLimit());
assertTrue("Unexpected # of cache entries: " + i, dit.hasNext()); assertFalse("Unexpected # of cache pools found", pit.hasNext());
CacheDirectiveInfo cd = dit.next().getInfo();
assertEquals(i+1, cd.getId().longValue()); // Create some cache entries
assertEquals(entryPrefix + i, cd.getPath().toUri().getPath()); int numEntries = 10;
assertEquals(pool, cd.getPool()); String entryPrefix = "/party-";
} long prevId = -1;
assertFalse("Unexpected # of cache directives found", dit.hasNext()); final Date expiry = new Date();
for (int i=0; i<numEntries; i++) {
// Restart namenode prevId = dfs.addCacheDirective(
cluster.restartNameNode(); new CacheDirectiveInfo.Builder().
setPath(new Path(entryPrefix + i)).setPool(pool).
// Check that state came back up setExpiration(
pit = dfs.listCachePools(); CacheDirectiveInfo.Expiration.newAbsolute(expiry.getTime())).
assertTrue("No cache pools found", pit.hasNext()); build());
info = pit.next().getInfo(); }
assertEquals(pool, info.getPoolName()); RemoteIterator<CacheDirectiveEntry> dit
assertEquals(pool, info.getPoolName()); = dfs.listCacheDirectives(null);
assertEquals(groupName, info.getGroupName()); for (int i=0; i<numEntries; i++) {
assertEquals(mode, info.getMode()); assertTrue("Unexpected # of cache entries: " + i, dit.hasNext());
assertEquals(limit, (long)info.getLimit()); CacheDirectiveInfo cd = dit.next().getInfo();
assertFalse("Unexpected # of cache pools found", pit.hasNext()); assertEquals(i+1, cd.getId().longValue());
assertEquals(entryPrefix + i, cd.getPath().toUri().getPath());
dit = dfs.listCacheDirectives(null); assertEquals(pool, cd.getPool());
for (int i=0; i<numEntries; i++) { }
assertTrue("Unexpected # of cache entries: " + i, dit.hasNext()); assertFalse("Unexpected # of cache directives found", dit.hasNext());
CacheDirectiveInfo cd = dit.next().getInfo();
assertEquals(i+1, cd.getId().longValue()); // Checkpoint once to set some cache pools and directives on 2NN side
assertEquals(entryPrefix + i, cd.getPath().toUri().getPath()); secondary.doCheckpoint();
assertEquals(pool, cd.getPool());
assertEquals(expiry.getTime(), cd.getExpiration().getMillis()); // Add some more CacheManager state
} final String imagePool = "imagePool";
assertFalse("Unexpected # of cache directives found", dit.hasNext()); dfs.addCachePool(new CachePoolInfo(imagePool));
prevId = dfs.addCacheDirective(new CacheDirectiveInfo.Builder()
.setPath(new Path("/image")).setPool(imagePool).build());
long nextId = dfs.addCacheDirective( // Save a new image to force a fresh fsimage download
new CacheDirectiveInfo.Builder(). dfs.setSafeMode(SafeModeAction.SAFEMODE_ENTER);
setPath(new Path("/foobar")).setPool(pool).build()); dfs.saveNamespace();
assertEquals(prevId + 1, nextId); dfs.setSafeMode(SafeModeAction.SAFEMODE_LEAVE);
// Checkpoint again forcing a reload of FSN state
boolean fetchImage = secondary.doCheckpoint();
assertTrue("Secondary should have fetched a new fsimage from NameNode",
fetchImage);
// Remove temp pool and directive
dfs.removeCachePool(imagePool);
// Restart namenode
cluster.restartNameNode();
// Check that state came back up
pit = dfs.listCachePools();
assertTrue("No cache pools found", pit.hasNext());
info = pit.next().getInfo();
assertEquals(pool, info.getPoolName());
assertEquals(pool, info.getPoolName());
assertEquals(groupName, info.getGroupName());
assertEquals(mode, info.getMode());
assertEquals(limit, (long)info.getLimit());
assertFalse("Unexpected # of cache pools found", pit.hasNext());
dit = dfs.listCacheDirectives(null);
for (int i=0; i<numEntries; i++) {
assertTrue("Unexpected # of cache entries: " + i, dit.hasNext());
CacheDirectiveInfo cd = dit.next().getInfo();
assertEquals(i+1, cd.getId().longValue());
assertEquals(entryPrefix + i, cd.getPath().toUri().getPath());
assertEquals(pool, cd.getPool());
assertEquals(expiry.getTime(), cd.getExpiration().getMillis());
}
assertFalse("Unexpected # of cache directives found", dit.hasNext());
long nextId = dfs.addCacheDirective(
new CacheDirectiveInfo.Builder().
setPath(new Path("/foobar")).setPool(pool).build());
assertEquals(prevId + 1, nextId);
} finally {
if (secondary != null) {
secondary.shutdown();
}
}
} }
/** /**

View File

@ -1637,7 +1637,7 @@ public void testEditFailureOnFirstCheckpoint() throws IOException {
* Test that the secondary namenode correctly deletes temporary edits * Test that the secondary namenode correctly deletes temporary edits
* on startup. * on startup.
*/ */
@Test(timeout = 30000) @Test(timeout = 60000)
public void testDeleteTemporaryEditsOnStartup() throws IOException { public void testDeleteTemporaryEditsOnStartup() throws IOException {
Configuration conf = new HdfsConfiguration(); Configuration conf = new HdfsConfiguration();
SecondaryNameNode secondary = null; SecondaryNameNode secondary = null;