HDFS-5845. SecondaryNameNode dies when checkpointing with cache pools.
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-2@1562645 13f79535-47bb-0310-9956-ffa450edef68
parent 3e0bbcb688
commit ec9c6aaac8
@@ -540,6 +540,9 @@ Release 2.3.0 - UNRELEASED
     HDFS-5721. sharedEditsImage in Namenode#initializeSharedEdits() should be
     closed before method returns (Ted Yu via todd)
 
+    HDFS-5845. SecondaryNameNode dies when checkpointing with cache pools.
+    (wang)
+
 BREAKDOWN OF HDFS-2832 SUBTASKS AND RELATED JIRAS
 
     HDFS-4985. Add storage type to the protocol and expose it in block report
@@ -193,6 +193,17 @@ public final class CacheManager {
   }
 
+  /**
+   * Resets all tracked directives and pools. Called during 2NN checkpointing
+   * to reset FSNamesystem state. See {@link FSNamesystem#clear()}.
+   */
+  void clear() {
+    directivesById.clear();
+    directivesByPath.clear();
+    cachePools.clear();
+    nextDirectiveId = 1;
+  }
+
   public void startMonitorThread() {
     crmLock.lock();
     try {
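Why the new clear() is needed: the SecondaryNameNode keeps one long-lived FSNamesystem and reloads every downloaded fsimage into it. With no reset hook on CacheManager, pools and directives left over from the previous checkpoint collided with the ones deserialized from the next image, and the resulting failure killed the 2NN. A minimal, self-contained sketch of that failure mode, using hypothetical stand-in classes rather than the real Hadoop types:

    import java.io.IOException;
    import java.util.HashMap;
    import java.util.Map;

    // Hypothetical stand-in for a manager whose state is reloaded from a
    // checkpoint image; CacheManager behaves analogously for cache pools.
    class PoolRegistry {
      private final Map<String, String> pools = new HashMap<String, String>();

      // Loading a serialized pool must not collide with an existing one.
      void addPool(String name) throws IOException {
        if (pools.containsKey(name)) {
          throw new IOException("pool already exists: " + name);
        }
        pools.put(name, name);
      }

      // Equivalent of the clear() added by this patch.
      void clear() {
        pools.clear();
      }

      void loadFrom(String[] imagePools) throws IOException {
        for (String p : imagePools) {
          addPool(p);
        }
      }
    }

    public class StaleStateDemo {
      public static void main(String[] args) throws IOException {
        PoolRegistry registry = new PoolRegistry();
        String[] image = { "poolparty" };
        registry.loadFrom(image);  // first checkpoint: loads into empty state
        registry.clear();          // without this, the reload below throws
        registry.loadFrom(image);  // second checkpoint reloads the same pools
        System.out.println("reload succeeded");
      }
    }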
@@ -521,6 +521,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
     leaseManager.removeAllLeases();
     inodeId.setCurrentValue(INodeId.LAST_RESERVED_ID);
     snapshotManager.clearSnapshottableDirs();
+    cacheManager.clear();
   }
 
   @VisibleForTesting
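FSNamesystem.clear() is effectively a reset checklist that the 2NN runs before reloading an image; this hunk adds the entry that was missing for cacheManager. As a design aside (a sketch of one alternative, not how Hadoop actually wires it), registering each subsystem behind a common reset interface would make this class of omission impossible:

    import java.util.ArrayList;
    import java.util.List;

    // Hypothetical sketch: subsystems self-register, so clear() cannot
    // silently skip a newly added manager.
    interface Resettable {
      void reset();
    }

    class Namesystem {
      private final List<Resettable> subsystems = new ArrayList<Resettable>();

      // Each manager registers itself once at construction time.
      void register(Resettable r) {
        subsystems.add(r);
      }

      // Resets every registered subsystem before an image reload.
      void clear() {
        for (Resettable r : subsystems) {
          r.reset();
        }
      }
    }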
@@ -1001,7 +1001,12 @@ public class SecondaryNameNode implements Runnable {
           sig.mostRecentCheckpointTxId + " even though it should have " +
           "just been downloaded");
     }
-    dstImage.reloadFromImageFile(file, dstNamesystem);
+    dstNamesystem.writeLock();
+    try {
+      dstImage.reloadFromImageFile(file, dstNamesystem);
+    } finally {
+      dstNamesystem.writeUnlock();
+    }
     dstNamesystem.dir.imageLoadComplete();
   }
   // error simulation code for junit test
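The reload is now performed under the namesystem write lock, so nothing can observe the FSNamesystem while its state is being swapped out, and the finally block guarantees the lock is released even if reloadFromImageFile throws. FSNamesystem's writeLock()/writeUnlock() helpers follow the standard read-write lock idiom, shown here in a self-contained form:

    import java.util.concurrent.locks.ReentrantReadWriteLock;

    public class GuardedReload {
      private final ReentrantReadWriteLock lock = new ReentrantReadWriteLock();
      private String state = "initial";

      // Mutations take the write lock; unlock lives in finally so an
      // exception mid-reload cannot leave the lock held forever.
      public void reload(String newState) {
        lock.writeLock().lock();
        try {
          state = newState;  // stand-in for dstImage.reloadFromImageFile(...)
        } finally {
          lock.writeLock().unlock();
        }
      }

      // Readers take the read lock, so they never see a half-applied reload.
      public String read() {
        lock.readLock().lock();
        try {
          return state;
        } finally {
          lock.readLock().unlock();
        }
      }
    }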
@@ -69,6 +69,7 @@ import org.apache.hadoop.hdfs.protocol.CachePoolInfo;
 import org.apache.hadoop.hdfs.protocol.CachePoolStats;
 import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
 import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType;
+import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction;
 import org.apache.hadoop.hdfs.server.blockmanagement.CacheReplicationMonitor;
 import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor.CachedBlocksList.Type;
 import org.apache.hadoop.hdfs.server.datanode.DataNode;
@@ -528,77 +529,111 @@ public class TestCacheDirectives {
 
   @Test(timeout=60000)
   public void testCacheManagerRestart() throws Exception {
-    // Create and validate a pool
-    final String pool = "poolparty";
-    String groupName = "partygroup";
-    FsPermission mode = new FsPermission((short)0777);
-    long limit = 747;
-    dfs.addCachePool(new CachePoolInfo(pool)
-        .setGroupName(groupName)
-        .setMode(mode)
-        .setLimit(limit));
-    RemoteIterator<CachePoolEntry> pit = dfs.listCachePools();
-    assertTrue("No cache pools found", pit.hasNext());
-    CachePoolInfo info = pit.next().getInfo();
-    assertEquals(pool, info.getPoolName());
-    assertEquals(groupName, info.getGroupName());
-    assertEquals(mode, info.getMode());
-    assertEquals(limit, (long)info.getLimit());
-    assertFalse("Unexpected # of cache pools found", pit.hasNext());
-
-    // Create some cache entries
-    int numEntries = 10;
-    String entryPrefix = "/party-";
-    long prevId = -1;
-    final Date expiry = new Date();
-    for (int i=0; i<numEntries; i++) {
-      prevId = dfs.addCacheDirective(
-          new CacheDirectiveInfo.Builder().
-            setPath(new Path(entryPrefix + i)).setPool(pool).
-            setExpiration(
-                CacheDirectiveInfo.Expiration.newAbsolute(expiry.getTime())).
-            build());
-    }
-    RemoteIterator<CacheDirectiveEntry> dit
-        = dfs.listCacheDirectives(null);
-    for (int i=0; i<numEntries; i++) {
-      assertTrue("Unexpected # of cache entries: " + i, dit.hasNext());
-      CacheDirectiveInfo cd = dit.next().getInfo();
-      assertEquals(i+1, cd.getId().longValue());
-      assertEquals(entryPrefix + i, cd.getPath().toUri().getPath());
-      assertEquals(pool, cd.getPool());
-    }
-    assertFalse("Unexpected # of cache directives found", dit.hasNext());
-
-    // Restart namenode
-    cluster.restartNameNode();
-
-    // Check that state came back up
-    pit = dfs.listCachePools();
-    assertTrue("No cache pools found", pit.hasNext());
-    info = pit.next().getInfo();
-    assertEquals(pool, info.getPoolName());
-    assertEquals(pool, info.getPoolName());
-    assertEquals(groupName, info.getGroupName());
-    assertEquals(mode, info.getMode());
-    assertEquals(limit, (long)info.getLimit());
-    assertFalse("Unexpected # of cache pools found", pit.hasNext());
-
-    dit = dfs.listCacheDirectives(null);
-    for (int i=0; i<numEntries; i++) {
-      assertTrue("Unexpected # of cache entries: " + i, dit.hasNext());
-      CacheDirectiveInfo cd = dit.next().getInfo();
-      assertEquals(i+1, cd.getId().longValue());
-      assertEquals(entryPrefix + i, cd.getPath().toUri().getPath());
-      assertEquals(pool, cd.getPool());
-      assertEquals(expiry.getTime(), cd.getExpiration().getMillis());
-    }
-    assertFalse("Unexpected # of cache directives found", dit.hasNext());
-
-    long nextId = dfs.addCacheDirective(
-        new CacheDirectiveInfo.Builder().
-          setPath(new Path("/foobar")).setPool(pool).build());
-    assertEquals(prevId + 1, nextId);
+    SecondaryNameNode secondary = null;
+    try {
+      // Start a secondary namenode
+      conf.set(DFSConfigKeys.DFS_NAMENODE_SECONDARY_HTTP_ADDRESS_KEY,
+          "0.0.0.0:0");
+      secondary = new SecondaryNameNode(conf);
+
+      // Create and validate a pool
+      final String pool = "poolparty";
+      String groupName = "partygroup";
+      FsPermission mode = new FsPermission((short)0777);
+      long limit = 747;
+      dfs.addCachePool(new CachePoolInfo(pool)
+          .setGroupName(groupName)
+          .setMode(mode)
+          .setLimit(limit));
+      RemoteIterator<CachePoolEntry> pit = dfs.listCachePools();
+      assertTrue("No cache pools found", pit.hasNext());
+      CachePoolInfo info = pit.next().getInfo();
+      assertEquals(pool, info.getPoolName());
+      assertEquals(groupName, info.getGroupName());
+      assertEquals(mode, info.getMode());
+      assertEquals(limit, (long)info.getLimit());
+      assertFalse("Unexpected # of cache pools found", pit.hasNext());
+
+      // Create some cache entries
+      int numEntries = 10;
+      String entryPrefix = "/party-";
+      long prevId = -1;
+      final Date expiry = new Date();
+      for (int i=0; i<numEntries; i++) {
+        prevId = dfs.addCacheDirective(
+            new CacheDirectiveInfo.Builder().
+              setPath(new Path(entryPrefix + i)).setPool(pool).
+              setExpiration(
+                  CacheDirectiveInfo.Expiration.newAbsolute(expiry.getTime())).
+              build());
+      }
+      RemoteIterator<CacheDirectiveEntry> dit
+          = dfs.listCacheDirectives(null);
+      for (int i=0; i<numEntries; i++) {
+        assertTrue("Unexpected # of cache entries: " + i, dit.hasNext());
+        CacheDirectiveInfo cd = dit.next().getInfo();
+        assertEquals(i+1, cd.getId().longValue());
+        assertEquals(entryPrefix + i, cd.getPath().toUri().getPath());
+        assertEquals(pool, cd.getPool());
+      }
+      assertFalse("Unexpected # of cache directives found", dit.hasNext());
+
+      // Checkpoint once to set some cache pools and directives on 2NN side
+      secondary.doCheckpoint();
+
+      // Add some more CacheManager state
+      final String imagePool = "imagePool";
+      dfs.addCachePool(new CachePoolInfo(imagePool));
+      prevId = dfs.addCacheDirective(new CacheDirectiveInfo.Builder()
+          .setPath(new Path("/image")).setPool(imagePool).build());
+
+      // Save a new image to force a fresh fsimage download
+      dfs.setSafeMode(SafeModeAction.SAFEMODE_ENTER);
+      dfs.saveNamespace();
+      dfs.setSafeMode(SafeModeAction.SAFEMODE_LEAVE);
+
+      // Checkpoint again forcing a reload of FSN state
+      boolean fetchImage = secondary.doCheckpoint();
+      assertTrue("Secondary should have fetched a new fsimage from NameNode",
+          fetchImage);
+
+      // Remove temp pool and directive
+      dfs.removeCachePool(imagePool);
+
+      // Restart namenode
+      cluster.restartNameNode();
+
+      // Check that state came back up
+      pit = dfs.listCachePools();
+      assertTrue("No cache pools found", pit.hasNext());
+      info = pit.next().getInfo();
+      assertEquals(pool, info.getPoolName());
+      assertEquals(pool, info.getPoolName());
+      assertEquals(groupName, info.getGroupName());
+      assertEquals(mode, info.getMode());
+      assertEquals(limit, (long)info.getLimit());
+      assertFalse("Unexpected # of cache pools found", pit.hasNext());
+
+      dit = dfs.listCacheDirectives(null);
+      for (int i=0; i<numEntries; i++) {
+        assertTrue("Unexpected # of cache entries: " + i, dit.hasNext());
+        CacheDirectiveInfo cd = dit.next().getInfo();
+        assertEquals(i+1, cd.getId().longValue());
+        assertEquals(entryPrefix + i, cd.getPath().toUri().getPath());
+        assertEquals(pool, cd.getPool());
+        assertEquals(expiry.getTime(), cd.getExpiration().getMillis());
+      }
+      assertFalse("Unexpected # of cache directives found", dit.hasNext());
+
+      long nextId = dfs.addCacheDirective(
+          new CacheDirectiveInfo.Builder().
+            setPath(new Path("/foobar")).setPool(pool).build());
+      assertEquals(prevId + 1, nextId);
+    } finally {
+      if (secondary != null) {
+        secondary.shutdown();
+      }
+    }
   }
 
 /**
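How the regression test exercises the bug: the first doCheckpoint() populates the 2NN's CacheManager with pools and directives; saveNamespace() (entered and left via safe mode) then rolls a fresh fsimage on the NameNode, so the second doCheckpoint() must download a full image and reload it into the already-populated FSNamesystem, which is exactly the path that used to kill the 2NN. The assertion on fetchImage confirms that a real image download, not just an edits-only checkpoint, took place. Assuming a standard Maven checkout of hadoop-hdfs, the test can be run on its own with something like:

    mvn test -Dtest=TestCacheDirectives#testCacheManagerRestart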
@@ -1637,7 +1637,7 @@ public class TestCheckpoint {
    * Test that the secondary namenode correctly deletes temporary edits
    * on startup.
    */
-  @Test(timeout = 30000)
+  @Test(timeout = 60000)
   public void testDeleteTemporaryEditsOnStartup() throws IOException {
     Configuration conf = new HdfsConfiguration();
     SecondaryNameNode secondary = null;