HDFS-4596. Shutting down namenode during checkpointing can lead to md5sum error. Contributed by Andrew Wang.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1456630 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Aaron Myers 2013-03-14 20:01:03 +00:00
parent bcabbcdf4c
commit 807e08334e
4 changed files with 64 additions and 2 deletions

View File

@ -417,6 +417,9 @@ Release 2.0.5-beta - UNRELEASED
HDFS-3277. fail over to loading a different FSImage if the first one we
try to load is corrupt. (Colin Patrick McCabe and Andrew Wang via atm)
HDFS-4596. Shutting down namenode during checkpointing can lead to md5sum
error. (Andrew Wang via atm)
Release 2.0.4-alpha - UNRELEASED
INCOMPATIBLE CHANGES

View File

@ -44,4 +44,5 @@ class CheckpointFaultInjector {
return false; return false;
} }
/**
 * Fault-injection hook; the default implementation does nothing.
 * FSImage.saveDigestAndRenameCheckpointImage invokes it after the MD5
 * files have been written and before the checkpoint image is renamed
 * from its temporary name, so a test can stub it to throw and simulate
 * a failure between those two steps.
 *
 * @throws IOException never from this default implementation; declared
 *         so that a stubbed or overriding version can inject a failure
 */
public void afterMD5Rename() throws IOException {}
} }

View File

@ -1103,7 +1103,7 @@ public class FSImage implements Closeable {
*/ */
public synchronized void saveDigestAndRenameCheckpointImage( public synchronized void saveDigestAndRenameCheckpointImage(
long txid, MD5Hash digest) throws IOException { long txid, MD5Hash digest) throws IOException {
renameCheckpoint(txid); // Write and rename MD5 file
List<StorageDirectory> badSds = Lists.newArrayList(); List<StorageDirectory> badSds = Lists.newArrayList();
for (StorageDirectory sd : storage.dirIterable(NameNodeDirType.IMAGE)) { for (StorageDirectory sd : storage.dirIterable(NameNodeDirType.IMAGE)) {
@ -1116,6 +1116,10 @@ public class FSImage implements Closeable {
} }
storage.reportErrorsOnDirectories(badSds); storage.reportErrorsOnDirectories(badSds);
CheckpointFaultInjector.getInstance().afterMD5Rename();
// Rename image from tmp file
renameCheckpoint(txid);
// So long as this is the newest image available, // So long as this is the newest image available,
// advertise it as such to other checkpointers // advertise it as such to other checkpointers
// from now on // from now on

View File

@ -231,7 +231,7 @@ public class TestCheckpoint {
/* /*
* Simulate exception during edit replay. * Simulate exception during edit replay.
*/ */
@Test(timeout=5000) @Test(timeout=30000)
public void testReloadOnEditReplayFailure () throws IOException { public void testReloadOnEditReplayFailure () throws IOException {
Configuration conf = new HdfsConfiguration(); Configuration conf = new HdfsConfiguration();
FSDataOutputStream fos = null; FSDataOutputStream fos = null;
@ -1411,6 +1411,60 @@ public class TestCheckpoint {
} }
} }
/**
 * Test NN restart if a failure happens in between creating the fsimage
 * MD5 file and renaming the fsimage.
 *
 * <p>The test performs one successful checkpoint, then stubs
 * {@code faultInjector.afterMD5Rename()} to throw so that the second
 * checkpoint fails part-way through, and finally verifies that the
 * NameNode can still be restarted.
 *
 * @throws IOException if cluster setup, file I/O, or the NameNode restart
 *         fails
 */
@Test(timeout=30000)
public void testFailureBeforeRename () throws IOException {
  Configuration conf = new HdfsConfiguration();
  FSDataOutputStream fos = null;
  SecondaryNameNode secondary = null;
  MiniDFSCluster cluster = null;
  FileSystem fs = null;

  try {
    cluster = new MiniDFSCluster.Builder(conf).numDataNodes(numDatanodes)
        .build();
    cluster.waitActive();
    fs = cluster.getFileSystem();
    secondary = startSecondaryNameNode(conf);
    fos = fs.create(new Path("tmpfile0"));
    fos.write(new byte[] { 0, 1, 2, 3 });
    secondary.doCheckpoint();
    fos.write(new byte[] { 0, 1, 2, 3 });
    fos.hsync();

    // Cause merge to fail in next checkpoint.
    Mockito.doThrow(new IOException(
        "Injecting failure after MD5Rename"))
        .when(faultInjector).afterMD5Rename();

    try {
      secondary.doCheckpoint();
      fail("Fault injection failed.");
    } catch (IOException ioe) {
      // This is expected.
    }
    // Remove the throwing stub before the restart below.
    Mockito.reset(faultInjector);
    // Namenode should still restart successfully
    cluster.restartNameNode();
  } finally {
    // Each cleanup step sits in its own try/finally so that a failure in
    // one step (fs.close() can throw IOException) does not skip the later
    // ones. Otherwise the mini cluster could be left running and the
    // throwing stub left installed on the shared fault injector.
    try {
      if (secondary != null) {
        secondary.shutdown();
      }
    } finally {
      try {
        if (fs != null) {
          fs.close();
        }
      } finally {
        try {
          if (cluster != null) {
            cluster.shutdown();
          }
        } finally {
          Mockito.reset(faultInjector);
        }
      }
    }
  }
}
/** /**
* Test case where two secondary namenodes are checkpointing the same * Test case where two secondary namenodes are checkpointing the same
* NameNode. This differs from {@link #testMultipleSecondaryNamenodes()} * NameNode. This differs from {@link #testMultipleSecondaryNamenodes()}