HDFS-4596. Shutting down namenode during checkpointing can lead to md5sum error. Contributed by Andrew Wang.
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1456630 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
bcabbcdf4c
commit
807e08334e
|
@ -417,6 +417,9 @@ Release 2.0.5-beta - UNRELEASED
|
||||||
HDFS-3277. fail over to loading a different FSImage if the first one we
|
HDFS-3277. fail over to loading a different FSImage if the first one we
|
||||||
try to load is corrupt. (Colin Patrick McCabe and Andrew Wang via atm)
|
try to load is corrupt. (Colin Patrick McCabe and Andrew Wang via atm)
|
||||||
|
|
||||||
|
HDFS-4596. Shutting down namenode during checkpointing can lead to md5sum
|
||||||
|
error. (Andrew Wang via atm)
|
||||||
|
|
||||||
Release 2.0.4-alpha - UNRELEASED
|
Release 2.0.4-alpha - UNRELEASED
|
||||||
|
|
||||||
INCOMPATIBLE CHANGES
|
INCOMPATIBLE CHANGES
|
||||||
|
|
|
@ -44,4 +44,5 @@ class CheckpointFaultInjector {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void afterMD5Rename() throws IOException {}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1103,7 +1103,7 @@ public class FSImage implements Closeable {
|
||||||
*/
|
*/
|
||||||
public synchronized void saveDigestAndRenameCheckpointImage(
|
public synchronized void saveDigestAndRenameCheckpointImage(
|
||||||
long txid, MD5Hash digest) throws IOException {
|
long txid, MD5Hash digest) throws IOException {
|
||||||
renameCheckpoint(txid);
|
// Write and rename the MD5 file first, so it is in place before the image is renamed
|
||||||
List<StorageDirectory> badSds = Lists.newArrayList();
|
List<StorageDirectory> badSds = Lists.newArrayList();
|
||||||
|
|
||||||
for (StorageDirectory sd : storage.dirIterable(NameNodeDirType.IMAGE)) {
|
for (StorageDirectory sd : storage.dirIterable(NameNodeDirType.IMAGE)) {
|
||||||
|
@ -1116,6 +1116,10 @@ public class FSImage implements Closeable {
|
||||||
}
|
}
|
||||||
storage.reportErrorsOnDirectories(badSds);
|
storage.reportErrorsOnDirectories(badSds);
|
||||||
|
|
||||||
|
CheckpointFaultInjector.getInstance().afterMD5Rename();
|
||||||
|
|
||||||
|
// Rename image from tmp file
|
||||||
|
renameCheckpoint(txid);
|
||||||
// So long as this is the newest image available,
|
// So long as this is the newest image available,
|
||||||
// advertise it as such to other checkpointers
|
// advertise it as such to other checkpointers
|
||||||
// from now on
|
// from now on
|
||||||
|
|
|
@ -231,7 +231,7 @@ public class TestCheckpoint {
|
||||||
/*
|
/*
|
||||||
* Simulate exception during edit replay.
|
* Simulate exception during edit replay.
|
||||||
*/
|
*/
|
||||||
@Test(timeout=5000)
|
@Test(timeout=30000)
|
||||||
public void testReloadOnEditReplayFailure () throws IOException {
|
public void testReloadOnEditReplayFailure () throws IOException {
|
||||||
Configuration conf = new HdfsConfiguration();
|
Configuration conf = new HdfsConfiguration();
|
||||||
FSDataOutputStream fos = null;
|
FSDataOutputStream fos = null;
|
||||||
|
@ -1411,6 +1411,60 @@ public class TestCheckpoint {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test NN restart if a failure happens in between creating the fsimage
|
||||||
|
* MD5 file and renaming the fsimage.
|
||||||
|
*/
|
||||||
|
@Test(timeout=30000)
|
||||||
|
public void testFailureBeforeRename () throws IOException {
|
||||||
|
Configuration conf = new HdfsConfiguration();
|
||||||
|
FSDataOutputStream fos = null;
|
||||||
|
SecondaryNameNode secondary = null;
|
||||||
|
MiniDFSCluster cluster = null;
|
||||||
|
FileSystem fs = null;
|
||||||
|
NameNode namenode = null;
|
||||||
|
|
||||||
|
try {
|
||||||
|
cluster = new MiniDFSCluster.Builder(conf).numDataNodes(numDatanodes)
|
||||||
|
.build();
|
||||||
|
cluster.waitActive();
|
||||||
|
namenode = cluster.getNameNode();
|
||||||
|
fs = cluster.getFileSystem();
|
||||||
|
secondary = startSecondaryNameNode(conf);
|
||||||
|
fos = fs.create(new Path("tmpfile0"));
|
||||||
|
fos.write(new byte[] { 0, 1, 2, 3 });
|
||||||
|
secondary.doCheckpoint();
|
||||||
|
fos.write(new byte[] { 0, 1, 2, 3 });
|
||||||
|
fos.hsync();
|
||||||
|
|
||||||
|
// Cause the next checkpoint to fail right after the MD5 file has been renamed.
|
||||||
|
Mockito.doThrow(new IOException(
|
||||||
|
"Injecting failure after MD5Rename"))
|
||||||
|
.when(faultInjector).afterMD5Rename();
|
||||||
|
|
||||||
|
try {
|
||||||
|
secondary.doCheckpoint();
|
||||||
|
fail("Fault injection failed.");
|
||||||
|
} catch (IOException ioe) {
|
||||||
|
// This is expected.
|
||||||
|
}
|
||||||
|
Mockito.reset(faultInjector);
|
||||||
|
// Namenode should still restart successfully
|
||||||
|
cluster.restartNameNode();
|
||||||
|
} finally {
|
||||||
|
if (secondary != null) {
|
||||||
|
secondary.shutdown();
|
||||||
|
}
|
||||||
|
if (fs != null) {
|
||||||
|
fs.close();
|
||||||
|
}
|
||||||
|
if (cluster != null) {
|
||||||
|
cluster.shutdown();
|
||||||
|
}
|
||||||
|
Mockito.reset(faultInjector);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test case where two secondary namenodes are checkpointing the same
|
* Test case where two secondary namenodes are checkpointing the same
|
||||||
* NameNode. This differs from {@link #testMultipleSecondaryNamenodes()}
|
* NameNode. This differs from {@link #testMultipleSecondaryNamenodes()}
|
||||||
|
|
Loading…
Reference in New Issue