HDFS-2753. Fix standby getting stuck in safemode when blocks are written while SBN is down. Contributed by Hari Mankude and Todd Lipcon.
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/HDFS-1623@1229898 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
44d070cd01
commit
190dc1c91b
|
@ -93,3 +93,5 @@ HDFS-2730. Refactor shared HA-related test code into HATestUtil class (todd)
|
||||||
HDFS-2762. Fix TestCheckpoint timing out on HA branch. (Uma Maheswara Rao G via todd)
|
HDFS-2762. Fix TestCheckpoint timing out on HA branch. (Uma Maheswara Rao G via todd)
|
||||||
|
|
||||||
HDFS-2724. NN web UI can throw NPE after startup, before standby state is entered. (todd)
|
HDFS-2724. NN web UI can throw NPE after startup, before standby state is entered. (todd)
|
||||||
|
|
||||||
|
HDFS-2753. Fix standby getting stuck in safemode when blocks are written while SBN is down. (Hari Mankude and todd via todd)
|
||||||
|
|
|
@ -1361,7 +1361,7 @@ public class BlockManager {
|
||||||
|
|
||||||
// To minimize startup time, we discard any second (or later) block reports
|
// To minimize startup time, we discard any second (or later) block reports
|
||||||
// that we receive while still in startup phase.
|
// that we receive while still in startup phase.
|
||||||
if (namesystem.isInStartupSafeMode() && node.numBlocks() > 0) {
|
if (namesystem.isInStartupSafeMode() && !node.isFirstBlockReport()) {
|
||||||
NameNode.stateChangeLog.info("BLOCK* processReport: "
|
NameNode.stateChangeLog.info("BLOCK* processReport: "
|
||||||
+ "discarded non-initial block report from " + nodeID.getName()
|
+ "discarded non-initial block report from " + nodeID.getName()
|
||||||
+ " because namenode still in startup phase");
|
+ " because namenode still in startup phase");
|
||||||
|
|
|
@ -151,6 +151,10 @@ public class DatanodeDescriptor extends DatanodeInfo {
|
||||||
private long lastBlocksScheduledRollTime = 0;
|
private long lastBlocksScheduledRollTime = 0;
|
||||||
private static final int BLOCKS_SCHEDULED_ROLL_INTERVAL = 600*1000; //10min
|
private static final int BLOCKS_SCHEDULED_ROLL_INTERVAL = 600*1000; //10min
|
||||||
private int volumeFailures = 0;
|
private int volumeFailures = 0;
|
||||||
|
|
||||||
|
/** Set to false after processing first block report */
|
||||||
|
private boolean firstBlockReport = true;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* When set to true, the node is not in include list and is not allowed
|
* When set to true, the node is not in include list and is not allowed
|
||||||
* to communicate with the namenode
|
* to communicate with the namenode
|
||||||
|
@ -608,6 +612,11 @@ public class DatanodeDescriptor extends DatanodeInfo {
|
||||||
if (heartbeatedSinceFailover) {
|
if (heartbeatedSinceFailover) {
|
||||||
blockContentsStale = false;
|
blockContentsStale = false;
|
||||||
}
|
}
|
||||||
|
firstBlockReport = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean isFirstBlockReport() {
|
||||||
|
return firstBlockReport;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -192,7 +192,7 @@ public class TestHASafeMode {
|
||||||
* knows there should only be 90 blocks, but it's still in safemode.
|
* knows there should only be 90 blocks, but it's still in safemode.
|
||||||
* 8. NN2 doesn't ever recheck whether it should leave safemode.
|
* 8. NN2 doesn't ever recheck whether it should leave safemode.
|
||||||
*
|
*
|
||||||
* This is essentially the inverse of {@link #testBlocksAddedWhileStandbyShutdown()}
|
* This is essentially the inverse of {@link #testBlocksAddedBeforeStandbyRestart()}
|
||||||
*/
|
*/
|
||||||
@Test
|
@Test
|
||||||
public void testBlocksRemovedBeforeStandbyRestart() throws Exception {
|
public void testBlocksRemovedBeforeStandbyRestart() throws Exception {
|
||||||
|
@ -328,6 +328,39 @@ public class TestHASafeMode {
|
||||||
"total blocks 5. Safe mode will be turned off automatically"));
|
"total blocks 5. Safe mode will be turned off automatically"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Regression test for HDFS-2753. In this bug, the following sequence was
|
||||||
|
* observed:
|
||||||
|
* - Some blocks are written to DNs while the SBN was down. This causes
|
||||||
|
* the blockReceived messages to get queued in the BPServiceActor on the
|
||||||
|
* DN.
|
||||||
|
* - When the SBN returns, the DN re-registers with the SBN, and then
|
||||||
|
* flushes its blockReceived queue to the SBN before it sends its
|
||||||
|
* first block report. This caused the first block report to be
|
||||||
|
* incorrectly ignored.
|
||||||
|
* - The SBN would become stuck in safemode.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
public void testBlocksAddedWhileStandbyIsDown() throws Exception {
|
||||||
|
DFSTestUtil.createFile(fs, new Path("/test"), 3*BLOCK_SIZE, (short) 3, 1L);
|
||||||
|
|
||||||
|
banner("Stopping standby");
|
||||||
|
cluster.shutdownNameNode(1);
|
||||||
|
|
||||||
|
DFSTestUtil.createFile(fs, new Path("/test2"), 3*BLOCK_SIZE, (short) 3, 1L);
|
||||||
|
|
||||||
|
banner("Rolling edit log so standby gets all edits on restart");
|
||||||
|
nn0.getRpcServer().rollEditLog();
|
||||||
|
|
||||||
|
restartStandby();
|
||||||
|
String status = nn1.getNamesystem().getSafemode();
|
||||||
|
assertTrue("Bad safemode status: '" + status + "'",
|
||||||
|
status.startsWith(
|
||||||
|
"Safe mode is ON." +
|
||||||
|
"The reported blocks 6 has reached the threshold 0.9990 of " +
|
||||||
|
"total blocks 6. Safe mode will be turned off automatically"));
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Print a big banner in the test log to make debugging easier.
|
* Print a big banner in the test log to make debugging easier.
|
||||||
*/
|
*/
|
||||||
|
|
Loading…
Reference in New Issue