HDFS-6120. Fix and improve safe mode log messages. (Arpit Agarwal)

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1580047 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Arpit Agarwal 2014-03-21 20:57:13 +00:00
parent 2fbd94db3e
commit fc53af9c4c
4 changed files with 56 additions and 41 deletions

View File

@ -430,6 +430,8 @@ Release 2.4.0 - UNRELEASED
 HDFS-6138. Add a user guide for how to use viewfs with federation.
 (sanjay and szetszwo via szetszwo)
+HDFS-6120. Fix and improve safe mode log messages. (Arpit Agarwal)
 OPTIMIZATIONS
 HDFS-5790. LeaseManager.findPath is very slow when many leases need recovery

View File

@ -91,7 +91,6 @@ import java.io.BufferedWriter;
 import java.io.ByteArrayInputStream;
 import java.io.DataInput;
 import java.io.DataInputStream;
-import java.io.DataOutputStream;
 import java.io.File;
 import java.io.FileNotFoundException;
 import java.io.FileOutputStream;
@ -4823,13 +4822,21 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
 * @return true if can leave or false otherwise.
 */
 private synchronized boolean canLeave() {
-  if (reached == 0)
-    return false;
-  if (now() - reached < extension) {
-    reportStatus("STATE* Safe mode ON.", false);
+  if (reached == 0) {
     return false;
   }
-  return !needEnter();
+  if (now() - reached < extension) {
+    reportStatus("STATE* Safe mode ON, in safe mode extension.", false);
+    return false;
+  }
+  if (needEnter()) {
+    reportStatus("STATE* Safe mode ON, thresholds not met.", false);
+    return false;
+  }
+  return true;
 }
 /**
@ -4973,56 +4980,59 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
 * A tip on how safe mode is to be turned off: manually or automatically.
 */
 String getTurnOffTip() {
-  if(!isOn())
+  if(!isOn()) {
     return "Safe mode is OFF.";
+  }
   //Manual OR low-resource safemode. (Admin intervention required)
-  String leaveMsg = "It was turned on manually. ";
+  String adminMsg = "It was turned on manually. ";
   if (areResourcesLow()) {
-    leaveMsg = "Resources are low on NN. Please add or free up more "
+    adminMsg = "Resources are low on NN. Please add or free up more "
       + "resources then turn off safe mode manually. NOTE: If you turn off"
       + " safe mode before adding resources, "
       + "the NN will immediately return to safe mode. ";
   }
   if (isManual() || areResourcesLow()) {
-    return leaveMsg
+    return adminMsg
       + "Use \"hdfs dfsadmin -safemode leave\" to turn safe mode off.";
   }
-  //Automatic safemode. System will come out of safemode automatically.
-  leaveMsg = "Safe mode will be turned off automatically";
+  boolean thresholdsMet = true;
   int numLive = getNumLiveDataNodes();
   String msg = "";
-  if (reached == 0) {
-    if (blockSafe < blockThreshold) {
-      msg += String.format(
-        "The reported blocks %d needs additional %d"
-        + " blocks to reach the threshold %.4f of total blocks %d.\n",
-        blockSafe, (blockThreshold - blockSafe) + 1, threshold, blockTotal);
-    }
-    if (numLive < datanodeThreshold) {
-      msg += String.format(
-        "The number of live datanodes %d needs an additional %d live "
-        + "datanodes to reach the minimum number %d.\n",
-        numLive, (datanodeThreshold - numLive), datanodeThreshold);
-    }
+  if (blockSafe < blockThreshold) {
+    msg += String.format(
+      "The reported blocks %d needs additional %d"
+      + " blocks to reach the threshold %.4f of total blocks %d.\n",
+      blockSafe, (blockThreshold - blockSafe) + 1, threshold, blockTotal);
+    thresholdsMet = false;
   } else {
-    msg = String.format("The reported blocks %d has reached the threshold"
+    msg += String.format("The reported blocks %d has reached the threshold"
       + " %.4f of total blocks %d. ", blockSafe, threshold, blockTotal);
+  }
+  if (numLive < datanodeThreshold) {
+    msg += String.format(
+      "The number of live datanodes %d needs an additional %d live "
+      + "datanodes to reach the minimum number %d.\n",
+      numLive, (datanodeThreshold - numLive), datanodeThreshold);
+    thresholdsMet = false;
+  } else {
     msg += String.format("The number of live datanodes %d has reached "
       + "the minimum number %d. ",
       numLive, datanodeThreshold);
   }
-  msg += leaveMsg;
-  // threshold is not reached or manual or resources low
-  if(reached == 0 || (isManual() && !areResourcesLow())) {
-    return msg;
+  msg += (reached > 0) ? "In safe mode extension. " : "";
+  msg += "Safe mode will be turned off automatically ";
+  if (!thresholdsMet) {
+    msg += "once the thresholds have been reached.";
+  } else if (reached + extension - now() > 0) {
+    msg += ("in " + (reached + extension - now()) / 1000 + " seconds.");
+  } else {
+    msg += "soon.";
   }
-  // extension period is in progress
-  return msg + (reached + extension - now() > 0 ?
-    " in " + (reached + extension - now()) / 1000 + " seconds."
-    : " soon.");
+  return msg;
 }
 /**

View File

@ -182,7 +182,9 @@ public class TestSafeMode {
 String status = nn.getNamesystem().getSafemode();
 assertEquals("Safe mode is ON. The reported blocks 0 needs additional " +
     "15 blocks to reach the threshold 0.9990 of total blocks 15.\n" +
-    "Safe mode will be turned off automatically", status);
+    "The number of live datanodes 0 has reached the minimum number 0. " +
+    "Safe mode will be turned off automatically once the thresholds " +
+    "have been reached.", status);
 assertFalse("Mis-replicated block queues should not be initialized " +
     "until threshold is crossed",
     NameNodeAdapter.safeModeInitializedReplQueues(nn));

View File

@ -495,7 +495,8 @@ public class TestHASafeMode {
       "Safe mode is ON. The reported blocks " + safe + " has reached the "
       + "threshold 0.9990 of total blocks " + total + ". The number of "
       + "live datanodes " + numNodes + " has reached the minimum number "
-      + nodeThresh + ". Safe mode will be turned off automatically"));
+      + nodeThresh + ". In safe mode extension. "
+      + "Safe mode will be turned off automatically"));
 } else {
   int additional = total - safe;
   assertTrue("Bad safemode status: '" + status + "'",
@ -565,8 +566,8 @@ public class TestHASafeMode {
     status.startsWith(
       "Safe mode is ON. The reported blocks 10 has reached the threshold "
       + "0.9990 of total blocks 10. The number of live datanodes 3 has "
-      + "reached the minimum number 0. Safe mode will be turned off "
-      + "automatically"));
+      + "reached the minimum number 0. In safe mode extension. "
+      + "Safe mode will be turned off automatically"));
 // Delete those blocks while the SBN is in safe mode.
 // Immediately roll the edit log before the actual deletions are sent