HDFS-9023. When NN is not able to identify DN for replication, reason behind it can be logged.

This commit is contained in:
Xiao Chen 2017-12-28 11:52:49 -08:00
parent d31c9d8c49
commit 5bf7e594d7
2 changed files with 64 additions and 12 deletions

View File

@ -62,6 +62,28 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
} }
}; };
/**
 * Per-thread tally of how many times each {@link NodeNotChosenReason} has
 * been recorded. Each thread lazily gets its own, initially empty, map.
 */
private static final ThreadLocal<HashMap<NodeNotChosenReason, Integer>>
    CHOOSE_RANDOM_REASONS = ThreadLocal.withInitial(HashMap::new);
/**
 * High-level reasons for which a datanode is not chosen as a target.
 * Each constant carries the text that is inserted into log messages.
 */
private enum NodeNotChosenReason {
  NOT_IN_SERVICE("the node isn't in service"),
  NODE_STALE("the node is stale"),
  NODE_TOO_BUSY("the node is too busy"),
  TOO_MANY_NODES_ON_RACK("the rack has too many chosen nodes"),
  // Wording fixed: previously read "no enough storage space ...".
  NOT_ENOUGH_STORAGE_SPACE("not enough storage space to place the block");

  /** Human-readable explanation used when the reason is logged. */
  private final String text;

  NodeNotChosenReason(final String logText) {
    text = logText;
  }

  /**
   * @return the explanatory text associated with this reason
   */
  private String getText() {
    return text;
  }
}
protected boolean considerLoad; protected boolean considerLoad;
protected double considerLoadFactor; protected double considerLoadFactor;
private boolean preferLocalNode; private boolean preferLocalNode;
@ -711,6 +733,7 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
builder.setLength(0); builder.setLength(0);
builder.append("["); builder.append("[");
} }
CHOOSE_RANDOM_REASONS.get().clear();
boolean badTarget = false; boolean badTarget = false;
DatanodeStorageInfo firstChosen = null; DatanodeStorageInfo firstChosen = null;
while (numOfReplicas > 0) { while (numOfReplicas > 0) {
@ -781,14 +804,24 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
} }
if (numOfReplicas>0) { if (numOfReplicas>0) {
String detail = enableDebugLogging; String detail = enableDebugLogging;
if (LOG.isDebugEnabled()) { if (LOG.isDebugEnabled() && builder != null) {
if (badTarget && builder != null) {
detail = builder.toString(); detail = builder.toString();
if (badTarget) {
builder.setLength(0); builder.setLength(0);
} else { } else {
if (detail.length() > 1) {
// only log if there's more than "[", which is always appended at
// the beginning of this method.
LOG.debug(detail);
}
detail = ""; detail = "";
} }
} }
final HashMap<NodeNotChosenReason, Integer> reasonMap =
CHOOSE_RANDOM_REASONS.get();
if (!reasonMap.isEmpty()) {
LOG.info("Not enough replicas was chosen. Reason:{}", reasonMap);
}
throw new NotEnoughReplicasException(detail); throw new NotEnoughReplicasException(detail);
} }
@ -834,19 +867,38 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
if (storage != null) { if (storage != null) {
results.add(storage); results.add(storage);
} else { } else {
logNodeIsNotChosen(dnd, "no good storage to place the block "); logNodeIsNotChosen(dnd, NodeNotChosenReason.NOT_ENOUGH_STORAGE_SPACE,
" for storage type " + storageType);
} }
return storage; return storage;
} }
private static void logNodeIsNotChosen(DatanodeDescriptor node, private static void logNodeIsNotChosen(DatanodeDescriptor node,
String reason) { NodeNotChosenReason reason) {
logNodeIsNotChosen(node, reason, null);
}
private static void logNodeIsNotChosen(DatanodeDescriptor node,
NodeNotChosenReason reason, String reasonDetails) {
assert reason != null;
if (LOG.isDebugEnabled()) { if (LOG.isDebugEnabled()) {
// build the error message for later use. // build the error message for later use.
debugLoggingBuilder.get() debugLoggingBuilder.get()
.append("\n Datanode ").append(node) .append("\n Datanode ").append(node)
.append(" is not chosen since ").append(reason).append("."); .append(" is not chosen since ").append(reason.getText());
if (reasonDetails != null) {
debugLoggingBuilder.get().append(" ").append(reasonDetails);
} }
debugLoggingBuilder.get().append(".");
}
// always populate reason map to log high level reasons.
final HashMap<NodeNotChosenReason, Integer> reasonMap =
CHOOSE_RANDOM_REASONS.get();
Integer base = reasonMap.get(reason);
if (base == null) {
base = 0;
}
reasonMap.put(reason, base + 1);
} }
/** /**
@ -868,13 +920,13 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
boolean avoidStaleNodes) { boolean avoidStaleNodes) {
// check if the node is (being) decommissioned // check if the node is (being) decommissioned
if (!node.isInService()) { if (!node.isInService()) {
logNodeIsNotChosen(node, "the node isn't in service."); logNodeIsNotChosen(node, NodeNotChosenReason.NOT_IN_SERVICE);
return false; return false;
} }
if (avoidStaleNodes) { if (avoidStaleNodes) {
if (node.isStale(this.staleInterval)) { if (node.isStale(this.staleInterval)) {
logNodeIsNotChosen(node, "the node is stale "); logNodeIsNotChosen(node, NodeNotChosenReason.NODE_STALE);
return false; return false;
} }
} }
@ -885,8 +937,8 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
stats.getInServiceXceiverAverage(); stats.getInServiceXceiverAverage();
final int nodeLoad = node.getXceiverCount(); final int nodeLoad = node.getXceiverCount();
if (nodeLoad > maxLoad) { if (nodeLoad > maxLoad) {
logNodeIsNotChosen(node, "the node is too busy (load: " + nodeLoad logNodeIsNotChosen(node, NodeNotChosenReason.NODE_TOO_BUSY,
+ " > " + maxLoad + ") "); "(load: " + nodeLoad + " > " + maxLoad + ")");
return false; return false;
} }
} }
@ -901,7 +953,7 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
} }
} }
if (counter > maxTargetPerRack) { if (counter > maxTargetPerRack) {
logNodeIsNotChosen(node, "the rack has too many chosen nodes "); logNodeIsNotChosen(node, NodeNotChosenReason.TOO_MANY_NODES_ON_RACK);
return false; return false;
} }

View File

@ -764,7 +764,7 @@ public class DatanodeDescriptor extends DatanodeInfo {
} }
} }
if (requiredSize > remaining - scheduledSize) { if (requiredSize > remaining - scheduledSize) {
LOG.debug( BlockPlacementPolicy.LOG.debug(
"The node {} does not have enough {} space (required={}," "The node {} does not have enough {} space (required={},"
+ " scheduled={}, remaining={}).", + " scheduled={}, remaining={}).",
this, t, requiredSize, scheduledSize, remaining); this, t, requiredSize, scheduledSize, remaining);