From 947d484d35f1ace279b5ab7052df8597ad21e47d Mon Sep 17 00:00:00 2001 From: Xiao Chen Date: Thu, 28 Dec 2017 11:53:49 -0800 Subject: [PATCH] HDFS-9023. When NN is not able to identify DN for replication, reason behind it can be logged. (cherry picked from commit 50b233d35692d249d2115a2481cde770f50cf7a7) --- .../BlockPlacementPolicyDefault.java | 79 ++++++++++++++++--- .../blockmanagement/DatanodeDescriptor.java | 2 +- 2 files changed, 69 insertions(+), 12 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockPlacementPolicyDefault.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockPlacementPolicyDefault.java index 50ca11a403b..8a72ec745a9 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockPlacementPolicyDefault.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockPlacementPolicyDefault.java @@ -61,6 +61,33 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy { } }; + private static final ThreadLocal> + CHOOSE_RANDOM_REASONS = + new ThreadLocal>() { + @Override + protected HashMap initialValue() { + return new HashMap<>(); + } + }; + + private enum NodeNotChosenReason { + NOT_IN_SERVICE("the node isn't in service"), + NODE_STALE("the node is stale"), + NODE_TOO_BUSY("the node is too busy"), + TOO_MANY_NODES_ON_RACK("the rack has too many chosen nodes"), + NOT_ENOUGH_STORAGE_SPACE("no enough storage space to place the block"); + + private final String text; + + NodeNotChosenReason(final String logText) { + text = logText; + } + + private String getText() { + return text; + } + } + protected boolean considerLoad; protected double considerLoadFactor; private boolean preferLocalNode; @@ -710,6 +737,7 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy { builder.setLength(0); builder.append("["); } + CHOOSE_RANDOM_REASONS.get().clear(); boolean badTarget = false; DatanodeStorageInfo firstChosen = null; while (numOfReplicas > 0) { @@ -780,14 +808,24 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy { } if (numOfReplicas>0) { String detail = enableDebugLogging; - if (LOG.isDebugEnabled()) { - if (badTarget && builder != null) { - detail = builder.toString(); + if (LOG.isDebugEnabled() && builder != null) { + detail = builder.toString(); + if (badTarget) { builder.setLength(0); } else { + if (detail.length() > 1) { + // only log if there's more than "[", which is always appended at + // the beginning of this method. + LOG.debug(detail); + } detail = ""; } } + final HashMap reasonMap = + CHOOSE_RANDOM_REASONS.get(); + if (!reasonMap.isEmpty()) { + LOG.info("Not enough replicas was chosen. Reason:{}", reasonMap); + } throw new NotEnoughReplicasException(detail); } @@ -833,19 +871,38 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy { if (storage != null) { results.add(storage); } else { - logNodeIsNotChosen(dnd, "no good storage to place the block "); + logNodeIsNotChosen(dnd, NodeNotChosenReason.NOT_ENOUGH_STORAGE_SPACE, + " for storage type " + storageType); } return storage; } private static void logNodeIsNotChosen(DatanodeDescriptor node, - String reason) { + NodeNotChosenReason reason) { + logNodeIsNotChosen(node, reason, null); + } + + private static void logNodeIsNotChosen(DatanodeDescriptor node, + NodeNotChosenReason reason, String reasonDetails) { + assert reason != null; if (LOG.isDebugEnabled()) { // build the error message for later use. debugLoggingBuilder.get() .append("\n Datanode ").append(node) - .append(" is not chosen since ").append(reason).append("."); + .append(" is not chosen since ").append(reason.getText()); + if (reasonDetails != null) { + debugLoggingBuilder.get().append(" ").append(reasonDetails); + } + debugLoggingBuilder.get().append("."); } + // always populate reason map to log high level reasons. + final HashMap reasonMap = + CHOOSE_RANDOM_REASONS.get(); + Integer base = reasonMap.get(reason); + if (base == null) { + base = 0; + } + reasonMap.put(reason, base + 1); } /** @@ -867,13 +924,13 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy { boolean avoidStaleNodes) { // check if the node is (being) decommissioned if (!node.isInService()) { - logNodeIsNotChosen(node, "the node isn't in service."); + logNodeIsNotChosen(node, NodeNotChosenReason.NOT_IN_SERVICE); return false; } if (avoidStaleNodes) { if (node.isStale(this.staleInterval)) { - logNodeIsNotChosen(node, "the node is stale "); + logNodeIsNotChosen(node, NodeNotChosenReason.NODE_STALE); return false; } } @@ -884,8 +941,8 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy { stats.getInServiceXceiverAverage(); final int nodeLoad = node.getXceiverCount(); if (nodeLoad > maxLoad) { - logNodeIsNotChosen(node, "the node is too busy (load: " + nodeLoad - + " > " + maxLoad + ") "); + logNodeIsNotChosen(node, NodeNotChosenReason.NODE_TOO_BUSY, + "(load: " + nodeLoad + " > " + maxLoad + ")"); return false; } } @@ -900,7 +957,7 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy { } } if (counter > maxTargetPerRack) { - logNodeIsNotChosen(node, "the rack has too many chosen nodes "); + logNodeIsNotChosen(node, NodeNotChosenReason.TOO_MANY_NODES_ON_RACK); return false; } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeDescriptor.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeDescriptor.java index a725b44dcd2..dd59b6f9ec4 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeDescriptor.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeDescriptor.java @@ -683,7 +683,7 @@ public class DatanodeDescriptor extends DatanodeInfo { } } if (requiredSize > remaining - scheduledSize) { - LOG.debug( + BlockPlacementPolicy.LOG.debug( "The node {} does not have enough {} space (required={}," + " scheduled={}, remaining={}).", this, t, requiredSize, scheduledSize, remaining);