HDFS-3912. Detect and avoid stale datanodes for writes. Contributed by Jing Zhao

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1397211 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Suresh Srinivas 2012-10-11 18:08:27 +00:00
parent f174adfc15
commit 2887bbb33c
11 changed files with 635 additions and 163 deletions

View File

@ -13,10 +13,6 @@ Trunk (Unreleased)
HDFS-3601. Add BlockPlacementPolicyWithNodeGroup to support block placement HDFS-3601. Add BlockPlacementPolicyWithNodeGroup to support block placement
with 4-layer network topology. (Junping Du via szetszwo) with 4-layer network topology. (Junping Du via szetszwo)
HDFS-3703. Datanodes are marked stale if heartbeat is not received in
configured timeout and are selected as the last location to read from.
(Jing Zhao via suresh)
IMPROVEMENTS IMPROVEMENTS
HDFS-1620. Rename HdfsConstants -> HdfsServerConstants, FSConstants -> HDFS-1620. Rename HdfsConstants -> HdfsServerConstants, FSConstants ->
@ -238,6 +234,9 @@ Release 2.0.3-alpha - Unreleased
HDFS-2656. Add libwebhdfs, a pure C client based on WebHDFS. HDFS-2656. Add libwebhdfs, a pure C client based on WebHDFS.
(Jaimin D Jetly and Jing Zhao via szetszwo) (Jaimin D Jetly and Jing Zhao via szetszwo)
HDFS-3912. Detect and avoid stale datanodes for writes.
(Jing Zhao via suresh)
IMPROVEMENTS IMPROVEMENTS
HDFS-3925. Prettify PipelineAck#toString() for printing to a log HDFS-3925. Prettify PipelineAck#toString() for printing to a log
@ -349,6 +348,11 @@ Release 2.0.2-alpha - 2012-09-07
HDFS-2793. Add an admin command to trigger an edit log roll. (todd) HDFS-2793. Add an admin command to trigger an edit log roll. (todd)
HDFS-3703. Datanodes are marked stale if heartbeat is not received in
configured timeout and are selected as the last location to read from.
(Jing Zhao via suresh)
IMPROVEMENTS IMPROVEMENTS
HDFS-3390. DFSAdmin should print full stack traces of errors when DEBUG HDFS-3390. DFSAdmin should print full stack traces of errors when DEBUG

View File

@ -180,9 +180,21 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
// Whether to enable datanode's stale state detection and usage // Whether to enable datanode's stale state detection and usage
public static final String DFS_NAMENODE_CHECK_STALE_DATANODE_KEY = "dfs.namenode.check.stale.datanode"; public static final String DFS_NAMENODE_CHECK_STALE_DATANODE_KEY = "dfs.namenode.check.stale.datanode";
public static final boolean DFS_NAMENODE_CHECK_STALE_DATANODE_DEFAULT = false; public static final boolean DFS_NAMENODE_CHECK_STALE_DATANODE_DEFAULT = false;
// Whether to enable datanode's stale state detection and usage
public static final String DFS_NAMENODE_AVOID_STALE_DATANODE_FOR_WRITE_KEY = "dfs.namenode.avoid.write.stale.datanode";
public static final boolean DFS_NAMENODE_AVOID_STALE_DATANODE_FOR_WRITE_DEFAULT = false;
// The default value of the time interval for marking datanodes as stale // The default value of the time interval for marking datanodes as stale
public static final String DFS_NAMENODE_STALE_DATANODE_INTERVAL_KEY = "dfs.namenode.stale.datanode.interval"; public static final String DFS_NAMENODE_STALE_DATANODE_INTERVAL_KEY = "dfs.namenode.stale.datanode.interval";
public static final long DFS_NAMENODE_STALE_DATANODE_INTERVAL_MILLI_DEFAULT = 30 * 1000; // 30s public static final long DFS_NAMENODE_STALE_DATANODE_INTERVAL_DEFAULT = 30 * 1000; // 30s
// The stale interval cannot be too small since otherwise this may cause too frequent churn on stale states.
// This value uses the times of heartbeat interval to define the minimum value for stale interval.
public static final String DFS_NAMENODE_STALE_DATANODE_MINIMUM_INTERVAL_KEY = "dfs.namenode.stale.datanode.minimum.interval";
public static final int DFS_NAMENODE_STALE_DATANODE_MINIMUM_INTERVAL_DEFAULT = 3; // i.e. min_interval is 3 * heartbeat_interval = 9s
// When the number stale datanodes marked as stale reached this certian ratio,
// stop avoiding writing to stale nodes so as to prevent causing hotspots.
public static final String DFS_NAMENODE_USE_STALE_DATANODE_FOR_WRITE_RATIO_KEY = "dfs.namenode.write.stale.datanode.ratio";
public static final float DFS_NAMENODE_USE_STALE_DATANODE_FOR_WRITE_RATIO_DEFAULT = 0.5f;
// Replication monitoring related keys // Replication monitoring related keys
public static final String DFS_NAMENODE_INVALIDATE_WORK_PCT_PER_ITERATION = public static final String DFS_NAMENODE_INVALIDATE_WORK_PCT_PER_ITERATION =

View File

@ -62,6 +62,8 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
protected NetworkTopology clusterMap; protected NetworkTopology clusterMap;
private FSClusterStats stats; private FSClusterStats stats;
protected long heartbeatInterval; // interval for DataNode heartbeats protected long heartbeatInterval; // interval for DataNode heartbeats
private long staleInterval; // interval used to identify stale DataNodes
/** /**
* A miss of that many heartbeats is tolerated for replica deletion policy. * A miss of that many heartbeats is tolerated for replica deletion policy.
*/ */
@ -78,7 +80,8 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
@Override @Override
public void initialize(Configuration conf, FSClusterStats stats, public void initialize(Configuration conf, FSClusterStats stats,
NetworkTopology clusterMap) { NetworkTopology clusterMap) {
this.considerLoad = conf.getBoolean(DFSConfigKeys.DFS_NAMENODE_REPLICATION_CONSIDERLOAD_KEY, true); this.considerLoad = conf.getBoolean(
DFSConfigKeys.DFS_NAMENODE_REPLICATION_CONSIDERLOAD_KEY, true);
this.stats = stats; this.stats = stats;
this.clusterMap = clusterMap; this.clusterMap = clusterMap;
this.heartbeatInterval = conf.getLong( this.heartbeatInterval = conf.getLong(
@ -87,6 +90,9 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
this.tolerateHeartbeatMultiplier = conf.getInt( this.tolerateHeartbeatMultiplier = conf.getInt(
DFSConfigKeys.DFS_NAMENODE_TOLERATE_HEARTBEAT_MULTIPLIER_KEY, DFSConfigKeys.DFS_NAMENODE_TOLERATE_HEARTBEAT_MULTIPLIER_KEY,
DFSConfigKeys.DFS_NAMENODE_TOLERATE_HEARTBEAT_MULTIPLIER_DEFAULT); DFSConfigKeys.DFS_NAMENODE_TOLERATE_HEARTBEAT_MULTIPLIER_DEFAULT);
this.staleInterval = conf.getLong(
DFSConfigKeys.DFS_NAMENODE_STALE_DATANODE_INTERVAL_KEY,
DFSConfigKeys.DFS_NAMENODE_STALE_DATANODE_INTERVAL_DEFAULT);
} }
protected ThreadLocal<StringBuilder> threadLocalBuilder = protected ThreadLocal<StringBuilder> threadLocalBuilder =
@ -155,9 +161,10 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
writer=null; writer=null;
} }
boolean avoidStaleNodes = (stats != null
&& stats.isAvoidingStaleDataNodesForWrite());
DatanodeDescriptor localNode = chooseTarget(numOfReplicas, writer, DatanodeDescriptor localNode = chooseTarget(numOfReplicas, writer,
excludedNodes, blocksize, excludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes);
maxNodesPerRack, results);
if (!returnChosenNodes) { if (!returnChosenNodes) {
results.removeAll(chosenNodes); results.removeAll(chosenNodes);
} }
@ -173,8 +180,8 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
HashMap<Node, Node> excludedNodes, HashMap<Node, Node> excludedNodes,
long blocksize, long blocksize,
int maxNodesPerRack, int maxNodesPerRack,
List<DatanodeDescriptor> results) { List<DatanodeDescriptor> results,
final boolean avoidStaleNodes) {
if (numOfReplicas == 0 || clusterMap.getNumOfLeaves()==0) { if (numOfReplicas == 0 || clusterMap.getNumOfLeaves()==0) {
return writer; return writer;
} }
@ -186,17 +193,20 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
writer = results.get(0); writer = results.get(0);
} }
// Keep a copy of original excludedNodes
final HashMap<Node, Node> oldExcludedNodes = avoidStaleNodes ?
new HashMap<Node, Node>(excludedNodes) : null;
try { try {
if (numOfResults == 0) { if (numOfResults == 0) {
writer = chooseLocalNode(writer, excludedNodes, writer = chooseLocalNode(writer, excludedNodes, blocksize,
blocksize, maxNodesPerRack, results); maxNodesPerRack, results, avoidStaleNodes);
if (--numOfReplicas == 0) { if (--numOfReplicas == 0) {
return writer; return writer;
} }
} }
if (numOfResults <= 1) { if (numOfResults <= 1) {
chooseRemoteRack(1, results.get(0), excludedNodes, chooseRemoteRack(1, results.get(0), excludedNodes, blocksize,
blocksize, maxNodesPerRack, results); maxNodesPerRack, results, avoidStaleNodes);
if (--numOfReplicas == 0) { if (--numOfReplicas == 0) {
return writer; return writer;
} }
@ -204,24 +214,36 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
if (numOfResults <= 2) { if (numOfResults <= 2) {
if (clusterMap.isOnSameRack(results.get(0), results.get(1))) { if (clusterMap.isOnSameRack(results.get(0), results.get(1))) {
chooseRemoteRack(1, results.get(0), excludedNodes, chooseRemoteRack(1, results.get(0), excludedNodes,
blocksize, maxNodesPerRack, results); blocksize, maxNodesPerRack,
results, avoidStaleNodes);
} else if (newBlock){ } else if (newBlock){
chooseLocalRack(results.get(1), excludedNodes, blocksize, chooseLocalRack(results.get(1), excludedNodes, blocksize,
maxNodesPerRack, results); maxNodesPerRack, results, avoidStaleNodes);
} else { } else {
chooseLocalRack(writer, excludedNodes, blocksize, chooseLocalRack(writer, excludedNodes, blocksize, maxNodesPerRack,
maxNodesPerRack, results); results, avoidStaleNodes);
} }
if (--numOfReplicas == 0) { if (--numOfReplicas == 0) {
return writer; return writer;
} }
} }
chooseRandom(numOfReplicas, NodeBase.ROOT, excludedNodes, chooseRandom(numOfReplicas, NodeBase.ROOT, excludedNodes, blocksize,
blocksize, maxNodesPerRack, results); maxNodesPerRack, results, avoidStaleNodes);
} catch (NotEnoughReplicasException e) { } catch (NotEnoughReplicasException e) {
LOG.warn("Not able to place enough replicas, still in need of " LOG.warn("Not able to place enough replicas, still in need of "
+ numOfReplicas + " to reach " + totalReplicasExpected + "\n" + numOfReplicas + " to reach " + totalReplicasExpected + "\n"
+ e.getMessage()); + e.getMessage());
if (avoidStaleNodes) {
// ecxludedNodes now has - initial excludedNodes, any nodes that were
// chosen and nodes that were tried but were not chosen because they
// were stale, decommissioned or for any other reason a node is not
// chosen for write. Retry again now not avoiding stale node
for (Node node : results) {
oldExcludedNodes.put(node, node);
}
return chooseTarget(numOfReplicas, writer, oldExcludedNodes, blocksize,
maxNodesPerRack, results, false);
}
} }
return writer; return writer;
} }
@ -236,26 +258,27 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
HashMap<Node, Node> excludedNodes, HashMap<Node, Node> excludedNodes,
long blocksize, long blocksize,
int maxNodesPerRack, int maxNodesPerRack,
List<DatanodeDescriptor> results) List<DatanodeDescriptor> results,
boolean avoidStaleNodes)
throws NotEnoughReplicasException { throws NotEnoughReplicasException {
// if no local machine, randomly choose one node // if no local machine, randomly choose one node
if (localMachine == null) if (localMachine == null)
return chooseRandom(NodeBase.ROOT, excludedNodes, return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
blocksize, maxNodesPerRack, results); maxNodesPerRack, results, avoidStaleNodes);
if (preferLocalNode) { if (preferLocalNode) {
// otherwise try local machine first // otherwise try local machine first
Node oldNode = excludedNodes.put(localMachine, localMachine); Node oldNode = excludedNodes.put(localMachine, localMachine);
if (oldNode == null) { // was not in the excluded list if (oldNode == null) { // was not in the excluded list
if (isGoodTarget(localMachine, blocksize, if (isGoodTarget(localMachine, blocksize, maxNodesPerRack, false,
maxNodesPerRack, false, results)) { results, avoidStaleNodes)) {
results.add(localMachine); results.add(localMachine);
return localMachine; return localMachine;
} }
} }
} }
// try a node on local rack // try a node on local rack
return chooseLocalRack(localMachine, excludedNodes, return chooseLocalRack(localMachine, excludedNodes, blocksize,
blocksize, maxNodesPerRack, results); maxNodesPerRack, results, avoidStaleNodes);
} }
/* choose one node from the rack that <i>localMachine</i> is on. /* choose one node from the rack that <i>localMachine</i> is on.
@ -270,19 +293,19 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
HashMap<Node, Node> excludedNodes, HashMap<Node, Node> excludedNodes,
long blocksize, long blocksize,
int maxNodesPerRack, int maxNodesPerRack,
List<DatanodeDescriptor> results) List<DatanodeDescriptor> results,
boolean avoidStaleNodes)
throws NotEnoughReplicasException { throws NotEnoughReplicasException {
// no local machine, so choose a random machine // no local machine, so choose a random machine
if (localMachine == null) { if (localMachine == null) {
return chooseRandom(NodeBase.ROOT, excludedNodes, return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
blocksize, maxNodesPerRack, results); maxNodesPerRack, results, avoidStaleNodes);
} }
// choose one from the local rack // choose one from the local rack
try { try {
return chooseRandom( return chooseRandom(localMachine.getNetworkLocation(), excludedNodes,
localMachine.getNetworkLocation(), blocksize, maxNodesPerRack, results, avoidStaleNodes);
excludedNodes, blocksize, maxNodesPerRack, results);
} catch (NotEnoughReplicasException e1) { } catch (NotEnoughReplicasException e1) {
// find the second replica // find the second replica
DatanodeDescriptor newLocal=null; DatanodeDescriptor newLocal=null;
@ -296,18 +319,17 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
} }
if (newLocal != null) { if (newLocal != null) {
try { try {
return chooseRandom( return chooseRandom(newLocal.getNetworkLocation(), excludedNodes,
newLocal.getNetworkLocation(), blocksize, maxNodesPerRack, results, avoidStaleNodes);
excludedNodes, blocksize, maxNodesPerRack, results);
} catch(NotEnoughReplicasException e2) { } catch(NotEnoughReplicasException e2) {
//otherwise randomly choose one from the network //otherwise randomly choose one from the network
return chooseRandom(NodeBase.ROOT, excludedNodes, return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
blocksize, maxNodesPerRack, results); maxNodesPerRack, results, avoidStaleNodes);
} }
} else { } else {
//otherwise randomly choose one from the network //otherwise randomly choose one from the network
return chooseRandom(NodeBase.ROOT, excludedNodes, return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
blocksize, maxNodesPerRack, results); maxNodesPerRack, results, avoidStaleNodes);
} }
} }
} }
@ -323,17 +345,19 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
HashMap<Node, Node> excludedNodes, HashMap<Node, Node> excludedNodes,
long blocksize, long blocksize,
int maxReplicasPerRack, int maxReplicasPerRack,
List<DatanodeDescriptor> results) List<DatanodeDescriptor> results,
boolean avoidStaleNodes)
throws NotEnoughReplicasException { throws NotEnoughReplicasException {
int oldNumOfReplicas = results.size(); int oldNumOfReplicas = results.size();
// randomly choose one node from remote racks // randomly choose one node from remote racks
try { try {
chooseRandom(numOfReplicas, "~" + localMachine.getNetworkLocation(), chooseRandom(numOfReplicas, "~" + localMachine.getNetworkLocation(),
excludedNodes, blocksize, maxReplicasPerRack, results); excludedNodes, blocksize, maxReplicasPerRack, results,
avoidStaleNodes);
} catch (NotEnoughReplicasException e) { } catch (NotEnoughReplicasException e) {
chooseRandom(numOfReplicas-(results.size()-oldNumOfReplicas), chooseRandom(numOfReplicas-(results.size()-oldNumOfReplicas),
localMachine.getNetworkLocation(), excludedNodes, blocksize, localMachine.getNetworkLocation(), excludedNodes, blocksize,
maxReplicasPerRack, results); maxReplicasPerRack, results, avoidStaleNodes);
} }
} }
@ -345,7 +369,8 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
HashMap<Node, Node> excludedNodes, HashMap<Node, Node> excludedNodes,
long blocksize, long blocksize,
int maxNodesPerRack, int maxNodesPerRack,
List<DatanodeDescriptor> results) List<DatanodeDescriptor> results,
boolean avoidStaleNodes)
throws NotEnoughReplicasException { throws NotEnoughReplicasException {
int numOfAvailableNodes = int numOfAvailableNodes =
clusterMap.countNumOfAvailableNodes(nodes, excludedNodes.keySet()); clusterMap.countNumOfAvailableNodes(nodes, excludedNodes.keySet());
@ -363,7 +388,8 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
Node oldNode = excludedNodes.put(chosenNode, chosenNode); Node oldNode = excludedNodes.put(chosenNode, chosenNode);
if (oldNode == null) { // chosenNode was not in the excluded list if (oldNode == null) { // chosenNode was not in the excluded list
numOfAvailableNodes--; numOfAvailableNodes--;
if (isGoodTarget(chosenNode, blocksize, maxNodesPerRack, results)) { if (isGoodTarget(chosenNode, blocksize,
maxNodesPerRack, results, avoidStaleNodes)) {
results.add(chosenNode); results.add(chosenNode);
adjustExcludedNodes(excludedNodes, chosenNode); adjustExcludedNodes(excludedNodes, chosenNode);
return chosenNode; return chosenNode;
@ -390,7 +416,8 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
HashMap<Node, Node> excludedNodes, HashMap<Node, Node> excludedNodes,
long blocksize, long blocksize,
int maxNodesPerRack, int maxNodesPerRack,
List<DatanodeDescriptor> results) List<DatanodeDescriptor> results,
boolean avoidStaleNodes)
throws NotEnoughReplicasException { throws NotEnoughReplicasException {
int numOfAvailableNodes = int numOfAvailableNodes =
@ -409,7 +436,8 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
if (oldNode == null) { if (oldNode == null) {
numOfAvailableNodes--; numOfAvailableNodes--;
if (isGoodTarget(chosenNode, blocksize, maxNodesPerRack, results)) { if (isGoodTarget(chosenNode, blocksize,
maxNodesPerRack, results, avoidStaleNodes)) {
numOfReplicas--; numOfReplicas--;
results.add(chosenNode); results.add(chosenNode);
adjustExcludedNodes(excludedNodes, chosenNode); adjustExcludedNodes(excludedNodes, chosenNode);
@ -451,9 +479,10 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
*/ */
private boolean isGoodTarget(DatanodeDescriptor node, private boolean isGoodTarget(DatanodeDescriptor node,
long blockSize, int maxTargetPerRack, long blockSize, int maxTargetPerRack,
List<DatanodeDescriptor> results) { List<DatanodeDescriptor> results,
return isGoodTarget(node, blockSize, maxTargetPerRack, boolean avoidStaleNodes) {
this.considerLoad, results); return isGoodTarget(node, blockSize, maxTargetPerRack, this.considerLoad,
results, avoidStaleNodes);
} }
/** /**
@ -467,6 +496,7 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
* @param considerLoad whether or not to consider load of the target node * @param considerLoad whether or not to consider load of the target node
* @param results A list containing currently chosen nodes. Used to check if * @param results A list containing currently chosen nodes. Used to check if
* too many nodes has been chosen in the target rack. * too many nodes has been chosen in the target rack.
* @param avoidStaleNodes Whether or not to avoid choosing stale nodes
* @return Return true if <i>node</i> has enough space, * @return Return true if <i>node</i> has enough space,
* does not have too much load, * does not have too much load,
* and the rack does not have too many nodes. * and the rack does not have too many nodes.
@ -474,7 +504,8 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
protected boolean isGoodTarget(DatanodeDescriptor node, protected boolean isGoodTarget(DatanodeDescriptor node,
long blockSize, int maxTargetPerRack, long blockSize, int maxTargetPerRack,
boolean considerLoad, boolean considerLoad,
List<DatanodeDescriptor> results) { List<DatanodeDescriptor> results,
boolean avoidStaleNodes) {
// check if the node is (being) decommissed // check if the node is (being) decommissed
if (node.isDecommissionInProgress() || node.isDecommissioned()) { if (node.isDecommissionInProgress() || node.isDecommissioned()) {
if(LOG.isDebugEnabled()) { if(LOG.isDebugEnabled()) {
@ -485,6 +516,17 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
return false; return false;
} }
if (avoidStaleNodes) {
if (node.isStale(this.staleInterval)) {
if (LOG.isDebugEnabled()) {
threadLocalBuilder.get().append(node.toString()).append(": ")
.append("Node ").append(NodeBase.getPath(node))
.append(" is not chosen because the node is staled ");
}
return false;
}
}
long remaining = node.getRemaining() - long remaining = node.getRemaining() -
(node.getBlocksScheduled() * blockSize); (node.getBlocksScheduled() * blockSize);
// check the remaining capacity of the target machine // check the remaining capacity of the target machine

View File

@ -64,23 +64,20 @@ public class BlockPlacementPolicyWithNodeGroup extends BlockPlacementPolicyDefau
* @return the chosen node * @return the chosen node
*/ */
@Override @Override
protected DatanodeDescriptor chooseLocalNode( protected DatanodeDescriptor chooseLocalNode(DatanodeDescriptor localMachine,
DatanodeDescriptor localMachine, HashMap<Node, Node> excludedNodes, long blocksize, int maxNodesPerRack,
HashMap<Node, Node> excludedNodes, List<DatanodeDescriptor> results, boolean avoidStaleNodes)
long blocksize,
int maxNodesPerRack,
List<DatanodeDescriptor> results)
throws NotEnoughReplicasException { throws NotEnoughReplicasException {
// if no local machine, randomly choose one node // if no local machine, randomly choose one node
if (localMachine == null) if (localMachine == null)
return chooseRandom(NodeBase.ROOT, excludedNodes, return chooseRandom(NodeBase.ROOT, excludedNodes,
blocksize, maxNodesPerRack, results); blocksize, maxNodesPerRack, results, avoidStaleNodes);
// otherwise try local machine first // otherwise try local machine first
Node oldNode = excludedNodes.put(localMachine, localMachine); Node oldNode = excludedNodes.put(localMachine, localMachine);
if (oldNode == null) { // was not in the excluded list if (oldNode == null) { // was not in the excluded list
if (isGoodTarget(localMachine, blocksize, if (isGoodTarget(localMachine, blocksize,
maxNodesPerRack, false, results)) { maxNodesPerRack, false, results, avoidStaleNodes)) {
results.add(localMachine); results.add(localMachine);
// Nodes under same nodegroup should be excluded. // Nodes under same nodegroup should be excluded.
addNodeGroupToExcludedNodes(excludedNodes, addNodeGroupToExcludedNodes(excludedNodes,
@ -92,13 +89,13 @@ public class BlockPlacementPolicyWithNodeGroup extends BlockPlacementPolicyDefau
// try a node on local node group // try a node on local node group
DatanodeDescriptor chosenNode = chooseLocalNodeGroup( DatanodeDescriptor chosenNode = chooseLocalNodeGroup(
(NetworkTopologyWithNodeGroup)clusterMap, localMachine, excludedNodes, (NetworkTopologyWithNodeGroup)clusterMap, localMachine, excludedNodes,
blocksize, maxNodesPerRack, results); blocksize, maxNodesPerRack, results, avoidStaleNodes);
if (chosenNode != null) { if (chosenNode != null) {
return chosenNode; return chosenNode;
} }
// try a node on local rack // try a node on local rack
return chooseLocalRack(localMachine, excludedNodes, return chooseLocalRack(localMachine, excludedNodes,
blocksize, maxNodesPerRack, results); blocksize, maxNodesPerRack, results, avoidStaleNodes);
} }
@Override @Override
@ -119,17 +116,15 @@ public class BlockPlacementPolicyWithNodeGroup extends BlockPlacementPolicyDefau
} }
@Override @Override
protected DatanodeDescriptor chooseLocalRack( protected DatanodeDescriptor chooseLocalRack(DatanodeDescriptor localMachine,
DatanodeDescriptor localMachine, HashMap<Node, Node> excludedNodes, long blocksize, int maxNodesPerRack,
HashMap<Node, Node> excludedNodes, List<DatanodeDescriptor> results, boolean avoidStaleNodes)
long blocksize,
int maxNodesPerRack,
List<DatanodeDescriptor> results)
throws NotEnoughReplicasException { throws NotEnoughReplicasException {
// no local machine, so choose a random machine // no local machine, so choose a random machine
if (localMachine == null) { if (localMachine == null) {
return chooseRandom(NodeBase.ROOT, excludedNodes, return chooseRandom(NodeBase.ROOT, excludedNodes,
blocksize, maxNodesPerRack, results); blocksize, maxNodesPerRack, results,
avoidStaleNodes);
} }
// choose one from the local rack, but off-nodegroup // choose one from the local rack, but off-nodegroup
@ -137,7 +132,8 @@ public class BlockPlacementPolicyWithNodeGroup extends BlockPlacementPolicyDefau
return chooseRandom(NetworkTopology.getFirstHalf( return chooseRandom(NetworkTopology.getFirstHalf(
localMachine.getNetworkLocation()), localMachine.getNetworkLocation()),
excludedNodes, blocksize, excludedNodes, blocksize,
maxNodesPerRack, results); maxNodesPerRack, results,
avoidStaleNodes);
} catch (NotEnoughReplicasException e1) { } catch (NotEnoughReplicasException e1) {
// find the second replica // find the second replica
DatanodeDescriptor newLocal=null; DatanodeDescriptor newLocal=null;
@ -151,39 +147,39 @@ public class BlockPlacementPolicyWithNodeGroup extends BlockPlacementPolicyDefau
} }
if (newLocal != null) { if (newLocal != null) {
try { try {
return chooseRandom(clusterMap.getRack(newLocal.getNetworkLocation()), return chooseRandom(
excludedNodes, blocksize, maxNodesPerRack, results); clusterMap.getRack(newLocal.getNetworkLocation()), excludedNodes,
blocksize, maxNodesPerRack, results, avoidStaleNodes);
} catch(NotEnoughReplicasException e2) { } catch(NotEnoughReplicasException e2) {
//otherwise randomly choose one from the network //otherwise randomly choose one from the network
return chooseRandom(NodeBase.ROOT, excludedNodes, return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
blocksize, maxNodesPerRack, results); maxNodesPerRack, results, avoidStaleNodes);
} }
} else { } else {
//otherwise randomly choose one from the network //otherwise randomly choose one from the network
return chooseRandom(NodeBase.ROOT, excludedNodes, return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
blocksize, maxNodesPerRack, results); maxNodesPerRack, results, avoidStaleNodes);
} }
} }
} }
@Override @Override
protected void chooseRemoteRack(int numOfReplicas, protected void chooseRemoteRack(int numOfReplicas,
DatanodeDescriptor localMachine, DatanodeDescriptor localMachine, HashMap<Node, Node> excludedNodes,
HashMap<Node, Node> excludedNodes, long blocksize, int maxReplicasPerRack, List<DatanodeDescriptor> results,
long blocksize, boolean avoidStaleNodes) throws NotEnoughReplicasException {
int maxReplicasPerRack,
List<DatanodeDescriptor> results)
throws NotEnoughReplicasException {
int oldNumOfReplicas = results.size(); int oldNumOfReplicas = results.size();
// randomly choose one node from remote racks // randomly choose one node from remote racks
try { try {
chooseRandom(numOfReplicas, "~"+NetworkTopology.getFirstHalf( chooseRandom(
localMachine.getNetworkLocation()), numOfReplicas,
excludedNodes, blocksize, maxReplicasPerRack, results); "~" + NetworkTopology.getFirstHalf(localMachine.getNetworkLocation()),
excludedNodes, blocksize, maxReplicasPerRack, results,
avoidStaleNodes);
} catch (NotEnoughReplicasException e) { } catch (NotEnoughReplicasException e) {
chooseRandom(numOfReplicas - (results.size() - oldNumOfReplicas), chooseRandom(numOfReplicas - (results.size() - oldNumOfReplicas),
localMachine.getNetworkLocation(), excludedNodes, blocksize, localMachine.getNetworkLocation(), excludedNodes, blocksize,
maxReplicasPerRack, results); maxReplicasPerRack, results, avoidStaleNodes);
} }
} }
@ -193,19 +189,22 @@ public class BlockPlacementPolicyWithNodeGroup extends BlockPlacementPolicyDefau
* if still no such node is available, choose a random node in the cluster. * if still no such node is available, choose a random node in the cluster.
* @return the chosen node * @return the chosen node
*/ */
private DatanodeDescriptor chooseLocalNodeGroup(NetworkTopologyWithNodeGroup clusterMap, private DatanodeDescriptor chooseLocalNodeGroup(
DatanodeDescriptor localMachine, HashMap<Node, Node> excludedNodes, long blocksize, NetworkTopologyWithNodeGroup clusterMap, DatanodeDescriptor localMachine,
int maxNodesPerRack, List<DatanodeDescriptor> results) throws NotEnoughReplicasException { HashMap<Node, Node> excludedNodes, long blocksize, int maxNodesPerRack,
List<DatanodeDescriptor> results, boolean avoidStaleNodes)
throws NotEnoughReplicasException {
// no local machine, so choose a random machine // no local machine, so choose a random machine
if (localMachine == null) { if (localMachine == null) {
return chooseRandom(NodeBase.ROOT, excludedNodes, return chooseRandom(NodeBase.ROOT, excludedNodes,
blocksize, maxNodesPerRack, results); blocksize, maxNodesPerRack, results, avoidStaleNodes);
} }
// choose one from the local node group // choose one from the local node group
try { try {
return chooseRandom(clusterMap.getNodeGroup(localMachine.getNetworkLocation()), return chooseRandom(
excludedNodes, blocksize, maxNodesPerRack, results); clusterMap.getNodeGroup(localMachine.getNetworkLocation()),
excludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes);
} catch (NotEnoughReplicasException e1) { } catch (NotEnoughReplicasException e1) {
// find the second replica // find the second replica
DatanodeDescriptor newLocal=null; DatanodeDescriptor newLocal=null;
@ -219,17 +218,19 @@ public class BlockPlacementPolicyWithNodeGroup extends BlockPlacementPolicyDefau
} }
if (newLocal != null) { if (newLocal != null) {
try { try {
return chooseRandom(clusterMap.getNodeGroup(newLocal.getNetworkLocation()), return chooseRandom(
excludedNodes, blocksize, maxNodesPerRack, results); clusterMap.getNodeGroup(newLocal.getNetworkLocation()),
excludedNodes, blocksize, maxNodesPerRack, results,
avoidStaleNodes);
} catch(NotEnoughReplicasException e2) { } catch(NotEnoughReplicasException e2) {
//otherwise randomly choose one from the network //otherwise randomly choose one from the network
return chooseRandom(NodeBase.ROOT, excludedNodes, return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
blocksize, maxNodesPerRack, results); maxNodesPerRack, results, avoidStaleNodes);
} }
} else { } else {
//otherwise randomly choose one from the network //otherwise randomly choose one from the network
return chooseRandom(NodeBase.ROOT, excludedNodes, return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
blocksize, maxNodesPerRack, results); maxNodesPerRack, results, avoidStaleNodes);
} }
} }
} }

View File

@ -76,6 +76,7 @@ import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.Time; import org.apache.hadoop.util.Time;
import com.google.common.annotations.VisibleForTesting; import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.net.InetAddresses; import com.google.common.net.InetAddresses;
/** /**
@ -88,8 +89,8 @@ public class DatanodeManager {
private final Namesystem namesystem; private final Namesystem namesystem;
private final BlockManager blockManager; private final BlockManager blockManager;
private final HeartbeatManager heartbeatManager; private final HeartbeatManager heartbeatManager;
private Daemon decommissionthread = null;
/** /**
* Stores the datanode -> block map. * Stores the datanode -> block map.
@ -127,28 +128,33 @@ public class DatanodeManager {
/** Ask Datanode only up to this many blocks to delete. */ /** Ask Datanode only up to this many blocks to delete. */
final int blockInvalidateLimit; final int blockInvalidateLimit;
/** Whether or not to check stale DataNodes for read/write */
private final boolean checkForStaleDataNodes;
/** The interval for judging stale DataNodes for read/write */
private final long staleInterval;
/** Whether or not to avoid using stale DataNodes for writing */
private volatile boolean avoidStaleDataNodesForWrite;
/** The number of stale DataNodes */
private volatile int numStaleNodes;
/** /**
* Whether or not this cluster has ever consisted of more than 1 rack, * Whether or not this cluster has ever consisted of more than 1 rack,
* according to the NetworkTopology. * according to the NetworkTopology.
*/ */
private boolean hasClusterEverBeenMultiRack = false; private boolean hasClusterEverBeenMultiRack = false;
/** Whether or not to check the stale datanodes */ DatanodeManager(final BlockManager blockManager, final Namesystem namesystem,
private volatile boolean checkForStaleNodes; final Configuration conf) throws IOException {
/** The time interval for detecting stale datanodes */
private volatile long staleInterval;
DatanodeManager(final BlockManager blockManager,
final Namesystem namesystem, final Configuration conf
) throws IOException {
this.namesystem = namesystem; this.namesystem = namesystem;
this.blockManager = blockManager; this.blockManager = blockManager;
Class<? extends NetworkTopology> networkTopologyClass = Class<? extends NetworkTopology> networkTopologyClass =
conf.getClass(CommonConfigurationKeysPublic.NET_TOPOLOGY_IMPL_KEY, conf.getClass(CommonConfigurationKeysPublic.NET_TOPOLOGY_IMPL_KEY,
NetworkTopology.class, NetworkTopology.class); NetworkTopology.class, NetworkTopology.class);
networktopology = (NetworkTopology) ReflectionUtils.newInstance( networktopology = ReflectionUtils.newInstance(networkTopologyClass, conf);
networkTopologyClass, conf);
this.heartbeatManager = new HeartbeatManager(namesystem, blockManager, conf); this.heartbeatManager = new HeartbeatManager(namesystem, blockManager, conf);
@ -181,24 +187,68 @@ public class DatanodeManager {
DFSConfigKeys.DFS_BLOCK_INVALIDATE_LIMIT_KEY, blockInvalidateLimit); DFSConfigKeys.DFS_BLOCK_INVALIDATE_LIMIT_KEY, blockInvalidateLimit);
LOG.info(DFSConfigKeys.DFS_BLOCK_INVALIDATE_LIMIT_KEY LOG.info(DFSConfigKeys.DFS_BLOCK_INVALIDATE_LIMIT_KEY
+ "=" + this.blockInvalidateLimit); + "=" + this.blockInvalidateLimit);
// set the value of stale interval based on configuration
this.checkForStaleNodes = conf.getBoolean( checkForStaleDataNodes = conf.getBoolean(
DFSConfigKeys.DFS_NAMENODE_CHECK_STALE_DATANODE_KEY, DFSConfigKeys.DFS_NAMENODE_CHECK_STALE_DATANODE_KEY,
DFSConfigKeys.DFS_NAMENODE_CHECK_STALE_DATANODE_DEFAULT); DFSConfigKeys.DFS_NAMENODE_CHECK_STALE_DATANODE_DEFAULT);
if (this.checkForStaleNodes) {
this.staleInterval = conf.getLong( staleInterval = getStaleIntervalFromConf(conf, heartbeatExpireInterval);
DFSConfigKeys.DFS_NAMENODE_STALE_DATANODE_INTERVAL_KEY, avoidStaleDataNodesForWrite = getAvoidStaleForWriteFromConf(conf,
DFSConfigKeys.DFS_NAMENODE_STALE_DATANODE_INTERVAL_MILLI_DEFAULT); checkForStaleDataNodes);
if (this.staleInterval < DFSConfigKeys.DFS_NAMENODE_STALE_DATANODE_INTERVAL_MILLI_DEFAULT) {
LOG.warn("The given interval for marking stale datanode = "
+ this.staleInterval + ", which is smaller than the default value "
+ DFSConfigKeys.DFS_NAMENODE_STALE_DATANODE_INTERVAL_MILLI_DEFAULT
+ ".");
}
}
} }
private Daemon decommissionthread = null; private static long getStaleIntervalFromConf(Configuration conf,
long heartbeatExpireInterval) {
long staleInterval = conf.getLong(
DFSConfigKeys.DFS_NAMENODE_STALE_DATANODE_INTERVAL_KEY,
DFSConfigKeys.DFS_NAMENODE_STALE_DATANODE_INTERVAL_DEFAULT);
Preconditions.checkArgument(staleInterval > 0,
DFSConfigKeys.DFS_NAMENODE_STALE_DATANODE_INTERVAL_KEY +
" = '" + staleInterval + "' is invalid. " +
"It should be a positive non-zero value.");
final long heartbeatIntervalSeconds = conf.getLong(
DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY,
DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_DEFAULT);
// The stale interval value cannot be smaller than
// 3 times of heartbeat interval
final long minStaleInterval = conf.getInt(
DFSConfigKeys.DFS_NAMENODE_STALE_DATANODE_MINIMUM_INTERVAL_KEY,
DFSConfigKeys.DFS_NAMENODE_STALE_DATANODE_MINIMUM_INTERVAL_DEFAULT)
* heartbeatIntervalSeconds * 1000;
if (staleInterval < minStaleInterval) {
LOG.warn("The given interval for marking stale datanode = "
+ staleInterval + ", which is less than "
+ DFSConfigKeys.DFS_NAMENODE_STALE_DATANODE_MINIMUM_INTERVAL_DEFAULT
+ " heartbeat intervals. This may cause too frequent changes of "
+ "stale states of DataNodes since a heartbeat msg may be missing "
+ "due to temporary short-term failures. Reset stale interval to "
+ minStaleInterval + ".");
staleInterval = minStaleInterval;
}
if (staleInterval > heartbeatExpireInterval) {
LOG.warn("The given interval for marking stale datanode = "
+ staleInterval + ", which is larger than heartbeat expire interval "
+ heartbeatExpireInterval + ".");
}
return staleInterval;
}
static boolean getAvoidStaleForWriteFromConf(Configuration conf,
boolean checkForStale) {
boolean avoid = conf.getBoolean(
DFSConfigKeys.DFS_NAMENODE_AVOID_STALE_DATANODE_FOR_WRITE_KEY,
DFSConfigKeys.DFS_NAMENODE_AVOID_STALE_DATANODE_FOR_WRITE_DEFAULT);
boolean avoidStaleDataNodesForWrite = checkForStale && avoid;
if (!checkForStale && avoid) {
LOG.warn("Cannot set "
+ DFSConfigKeys.DFS_NAMENODE_CHECK_STALE_DATANODE_KEY
+ " as false while setting "
+ DFSConfigKeys.DFS_NAMENODE_AVOID_STALE_DATANODE_FOR_WRITE_KEY
+ " as true.");
}
return avoidStaleDataNodesForWrite;
}
void activate(final Configuration conf) { void activate(final Configuration conf) {
final DecommissionManager dm = new DecommissionManager(namesystem, blockManager); final DecommissionManager dm = new DecommissionManager(namesystem, blockManager);
@ -253,9 +303,10 @@ public class DatanodeManager {
client = new NodeBase(rName + NodeBase.PATH_SEPARATOR_STR + targethost); client = new NodeBase(rName + NodeBase.PATH_SEPARATOR_STR + targethost);
} }
Comparator<DatanodeInfo> comparator = checkForStaleNodes ? Comparator<DatanodeInfo> comparator = checkForStaleDataNodes ?
new DFSUtil.DecomStaleComparator(staleInterval) : new DFSUtil.DecomStaleComparator(staleInterval) :
DFSUtil.DECOM_COMPARATOR; DFSUtil.DECOM_COMPARATOR;
for (LocatedBlock b : locatedblocks) { for (LocatedBlock b : locatedblocks) {
networktopology.pseudoSortByDistance(client, b.getLocations()); networktopology.pseudoSortByDistance(client, b.getLocations());
// Move decommissioned/stale datanodes to the bottom // Move decommissioned/stale datanodes to the bottom
@ -723,7 +774,7 @@ public class DatanodeManager {
* 3. Added to exclude --> start decommission. * 3. Added to exclude --> start decommission.
* 4. Removed from exclude --> stop decommission. * 4. Removed from exclude --> stop decommission.
*/ */
private void refreshDatanodes() throws IOException { private void refreshDatanodes() {
for(DatanodeDescriptor node : datanodeMap.values()) { for(DatanodeDescriptor node : datanodeMap.values()) {
// Check if not include. // Check if not include.
if (!inHostsList(node)) { if (!inHostsList(node)) {
@ -783,6 +834,60 @@ public class DatanodeManager {
} }
} }
/* Getter and Setter for stale DataNodes related attributes */
/**
* @return whether or not to avoid writing to stale datanodes
*/
public boolean isAvoidingStaleDataNodesForWrite() {
return avoidStaleDataNodesForWrite;
}
/**
* Set the value of {@link DatanodeManager#avoidStaleDataNodesForWrite}.
* The HeartbeatManager disable avoidStaleDataNodesForWrite when more than
* half of the DataNodes are marked as stale.
*
* @param avoidStaleDataNodesForWrite
* The value to set to
* {@link DatanodeManager#avoidStaleDataNodesForWrite}
*/
void setAvoidStaleDataNodesForWrite(boolean avoidStaleDataNodesForWrite) {
this.avoidStaleDataNodesForWrite = avoidStaleDataNodesForWrite;
}
/**
* @return Whether or not to check stale DataNodes for R/W
*/
boolean isCheckingForStaleDataNodes() {
return checkForStaleDataNodes;
}
/**
* @return The time interval used to mark DataNodes as stale.
*/
long getStaleInterval() {
return staleInterval;
}
/**
* Set the number of current stale DataNodes. The HeartbeatManager got this
* number based on DataNodes' heartbeats.
*
* @param numStaleNodes
* The number of stale DataNodes to be set.
*/
void setNumStaleNodes(int numStaleNodes) {
this.numStaleNodes = numStaleNodes;
}
/**
* @return Return the current number of stale DataNodes (detected by
* HeartbeatManager).
*/
int getNumStaleNodes() {
return this.numStaleNodes;
}
/** Fetch live and dead datanodes. */ /** Fetch live and dead datanodes. */
public void fetchDatanodes(final List<DatanodeDescriptor> live, public void fetchDatanodes(final List<DatanodeDescriptor> live,
@ -961,7 +1066,7 @@ public class DatanodeManager {
return nodes; return nodes;
} }
private void setDatanodeDead(DatanodeDescriptor node) throws IOException { private void setDatanodeDead(DatanodeDescriptor node) {
node.setLastUpdate(0); node.setLastUpdate(0);
} }

View File

@ -30,6 +30,8 @@ import org.apache.hadoop.hdfs.server.namenode.Namesystem;
import org.apache.hadoop.util.Daemon; import org.apache.hadoop.util.Daemon;
import org.apache.hadoop.util.Time; import org.apache.hadoop.util.Time;
import com.google.common.base.Preconditions;
/** /**
* Manage the heartbeats received from datanodes. * Manage the heartbeats received from datanodes.
* The datanode list and statistics are synchronized * The datanode list and statistics are synchronized
@ -54,18 +56,48 @@ class HeartbeatManager implements DatanodeStatistics {
private final long heartbeatRecheckInterval; private final long heartbeatRecheckInterval;
/** Heartbeat monitor thread */ /** Heartbeat monitor thread */
private final Daemon heartbeatThread = new Daemon(new Monitor()); private final Daemon heartbeatThread = new Daemon(new Monitor());
/**
* The initial setting of end user which indicates whether or not to avoid
* writing to stale datanodes.
*/
private final boolean initialAvoidWriteStaleNodes;
/**
* When the ratio of stale datanodes reaches this number, stop avoiding
* writing to stale datanodes, i.e., continue using stale nodes for writing.
*/
private final float ratioUseStaleDataNodesForWrite;
final Namesystem namesystem; final Namesystem namesystem;
final BlockManager blockManager; final BlockManager blockManager;
HeartbeatManager(final Namesystem namesystem, final BlockManager blockManager, HeartbeatManager(final Namesystem namesystem,
final Configuration conf) { final BlockManager blockManager, final Configuration conf) {
this.heartbeatRecheckInterval = conf.getInt(
DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_KEY,
DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_DEFAULT); // 5 minutes
this.namesystem = namesystem; this.namesystem = namesystem;
this.blockManager = blockManager; this.blockManager = blockManager;
boolean checkStaleNodes = conf.getBoolean(
DFSConfigKeys.DFS_NAMENODE_CHECK_STALE_DATANODE_KEY,
DFSConfigKeys.DFS_NAMENODE_CHECK_STALE_DATANODE_DEFAULT);
long recheckInterval = conf.getInt(
DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_KEY,
DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_DEFAULT); // 5 min
long staleInterval = conf.getLong(
DFSConfigKeys.DFS_NAMENODE_STALE_DATANODE_INTERVAL_KEY,
DFSConfigKeys.DFS_NAMENODE_STALE_DATANODE_INTERVAL_DEFAULT);// 30s
this.initialAvoidWriteStaleNodes = DatanodeManager
.getAvoidStaleForWriteFromConf(conf, checkStaleNodes);
this.ratioUseStaleDataNodesForWrite = conf.getFloat(
DFSConfigKeys.DFS_NAMENODE_USE_STALE_DATANODE_FOR_WRITE_RATIO_KEY,
DFSConfigKeys.DFS_NAMENODE_USE_STALE_DATANODE_FOR_WRITE_RATIO_DEFAULT);
Preconditions.checkArgument(
(ratioUseStaleDataNodesForWrite > 0 &&
ratioUseStaleDataNodesForWrite <= 1.0f),
DFSConfigKeys.DFS_NAMENODE_USE_STALE_DATANODE_FOR_WRITE_RATIO_KEY +
" = '" + ratioUseStaleDataNodesForWrite + "' is invalid. " +
"It should be a positive non-zero float value, not greater than 1.0f.");
this.heartbeatRecheckInterval = (checkStaleNodes
&& initialAvoidWriteStaleNodes
&& staleInterval < recheckInterval) ? staleInterval : recheckInterval;
} }
void activate(Configuration conf) { void activate(Configuration conf) {
@ -210,18 +242,41 @@ class HeartbeatManager implements DatanodeStatistics {
if (namesystem.isInSafeMode()) { if (namesystem.isInSafeMode()) {
return; return;
} }
boolean checkStaleNodes = dm.isCheckingForStaleDataNodes();
boolean allAlive = false; boolean allAlive = false;
while (!allAlive) { while (!allAlive) {
// locate the first dead node. // locate the first dead node.
DatanodeID dead = null; DatanodeID dead = null;
// check the number of stale nodes
int numOfStaleNodes = 0;
synchronized(this) { synchronized(this) {
for (DatanodeDescriptor d : datanodes) { for (DatanodeDescriptor d : datanodes) {
if (dm.isDatanodeDead(d)) { if (dead == null && dm.isDatanodeDead(d)) {
stats.incrExpiredHeartbeats(); stats.incrExpiredHeartbeats();
dead = d; dead = d;
if (!checkStaleNodes) {
break; break;
} }
} }
if (checkStaleNodes &&
d.isStale(dm.getStaleInterval())) {
numOfStaleNodes++;
}
}
// Change whether to avoid using stale datanodes for writing
// based on proportion of stale datanodes
if (checkStaleNodes) {
dm.setNumStaleNodes(numOfStaleNodes);
if (numOfStaleNodes >
datanodes.size() * ratioUseStaleDataNodesForWrite) {
dm.setAvoidStaleDataNodesForWrite(false);
} else {
if (this.initialAvoidWriteStaleNodes) {
dm.setAvoidStaleDataNodesForWrite(true);
}
}
}
} }
allAlive = dead == null; allAlive = dead == null;

View File

@ -251,7 +251,7 @@ public class DataNode extends Configured
Daemon dataXceiverServer = null; Daemon dataXceiverServer = null;
ThreadGroup threadGroup = null; ThreadGroup threadGroup = null;
private DNConf dnConf; private DNConf dnConf;
private boolean heartbeatsDisabledForTests = false; private volatile boolean heartbeatsDisabledForTests = false;
private DataStorage storage = null; private DataStorage storage = null;
private HttpServer infoServer = null; private HttpServer infoServer = null;
DataNodeMetrics metrics; DataNodeMetrics metrics;

View File

@ -32,8 +32,16 @@ public interface FSClusterStats {
* @return a count of the total number of block transfers and block * @return a count of the total number of block transfers and block
* writes that are currently occuring on the cluster. * writes that are currently occuring on the cluster.
*/ */
public int getTotalLoad(); public int getTotalLoad();
/**
* Indicate whether or not the cluster is now avoiding
* to use stale DataNodes for writing.
*
* @return True if the cluster is currently avoiding using stale DataNodes
* for writing targets, and false otherwise.
*/
public boolean isAvoidingStaleDataNodesForWrite();
} }

View File

@ -5539,4 +5539,10 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
public void setNNResourceChecker(NameNodeResourceChecker nnResourceChecker) { public void setNNResourceChecker(NameNodeResourceChecker nnResourceChecker) {
this.nnResourceChecker = nnResourceChecker; this.nnResourceChecker = nnResourceChecker;
} }
@Override
public boolean isAvoidingStaleDataNodesForWrite() {
return this.blockManager.getDatanodeManager()
.isAvoidingStaleDataNodesForWrite();
}
} }

View File

@ -991,9 +991,25 @@
Indicate whether or not to check "stale" datanodes whose Indicate whether or not to check "stale" datanodes whose
heartbeat messages have not been received by the namenode heartbeat messages have not been received by the namenode
for more than a specified time interval. If this configuration for more than a specified time interval. If this configuration
parameter is set as true, the stale datanodes will be moved to parameter is set as true, the system will keep track
the end of the target node list for reading. The writing will of the number of stale datanodes. The stale datanodes will be
also try to avoid stale nodes. moved to the end of the node list returned for reading. See
dfs.namenode.avoid.write.stale.datanode for details on how this
affects writes.
</description>
</property>
<property>
<name>dfs.namenode.avoid.write.stale.datanode</name>
<value>false</value>
<description>
Indicate whether or not to avoid writing to "stale" datanodes whose
heartbeat messages have not been received by the namenode
for more than a specified time interval. If this configuration
parameter and dfs.namenode.check.stale.datanode are both set as true,
the writing will avoid using stale datanodes unless a high number
of datanodes are marked as stale. See
dfs.namenode.write.stale.datanode.ratio for details.
</description> </description>
</property> </property>
@ -1004,7 +1020,21 @@
Default time interval for marking a datanode as "stale", i.e., if Default time interval for marking a datanode as "stale", i.e., if
the namenode has not received heartbeat msg from a datanode for the namenode has not received heartbeat msg from a datanode for
more than this time interval, the datanode will be marked and treated more than this time interval, the datanode will be marked and treated
as "stale" by default. as "stale" by default. The stale interval cannot be too small since
otherwise this may cause too frequent change of stale states.
We thus set a minimum stale interval value (the default value is 3 times
of heartbeat interval) and guarantee that the stale interval cannot be less
than the minimum value.
</description>
</property>
<property>
<name>dfs.namenode.write.stale.datanode.ratio</name>
<value>0.5f</value>
<description>
When the ratio of number stale datanodes to total datanodes marked
is greater than this ratio, stop avoiding writing to stale nodes so
as to prevent causing hotspots.
</description> </description>
</property> </property>

View File

@ -38,9 +38,12 @@ import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.MiniDFSCluster; import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.protocol.Block; import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.HdfsConstants; import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.server.datanode.DataNode;
import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils;
import org.apache.hadoop.hdfs.server.namenode.NameNode; import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.net.NetworkTopology; import org.apache.hadoop.net.NetworkTopology;
import org.apache.hadoop.net.Node; import org.apache.hadoop.net.Node;
import org.apache.hadoop.util.Time;
import org.junit.BeforeClass; import org.junit.BeforeClass;
import org.junit.Rule; import org.junit.Rule;
import org.junit.Test; import org.junit.Test;
@ -55,6 +58,9 @@ public class TestReplicationPolicy {
private static BlockPlacementPolicy replicator; private static BlockPlacementPolicy replicator;
private static final String filename = "/dummyfile.txt"; private static final String filename = "/dummyfile.txt";
private static DatanodeDescriptor dataNodes[]; private static DatanodeDescriptor dataNodes[];
// The interval for marking a datanode as stale,
private static long staleInterval =
DFSConfigKeys.DFS_NAMENODE_STALE_DATANODE_INTERVAL_DEFAULT;
@Rule @Rule
public ExpectedException exception = ExpectedException.none(); public ExpectedException exception = ExpectedException.none();
@ -77,6 +83,8 @@ public class TestReplicationPolicy {
"test.build.data", "build/test/data"), "dfs/"); "test.build.data", "build/test/data"), "dfs/");
conf.set(DFSConfigKeys.DFS_NAMENODE_NAME_DIR_KEY, conf.set(DFSConfigKeys.DFS_NAMENODE_NAME_DIR_KEY,
new File(baseDir, "name").getPath()); new File(baseDir, "name").getPath());
// Enable the checking for stale datanodes in the beginning
conf.setBoolean(DFSConfigKeys.DFS_NAMENODE_CHECK_STALE_DATANODE_KEY, true);
DFSTestUtil.formatNameNode(conf); DFSTestUtil.formatNameNode(conf);
namenode = new NameNode(conf); namenode = new NameNode(conf);
@ -370,6 +378,202 @@ public class TestReplicationPolicy {
assertFalse(cluster.isOnSameRack(targets[0], targets[1])); assertFalse(cluster.isOnSameRack(targets[0], targets[1]));
} }
private boolean containsWithinRange(DatanodeDescriptor target,
DatanodeDescriptor[] nodes, int startIndex, int endIndex) {
assert startIndex >= 0 && startIndex < nodes.length;
assert endIndex >= startIndex && endIndex < nodes.length;
for (int i = startIndex; i <= endIndex; i++) {
if (nodes[i].equals(target)) {
return true;
}
}
return false;
}
@Test
public void testChooseTargetWithStaleNodes() throws Exception {
// Enable avoidng writing to stale datanodes
namenode.getNamesystem().getBlockManager().getDatanodeManager()
.setAvoidStaleDataNodesForWrite(true);
// Set dataNodes[0] as stale
dataNodes[0].setLastUpdate(Time.now() - staleInterval - 1);
DatanodeDescriptor[] targets;
// We set the datanode[0] as stale, thus should choose datanode[1] since
// datanode[1] is on the same rack with datanode[0] (writer)
targets = replicator.chooseTarget(filename, 1, dataNodes[0],
new ArrayList<DatanodeDescriptor>(), BLOCK_SIZE);
assertEquals(targets.length, 1);
assertEquals(targets[0], dataNodes[1]);
HashMap<Node, Node> excludedNodes = new HashMap<Node, Node>();
excludedNodes.put(dataNodes[1], dataNodes[1]);
List<DatanodeDescriptor> chosenNodes = new ArrayList<DatanodeDescriptor>();
BlockPlacementPolicyDefault repl = (BlockPlacementPolicyDefault)replicator;
targets = chooseTarget(repl, 1, dataNodes[0], chosenNodes, excludedNodes,
BLOCK_SIZE);
assertEquals(targets.length, 1);
assertFalse(cluster.isOnSameRack(targets[0], dataNodes[0]));
// reset
namenode.getNamesystem().getBlockManager().getDatanodeManager()
.setAvoidStaleDataNodesForWrite(false);
dataNodes[0].setLastUpdate(Time.now());
}
/**
* In this testcase, we set 3 nodes (dataNodes[0] ~ dataNodes[2]) as stale,
* and when the number of replicas is less or equal to 3, all the healthy
* datanodes should be returned by the chooseTarget method. When the number
* of replicas is 4, a stale node should be included.
*
* @throws Exception
*/
@Test
public void testChooseTargetWithHalfStaleNodes() throws Exception {
// Enable stale datanodes checking
namenode.getNamesystem().getBlockManager().getDatanodeManager()
.setAvoidStaleDataNodesForWrite(true);
// Set dataNodes[0], dataNodes[1], and dataNodes[2] as stale
for (int i = 0; i < 3; i++) {
dataNodes[i].setLastUpdate(Time.now() - staleInterval - 1);
}
DatanodeDescriptor[] targets;
targets = replicator.chooseTarget(filename, 0, dataNodes[0],
new ArrayList<DatanodeDescriptor>(), BLOCK_SIZE);
assertEquals(targets.length, 0);
// We set the datanode[0] as stale, thus should choose datanode[1]
targets = replicator.chooseTarget(filename, 1, dataNodes[0],
new ArrayList<DatanodeDescriptor>(), BLOCK_SIZE);
assertEquals(targets.length, 1);
assertFalse(containsWithinRange(targets[0], dataNodes, 0, 2));
targets = replicator.chooseTarget(filename, 2, dataNodes[0],
new ArrayList<DatanodeDescriptor>(), BLOCK_SIZE);
assertEquals(targets.length, 2);
assertFalse(containsWithinRange(targets[0], dataNodes, 0, 2));
assertFalse(containsWithinRange(targets[1], dataNodes, 0, 2));
targets = replicator.chooseTarget(filename, 3, dataNodes[0],
new ArrayList<DatanodeDescriptor>(), BLOCK_SIZE);
assertEquals(targets.length, 3);
assertTrue(containsWithinRange(targets[0], dataNodes, 3, 5));
assertTrue(containsWithinRange(targets[1], dataNodes, 3, 5));
assertTrue(containsWithinRange(targets[2], dataNodes, 3, 5));
targets = replicator.chooseTarget(filename, 4, dataNodes[0],
new ArrayList<DatanodeDescriptor>(), BLOCK_SIZE);
assertEquals(targets.length, 4);
assertTrue(containsWithinRange(dataNodes[3], targets, 0, 3));
assertTrue(containsWithinRange(dataNodes[4], targets, 0, 3));
assertTrue(containsWithinRange(dataNodes[5], targets, 0, 3));
// reset
namenode.getNamesystem().getBlockManager().getDatanodeManager()
.setAvoidStaleDataNodesForWrite(false);
for (int i = 0; i < dataNodes.length; i++) {
dataNodes[i].setLastUpdate(Time.now());
}
}
@Test
public void testChooseTargetWithMoreThanHalfStaleNodes() throws Exception {
HdfsConfiguration conf = new HdfsConfiguration();
conf.setBoolean(DFSConfigKeys.DFS_NAMENODE_CHECK_STALE_DATANODE_KEY, true);
conf.setBoolean(
DFSConfigKeys.DFS_NAMENODE_AVOID_STALE_DATANODE_FOR_WRITE_KEY, true);
String[] hosts = new String[]{"host1", "host2", "host3",
"host4", "host5", "host6"};
String[] racks = new String[]{"/d1/r1", "/d1/r1", "/d1/r2",
"/d1/r2", "/d2/r3", "/d2/r3"};
MiniDFSCluster miniCluster = new MiniDFSCluster.Builder(conf).racks(racks)
.hosts(hosts).numDataNodes(hosts.length).build();
miniCluster.waitActive();
try {
// Step 1. Make two datanodes as stale, check whether the
// avoidStaleDataNodesForWrite calculation is correct.
// First stop the heartbeat of host1 and host2
for (int i = 0; i < 2; i++) {
DataNode dn = miniCluster.getDataNodes().get(i);
DataNodeTestUtils.setHeartbeatsDisabledForTests(dn, true);
miniCluster.getNameNode().getNamesystem().getBlockManager()
.getDatanodeManager().getDatanode(dn.getDatanodeId())
.setLastUpdate(Time.now() - staleInterval - 1);
}
// Instead of waiting, explicitly call heartbeatCheck to
// let heartbeat manager to detect stale nodes
miniCluster.getNameNode().getNamesystem().getBlockManager()
.getDatanodeManager().getHeartbeatManager().heartbeatCheck();
int numStaleNodes = miniCluster.getNameNode().getNamesystem()
.getBlockManager().getDatanodeManager().getNumStaleNodes();
assertEquals(numStaleNodes, 2);
assertTrue(miniCluster.getNameNode().getNamesystem().getBlockManager()
.getDatanodeManager().isAvoidingStaleDataNodesForWrite());
// Call chooseTarget
DatanodeDescriptor staleNodeInfo = miniCluster.getNameNode()
.getNamesystem().getBlockManager().getDatanodeManager()
.getDatanode(miniCluster.getDataNodes().get(0).getDatanodeId());
BlockPlacementPolicy replicator = miniCluster.getNameNode()
.getNamesystem().getBlockManager().getBlockPlacementPolicy();
DatanodeDescriptor[] targets = replicator.chooseTarget(filename, 3,
staleNodeInfo, new ArrayList<DatanodeDescriptor>(), BLOCK_SIZE);
assertEquals(targets.length, 3);
assertFalse(cluster.isOnSameRack(targets[0], staleNodeInfo));
// Step 2. Set more than half of the datanodes as stale
for (int i = 0; i < 4; i++) {
DataNode dn = miniCluster.getDataNodes().get(i);
DataNodeTestUtils.setHeartbeatsDisabledForTests(dn, true);
miniCluster.getNameNode().getNamesystem().getBlockManager()
.getDatanodeManager().getDatanode(dn.getDatanodeId())
.setLastUpdate(Time.now() - staleInterval - 1);
}
// Explicitly call heartbeatCheck
miniCluster.getNameNode().getNamesystem().getBlockManager()
.getDatanodeManager().getHeartbeatManager().heartbeatCheck();
numStaleNodes = miniCluster.getNameNode().getNamesystem()
.getBlockManager().getDatanodeManager().getNumStaleNodes();
assertEquals(numStaleNodes, 4);
// According to our strategy, stale datanodes will be included for writing
// to avoid hotspots
assertFalse(miniCluster.getNameNode().getNamesystem().getBlockManager()
.getDatanodeManager().isAvoidingStaleDataNodesForWrite());
// Call chooseTarget
targets = replicator.chooseTarget(filename, 3,
staleNodeInfo, new ArrayList<DatanodeDescriptor>(), BLOCK_SIZE);
assertEquals(targets.length, 3);
assertTrue(cluster.isOnSameRack(targets[0], staleNodeInfo));
// Step 3. Set 2 stale datanodes back to healthy nodes,
// still have 2 stale nodes
for (int i = 2; i < 4; i++) {
DataNode dn = miniCluster.getDataNodes().get(i);
DataNodeTestUtils.setHeartbeatsDisabledForTests(dn, false);
miniCluster.getNameNode().getNamesystem().getBlockManager()
.getDatanodeManager().getDatanode(dn.getDatanodeId())
.setLastUpdate(Time.now());
}
// Explicitly call heartbeatCheck
miniCluster.getNameNode().getNamesystem().getBlockManager()
.getDatanodeManager().getHeartbeatManager().heartbeatCheck();
numStaleNodes = miniCluster.getNameNode().getNamesystem()
.getBlockManager().getDatanodeManager().getNumStaleNodes();
assertEquals(numStaleNodes, 2);
assertTrue(miniCluster.getNameNode().getNamesystem().getBlockManager()
.getDatanodeManager().isAvoidingStaleDataNodesForWrite());
// Call chooseTarget
targets = replicator.chooseTarget(filename, 3,
staleNodeInfo, new ArrayList<DatanodeDescriptor>(), BLOCK_SIZE);
assertEquals(targets.length, 3);
assertFalse(cluster.isOnSameRack(targets[0], staleNodeInfo));
} finally {
miniCluster.shutdown();
}
}
/** /**
* This testcase tests re-replication, when dataNodes[0] is already chosen. * This testcase tests re-replication, when dataNodes[0] is already chosen.
* So the 1st replica can be placed on random rack. * So the 1st replica can be placed on random rack.
@ -490,8 +694,8 @@ public class TestReplicationPolicy {
.format(true).build(); .format(true).build();
try { try {
cluster.waitActive(); cluster.waitActive();
final UnderReplicatedBlocks neededReplications = (UnderReplicatedBlocks) cluster final UnderReplicatedBlocks neededReplications = cluster.getNameNode()
.getNameNode().getNamesystem().getBlockManager().neededReplications; .getNamesystem().getBlockManager().neededReplications;
for (int i = 0; i < 100; i++) { for (int i = 0; i < 100; i++) {
// Adding the blocks directly to normal priority // Adding the blocks directly to normal priority
neededReplications.add(new Block(random.nextLong()), 2, 0, 3); neededReplications.add(new Block(random.nextLong()), 2, 0, 3);
@ -529,10 +733,10 @@ public class TestReplicationPolicy {
// Adding QUEUE_VERY_UNDER_REPLICATED block // Adding QUEUE_VERY_UNDER_REPLICATED block
underReplicatedBlocks.add(new Block(random.nextLong()), 2, 0, 7); underReplicatedBlocks.add(new Block(random.nextLong()), 2, 0, 7);
// Adding QUEUE_UNDER_REPLICATED block // Adding QUEUE_REPLICAS_BADLY_DISTRIBUTED block
underReplicatedBlocks.add(new Block(random.nextLong()), 6, 0, 6); underReplicatedBlocks.add(new Block(random.nextLong()), 6, 0, 6);
// Adding QUEUE_REPLICAS_BADLY_DISTRIBUTED block // Adding QUEUE_UNDER_REPLICATED block
underReplicatedBlocks.add(new Block(random.nextLong()), 5, 0, 6); underReplicatedBlocks.add(new Block(random.nextLong()), 5, 0, 6);
// Adding QUEUE_WITH_CORRUPT_BLOCKS block // Adding QUEUE_WITH_CORRUPT_BLOCKS block
@ -618,6 +822,11 @@ public class TestReplicationPolicy {
dataNodes[5].setRemaining(1*1024*1024); dataNodes[5].setRemaining(1*1024*1024);
replicaNodeList.add(dataNodes[5]); replicaNodeList.add(dataNodes[5]);
// Refresh the last update time for all the datanodes
for (int i = 0; i < dataNodes.length; i++) {
dataNodes[i].setLastUpdate(Time.now());
}
List<DatanodeDescriptor> first = new ArrayList<DatanodeDescriptor>(); List<DatanodeDescriptor> first = new ArrayList<DatanodeDescriptor>();
List<DatanodeDescriptor> second = new ArrayList<DatanodeDescriptor>(); List<DatanodeDescriptor> second = new ArrayList<DatanodeDescriptor>();
replicator.splitNodesWithRack( replicator.splitNodesWithRack(