HDFS-3912. Detect and avoid stale datanodes for writes. Contributed by Jing Zhao
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1397211 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
f174adfc15
commit
2887bbb33c
|
@ -13,10 +13,6 @@ Trunk (Unreleased)
|
||||||
HDFS-3601. Add BlockPlacementPolicyWithNodeGroup to support block placement
|
HDFS-3601. Add BlockPlacementPolicyWithNodeGroup to support block placement
|
||||||
with 4-layer network topology. (Junping Du via szetszwo)
|
with 4-layer network topology. (Junping Du via szetszwo)
|
||||||
|
|
||||||
HDFS-3703. Datanodes are marked stale if heartbeat is not received in
|
|
||||||
configured timeout and are selected as the last location to read from.
|
|
||||||
(Jing Zhao via suresh)
|
|
||||||
|
|
||||||
IMPROVEMENTS
|
IMPROVEMENTS
|
||||||
|
|
||||||
HDFS-1620. Rename HdfsConstants -> HdfsServerConstants, FSConstants ->
|
HDFS-1620. Rename HdfsConstants -> HdfsServerConstants, FSConstants ->
|
||||||
|
@ -238,6 +234,9 @@ Release 2.0.3-alpha - Unreleased
|
||||||
HDFS-2656. Add libwebhdfs, a pure C client based on WebHDFS.
|
HDFS-2656. Add libwebhdfs, a pure C client based on WebHDFS.
|
||||||
(Jaimin D Jetly and Jing Zhao via szetszwo)
|
(Jaimin D Jetly and Jing Zhao via szetszwo)
|
||||||
|
|
||||||
|
HDFS-3912. Detect and avoid stale datanodes for writes.
|
||||||
|
(Jing Zhao via suresh)
|
||||||
|
|
||||||
IMPROVEMENTS
|
IMPROVEMENTS
|
||||||
|
|
||||||
HDFS-3925. Prettify PipelineAck#toString() for printing to a log
|
HDFS-3925. Prettify PipelineAck#toString() for printing to a log
|
||||||
|
@ -349,6 +348,11 @@ Release 2.0.2-alpha - 2012-09-07
|
||||||
|
|
||||||
HDFS-2793. Add an admin command to trigger an edit log roll. (todd)
|
HDFS-2793. Add an admin command to trigger an edit log roll. (todd)
|
||||||
|
|
||||||
|
HDFS-3703. Datanodes are marked stale if heartbeat is not received in
|
||||||
|
configured timeout and are selected as the last location to read from.
|
||||||
|
(Jing Zhao via suresh)
|
||||||
|
|
||||||
|
|
||||||
IMPROVEMENTS
|
IMPROVEMENTS
|
||||||
|
|
||||||
HDFS-3390. DFSAdmin should print full stack traces of errors when DEBUG
|
HDFS-3390. DFSAdmin should print full stack traces of errors when DEBUG
|
||||||
|
|
|
@ -180,9 +180,21 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
|
||||||
// Whether to enable datanode's stale state detection and usage
|
// Whether to enable datanode's stale state detection and usage
|
||||||
public static final String DFS_NAMENODE_CHECK_STALE_DATANODE_KEY = "dfs.namenode.check.stale.datanode";
|
public static final String DFS_NAMENODE_CHECK_STALE_DATANODE_KEY = "dfs.namenode.check.stale.datanode";
|
||||||
public static final boolean DFS_NAMENODE_CHECK_STALE_DATANODE_DEFAULT = false;
|
public static final boolean DFS_NAMENODE_CHECK_STALE_DATANODE_DEFAULT = false;
|
||||||
|
// Whether to enable datanode's stale state detection and usage
|
||||||
|
public static final String DFS_NAMENODE_AVOID_STALE_DATANODE_FOR_WRITE_KEY = "dfs.namenode.avoid.write.stale.datanode";
|
||||||
|
public static final boolean DFS_NAMENODE_AVOID_STALE_DATANODE_FOR_WRITE_DEFAULT = false;
|
||||||
// The default value of the time interval for marking datanodes as stale
|
// The default value of the time interval for marking datanodes as stale
|
||||||
public static final String DFS_NAMENODE_STALE_DATANODE_INTERVAL_KEY = "dfs.namenode.stale.datanode.interval";
|
public static final String DFS_NAMENODE_STALE_DATANODE_INTERVAL_KEY = "dfs.namenode.stale.datanode.interval";
|
||||||
public static final long DFS_NAMENODE_STALE_DATANODE_INTERVAL_MILLI_DEFAULT = 30 * 1000; // 30s
|
public static final long DFS_NAMENODE_STALE_DATANODE_INTERVAL_DEFAULT = 30 * 1000; // 30s
|
||||||
|
// The stale interval cannot be too small since otherwise this may cause too frequent churn on stale states.
|
||||||
|
// This value uses the times of heartbeat interval to define the minimum value for stale interval.
|
||||||
|
public static final String DFS_NAMENODE_STALE_DATANODE_MINIMUM_INTERVAL_KEY = "dfs.namenode.stale.datanode.minimum.interval";
|
||||||
|
public static final int DFS_NAMENODE_STALE_DATANODE_MINIMUM_INTERVAL_DEFAULT = 3; // i.e. min_interval is 3 * heartbeat_interval = 9s
|
||||||
|
|
||||||
|
// When the number stale datanodes marked as stale reached this certian ratio,
|
||||||
|
// stop avoiding writing to stale nodes so as to prevent causing hotspots.
|
||||||
|
public static final String DFS_NAMENODE_USE_STALE_DATANODE_FOR_WRITE_RATIO_KEY = "dfs.namenode.write.stale.datanode.ratio";
|
||||||
|
public static final float DFS_NAMENODE_USE_STALE_DATANODE_FOR_WRITE_RATIO_DEFAULT = 0.5f;
|
||||||
|
|
||||||
// Replication monitoring related keys
|
// Replication monitoring related keys
|
||||||
public static final String DFS_NAMENODE_INVALIDATE_WORK_PCT_PER_ITERATION =
|
public static final String DFS_NAMENODE_INVALIDATE_WORK_PCT_PER_ITERATION =
|
||||||
|
|
|
@ -62,6 +62,8 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
|
||||||
protected NetworkTopology clusterMap;
|
protected NetworkTopology clusterMap;
|
||||||
private FSClusterStats stats;
|
private FSClusterStats stats;
|
||||||
protected long heartbeatInterval; // interval for DataNode heartbeats
|
protected long heartbeatInterval; // interval for DataNode heartbeats
|
||||||
|
private long staleInterval; // interval used to identify stale DataNodes
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A miss of that many heartbeats is tolerated for replica deletion policy.
|
* A miss of that many heartbeats is tolerated for replica deletion policy.
|
||||||
*/
|
*/
|
||||||
|
@ -78,7 +80,8 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
|
||||||
@Override
|
@Override
|
||||||
public void initialize(Configuration conf, FSClusterStats stats,
|
public void initialize(Configuration conf, FSClusterStats stats,
|
||||||
NetworkTopology clusterMap) {
|
NetworkTopology clusterMap) {
|
||||||
this.considerLoad = conf.getBoolean(DFSConfigKeys.DFS_NAMENODE_REPLICATION_CONSIDERLOAD_KEY, true);
|
this.considerLoad = conf.getBoolean(
|
||||||
|
DFSConfigKeys.DFS_NAMENODE_REPLICATION_CONSIDERLOAD_KEY, true);
|
||||||
this.stats = stats;
|
this.stats = stats;
|
||||||
this.clusterMap = clusterMap;
|
this.clusterMap = clusterMap;
|
||||||
this.heartbeatInterval = conf.getLong(
|
this.heartbeatInterval = conf.getLong(
|
||||||
|
@ -87,6 +90,9 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
|
||||||
this.tolerateHeartbeatMultiplier = conf.getInt(
|
this.tolerateHeartbeatMultiplier = conf.getInt(
|
||||||
DFSConfigKeys.DFS_NAMENODE_TOLERATE_HEARTBEAT_MULTIPLIER_KEY,
|
DFSConfigKeys.DFS_NAMENODE_TOLERATE_HEARTBEAT_MULTIPLIER_KEY,
|
||||||
DFSConfigKeys.DFS_NAMENODE_TOLERATE_HEARTBEAT_MULTIPLIER_DEFAULT);
|
DFSConfigKeys.DFS_NAMENODE_TOLERATE_HEARTBEAT_MULTIPLIER_DEFAULT);
|
||||||
|
this.staleInterval = conf.getLong(
|
||||||
|
DFSConfigKeys.DFS_NAMENODE_STALE_DATANODE_INTERVAL_KEY,
|
||||||
|
DFSConfigKeys.DFS_NAMENODE_STALE_DATANODE_INTERVAL_DEFAULT);
|
||||||
}
|
}
|
||||||
|
|
||||||
protected ThreadLocal<StringBuilder> threadLocalBuilder =
|
protected ThreadLocal<StringBuilder> threadLocalBuilder =
|
||||||
|
@ -155,9 +161,10 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
|
||||||
writer=null;
|
writer=null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
boolean avoidStaleNodes = (stats != null
|
||||||
|
&& stats.isAvoidingStaleDataNodesForWrite());
|
||||||
DatanodeDescriptor localNode = chooseTarget(numOfReplicas, writer,
|
DatanodeDescriptor localNode = chooseTarget(numOfReplicas, writer,
|
||||||
excludedNodes, blocksize,
|
excludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes);
|
||||||
maxNodesPerRack, results);
|
|
||||||
if (!returnChosenNodes) {
|
if (!returnChosenNodes) {
|
||||||
results.removeAll(chosenNodes);
|
results.removeAll(chosenNodes);
|
||||||
}
|
}
|
||||||
|
@ -173,8 +180,8 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
|
||||||
HashMap<Node, Node> excludedNodes,
|
HashMap<Node, Node> excludedNodes,
|
||||||
long blocksize,
|
long blocksize,
|
||||||
int maxNodesPerRack,
|
int maxNodesPerRack,
|
||||||
List<DatanodeDescriptor> results) {
|
List<DatanodeDescriptor> results,
|
||||||
|
final boolean avoidStaleNodes) {
|
||||||
if (numOfReplicas == 0 || clusterMap.getNumOfLeaves()==0) {
|
if (numOfReplicas == 0 || clusterMap.getNumOfLeaves()==0) {
|
||||||
return writer;
|
return writer;
|
||||||
}
|
}
|
||||||
|
@ -186,17 +193,20 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
|
||||||
writer = results.get(0);
|
writer = results.get(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Keep a copy of original excludedNodes
|
||||||
|
final HashMap<Node, Node> oldExcludedNodes = avoidStaleNodes ?
|
||||||
|
new HashMap<Node, Node>(excludedNodes) : null;
|
||||||
try {
|
try {
|
||||||
if (numOfResults == 0) {
|
if (numOfResults == 0) {
|
||||||
writer = chooseLocalNode(writer, excludedNodes,
|
writer = chooseLocalNode(writer, excludedNodes, blocksize,
|
||||||
blocksize, maxNodesPerRack, results);
|
maxNodesPerRack, results, avoidStaleNodes);
|
||||||
if (--numOfReplicas == 0) {
|
if (--numOfReplicas == 0) {
|
||||||
return writer;
|
return writer;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (numOfResults <= 1) {
|
if (numOfResults <= 1) {
|
||||||
chooseRemoteRack(1, results.get(0), excludedNodes,
|
chooseRemoteRack(1, results.get(0), excludedNodes, blocksize,
|
||||||
blocksize, maxNodesPerRack, results);
|
maxNodesPerRack, results, avoidStaleNodes);
|
||||||
if (--numOfReplicas == 0) {
|
if (--numOfReplicas == 0) {
|
||||||
return writer;
|
return writer;
|
||||||
}
|
}
|
||||||
|
@ -204,24 +214,36 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
|
||||||
if (numOfResults <= 2) {
|
if (numOfResults <= 2) {
|
||||||
if (clusterMap.isOnSameRack(results.get(0), results.get(1))) {
|
if (clusterMap.isOnSameRack(results.get(0), results.get(1))) {
|
||||||
chooseRemoteRack(1, results.get(0), excludedNodes,
|
chooseRemoteRack(1, results.get(0), excludedNodes,
|
||||||
blocksize, maxNodesPerRack, results);
|
blocksize, maxNodesPerRack,
|
||||||
|
results, avoidStaleNodes);
|
||||||
} else if (newBlock){
|
} else if (newBlock){
|
||||||
chooseLocalRack(results.get(1), excludedNodes, blocksize,
|
chooseLocalRack(results.get(1), excludedNodes, blocksize,
|
||||||
maxNodesPerRack, results);
|
maxNodesPerRack, results, avoidStaleNodes);
|
||||||
} else {
|
} else {
|
||||||
chooseLocalRack(writer, excludedNodes, blocksize,
|
chooseLocalRack(writer, excludedNodes, blocksize, maxNodesPerRack,
|
||||||
maxNodesPerRack, results);
|
results, avoidStaleNodes);
|
||||||
}
|
}
|
||||||
if (--numOfReplicas == 0) {
|
if (--numOfReplicas == 0) {
|
||||||
return writer;
|
return writer;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
chooseRandom(numOfReplicas, NodeBase.ROOT, excludedNodes,
|
chooseRandom(numOfReplicas, NodeBase.ROOT, excludedNodes, blocksize,
|
||||||
blocksize, maxNodesPerRack, results);
|
maxNodesPerRack, results, avoidStaleNodes);
|
||||||
} catch (NotEnoughReplicasException e) {
|
} catch (NotEnoughReplicasException e) {
|
||||||
LOG.warn("Not able to place enough replicas, still in need of "
|
LOG.warn("Not able to place enough replicas, still in need of "
|
||||||
+ numOfReplicas + " to reach " + totalReplicasExpected + "\n"
|
+ numOfReplicas + " to reach " + totalReplicasExpected + "\n"
|
||||||
+ e.getMessage());
|
+ e.getMessage());
|
||||||
|
if (avoidStaleNodes) {
|
||||||
|
// ecxludedNodes now has - initial excludedNodes, any nodes that were
|
||||||
|
// chosen and nodes that were tried but were not chosen because they
|
||||||
|
// were stale, decommissioned or for any other reason a node is not
|
||||||
|
// chosen for write. Retry again now not avoiding stale node
|
||||||
|
for (Node node : results) {
|
||||||
|
oldExcludedNodes.put(node, node);
|
||||||
|
}
|
||||||
|
return chooseTarget(numOfReplicas, writer, oldExcludedNodes, blocksize,
|
||||||
|
maxNodesPerRack, results, false);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return writer;
|
return writer;
|
||||||
}
|
}
|
||||||
|
@ -236,26 +258,27 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
|
||||||
HashMap<Node, Node> excludedNodes,
|
HashMap<Node, Node> excludedNodes,
|
||||||
long blocksize,
|
long blocksize,
|
||||||
int maxNodesPerRack,
|
int maxNodesPerRack,
|
||||||
List<DatanodeDescriptor> results)
|
List<DatanodeDescriptor> results,
|
||||||
|
boolean avoidStaleNodes)
|
||||||
throws NotEnoughReplicasException {
|
throws NotEnoughReplicasException {
|
||||||
// if no local machine, randomly choose one node
|
// if no local machine, randomly choose one node
|
||||||
if (localMachine == null)
|
if (localMachine == null)
|
||||||
return chooseRandom(NodeBase.ROOT, excludedNodes,
|
return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
|
||||||
blocksize, maxNodesPerRack, results);
|
maxNodesPerRack, results, avoidStaleNodes);
|
||||||
if (preferLocalNode) {
|
if (preferLocalNode) {
|
||||||
// otherwise try local machine first
|
// otherwise try local machine first
|
||||||
Node oldNode = excludedNodes.put(localMachine, localMachine);
|
Node oldNode = excludedNodes.put(localMachine, localMachine);
|
||||||
if (oldNode == null) { // was not in the excluded list
|
if (oldNode == null) { // was not in the excluded list
|
||||||
if (isGoodTarget(localMachine, blocksize,
|
if (isGoodTarget(localMachine, blocksize, maxNodesPerRack, false,
|
||||||
maxNodesPerRack, false, results)) {
|
results, avoidStaleNodes)) {
|
||||||
results.add(localMachine);
|
results.add(localMachine);
|
||||||
return localMachine;
|
return localMachine;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// try a node on local rack
|
// try a node on local rack
|
||||||
return chooseLocalRack(localMachine, excludedNodes,
|
return chooseLocalRack(localMachine, excludedNodes, blocksize,
|
||||||
blocksize, maxNodesPerRack, results);
|
maxNodesPerRack, results, avoidStaleNodes);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* choose one node from the rack that <i>localMachine</i> is on.
|
/* choose one node from the rack that <i>localMachine</i> is on.
|
||||||
|
@ -270,19 +293,19 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
|
||||||
HashMap<Node, Node> excludedNodes,
|
HashMap<Node, Node> excludedNodes,
|
||||||
long blocksize,
|
long blocksize,
|
||||||
int maxNodesPerRack,
|
int maxNodesPerRack,
|
||||||
List<DatanodeDescriptor> results)
|
List<DatanodeDescriptor> results,
|
||||||
|
boolean avoidStaleNodes)
|
||||||
throws NotEnoughReplicasException {
|
throws NotEnoughReplicasException {
|
||||||
// no local machine, so choose a random machine
|
// no local machine, so choose a random machine
|
||||||
if (localMachine == null) {
|
if (localMachine == null) {
|
||||||
return chooseRandom(NodeBase.ROOT, excludedNodes,
|
return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
|
||||||
blocksize, maxNodesPerRack, results);
|
maxNodesPerRack, results, avoidStaleNodes);
|
||||||
}
|
}
|
||||||
|
|
||||||
// choose one from the local rack
|
// choose one from the local rack
|
||||||
try {
|
try {
|
||||||
return chooseRandom(
|
return chooseRandom(localMachine.getNetworkLocation(), excludedNodes,
|
||||||
localMachine.getNetworkLocation(),
|
blocksize, maxNodesPerRack, results, avoidStaleNodes);
|
||||||
excludedNodes, blocksize, maxNodesPerRack, results);
|
|
||||||
} catch (NotEnoughReplicasException e1) {
|
} catch (NotEnoughReplicasException e1) {
|
||||||
// find the second replica
|
// find the second replica
|
||||||
DatanodeDescriptor newLocal=null;
|
DatanodeDescriptor newLocal=null;
|
||||||
|
@ -296,18 +319,17 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
|
||||||
}
|
}
|
||||||
if (newLocal != null) {
|
if (newLocal != null) {
|
||||||
try {
|
try {
|
||||||
return chooseRandom(
|
return chooseRandom(newLocal.getNetworkLocation(), excludedNodes,
|
||||||
newLocal.getNetworkLocation(),
|
blocksize, maxNodesPerRack, results, avoidStaleNodes);
|
||||||
excludedNodes, blocksize, maxNodesPerRack, results);
|
|
||||||
} catch(NotEnoughReplicasException e2) {
|
} catch(NotEnoughReplicasException e2) {
|
||||||
//otherwise randomly choose one from the network
|
//otherwise randomly choose one from the network
|
||||||
return chooseRandom(NodeBase.ROOT, excludedNodes,
|
return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
|
||||||
blocksize, maxNodesPerRack, results);
|
maxNodesPerRack, results, avoidStaleNodes);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
//otherwise randomly choose one from the network
|
//otherwise randomly choose one from the network
|
||||||
return chooseRandom(NodeBase.ROOT, excludedNodes,
|
return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
|
||||||
blocksize, maxNodesPerRack, results);
|
maxNodesPerRack, results, avoidStaleNodes);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -323,17 +345,19 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
|
||||||
HashMap<Node, Node> excludedNodes,
|
HashMap<Node, Node> excludedNodes,
|
||||||
long blocksize,
|
long blocksize,
|
||||||
int maxReplicasPerRack,
|
int maxReplicasPerRack,
|
||||||
List<DatanodeDescriptor> results)
|
List<DatanodeDescriptor> results,
|
||||||
|
boolean avoidStaleNodes)
|
||||||
throws NotEnoughReplicasException {
|
throws NotEnoughReplicasException {
|
||||||
int oldNumOfReplicas = results.size();
|
int oldNumOfReplicas = results.size();
|
||||||
// randomly choose one node from remote racks
|
// randomly choose one node from remote racks
|
||||||
try {
|
try {
|
||||||
chooseRandom(numOfReplicas, "~" + localMachine.getNetworkLocation(),
|
chooseRandom(numOfReplicas, "~" + localMachine.getNetworkLocation(),
|
||||||
excludedNodes, blocksize, maxReplicasPerRack, results);
|
excludedNodes, blocksize, maxReplicasPerRack, results,
|
||||||
|
avoidStaleNodes);
|
||||||
} catch (NotEnoughReplicasException e) {
|
} catch (NotEnoughReplicasException e) {
|
||||||
chooseRandom(numOfReplicas-(results.size()-oldNumOfReplicas),
|
chooseRandom(numOfReplicas-(results.size()-oldNumOfReplicas),
|
||||||
localMachine.getNetworkLocation(), excludedNodes, blocksize,
|
localMachine.getNetworkLocation(), excludedNodes, blocksize,
|
||||||
maxReplicasPerRack, results);
|
maxReplicasPerRack, results, avoidStaleNodes);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -345,7 +369,8 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
|
||||||
HashMap<Node, Node> excludedNodes,
|
HashMap<Node, Node> excludedNodes,
|
||||||
long blocksize,
|
long blocksize,
|
||||||
int maxNodesPerRack,
|
int maxNodesPerRack,
|
||||||
List<DatanodeDescriptor> results)
|
List<DatanodeDescriptor> results,
|
||||||
|
boolean avoidStaleNodes)
|
||||||
throws NotEnoughReplicasException {
|
throws NotEnoughReplicasException {
|
||||||
int numOfAvailableNodes =
|
int numOfAvailableNodes =
|
||||||
clusterMap.countNumOfAvailableNodes(nodes, excludedNodes.keySet());
|
clusterMap.countNumOfAvailableNodes(nodes, excludedNodes.keySet());
|
||||||
|
@ -363,7 +388,8 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
|
||||||
Node oldNode = excludedNodes.put(chosenNode, chosenNode);
|
Node oldNode = excludedNodes.put(chosenNode, chosenNode);
|
||||||
if (oldNode == null) { // chosenNode was not in the excluded list
|
if (oldNode == null) { // chosenNode was not in the excluded list
|
||||||
numOfAvailableNodes--;
|
numOfAvailableNodes--;
|
||||||
if (isGoodTarget(chosenNode, blocksize, maxNodesPerRack, results)) {
|
if (isGoodTarget(chosenNode, blocksize,
|
||||||
|
maxNodesPerRack, results, avoidStaleNodes)) {
|
||||||
results.add(chosenNode);
|
results.add(chosenNode);
|
||||||
adjustExcludedNodes(excludedNodes, chosenNode);
|
adjustExcludedNodes(excludedNodes, chosenNode);
|
||||||
return chosenNode;
|
return chosenNode;
|
||||||
|
@ -390,7 +416,8 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
|
||||||
HashMap<Node, Node> excludedNodes,
|
HashMap<Node, Node> excludedNodes,
|
||||||
long blocksize,
|
long blocksize,
|
||||||
int maxNodesPerRack,
|
int maxNodesPerRack,
|
||||||
List<DatanodeDescriptor> results)
|
List<DatanodeDescriptor> results,
|
||||||
|
boolean avoidStaleNodes)
|
||||||
throws NotEnoughReplicasException {
|
throws NotEnoughReplicasException {
|
||||||
|
|
||||||
int numOfAvailableNodes =
|
int numOfAvailableNodes =
|
||||||
|
@ -409,7 +436,8 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
|
||||||
if (oldNode == null) {
|
if (oldNode == null) {
|
||||||
numOfAvailableNodes--;
|
numOfAvailableNodes--;
|
||||||
|
|
||||||
if (isGoodTarget(chosenNode, blocksize, maxNodesPerRack, results)) {
|
if (isGoodTarget(chosenNode, blocksize,
|
||||||
|
maxNodesPerRack, results, avoidStaleNodes)) {
|
||||||
numOfReplicas--;
|
numOfReplicas--;
|
||||||
results.add(chosenNode);
|
results.add(chosenNode);
|
||||||
adjustExcludedNodes(excludedNodes, chosenNode);
|
adjustExcludedNodes(excludedNodes, chosenNode);
|
||||||
|
@ -451,9 +479,10 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
|
||||||
*/
|
*/
|
||||||
private boolean isGoodTarget(DatanodeDescriptor node,
|
private boolean isGoodTarget(DatanodeDescriptor node,
|
||||||
long blockSize, int maxTargetPerRack,
|
long blockSize, int maxTargetPerRack,
|
||||||
List<DatanodeDescriptor> results) {
|
List<DatanodeDescriptor> results,
|
||||||
return isGoodTarget(node, blockSize, maxTargetPerRack,
|
boolean avoidStaleNodes) {
|
||||||
this.considerLoad, results);
|
return isGoodTarget(node, blockSize, maxTargetPerRack, this.considerLoad,
|
||||||
|
results, avoidStaleNodes);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -467,6 +496,7 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
|
||||||
* @param considerLoad whether or not to consider load of the target node
|
* @param considerLoad whether or not to consider load of the target node
|
||||||
* @param results A list containing currently chosen nodes. Used to check if
|
* @param results A list containing currently chosen nodes. Used to check if
|
||||||
* too many nodes has been chosen in the target rack.
|
* too many nodes has been chosen in the target rack.
|
||||||
|
* @param avoidStaleNodes Whether or not to avoid choosing stale nodes
|
||||||
* @return Return true if <i>node</i> has enough space,
|
* @return Return true if <i>node</i> has enough space,
|
||||||
* does not have too much load,
|
* does not have too much load,
|
||||||
* and the rack does not have too many nodes.
|
* and the rack does not have too many nodes.
|
||||||
|
@ -474,7 +504,8 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
|
||||||
protected boolean isGoodTarget(DatanodeDescriptor node,
|
protected boolean isGoodTarget(DatanodeDescriptor node,
|
||||||
long blockSize, int maxTargetPerRack,
|
long blockSize, int maxTargetPerRack,
|
||||||
boolean considerLoad,
|
boolean considerLoad,
|
||||||
List<DatanodeDescriptor> results) {
|
List<DatanodeDescriptor> results,
|
||||||
|
boolean avoidStaleNodes) {
|
||||||
// check if the node is (being) decommissed
|
// check if the node is (being) decommissed
|
||||||
if (node.isDecommissionInProgress() || node.isDecommissioned()) {
|
if (node.isDecommissionInProgress() || node.isDecommissioned()) {
|
||||||
if(LOG.isDebugEnabled()) {
|
if(LOG.isDebugEnabled()) {
|
||||||
|
@ -485,6 +516,17 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (avoidStaleNodes) {
|
||||||
|
if (node.isStale(this.staleInterval)) {
|
||||||
|
if (LOG.isDebugEnabled()) {
|
||||||
|
threadLocalBuilder.get().append(node.toString()).append(": ")
|
||||||
|
.append("Node ").append(NodeBase.getPath(node))
|
||||||
|
.append(" is not chosen because the node is staled ");
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
long remaining = node.getRemaining() -
|
long remaining = node.getRemaining() -
|
||||||
(node.getBlocksScheduled() * blockSize);
|
(node.getBlocksScheduled() * blockSize);
|
||||||
// check the remaining capacity of the target machine
|
// check the remaining capacity of the target machine
|
||||||
|
|
|
@ -64,23 +64,20 @@ public class BlockPlacementPolicyWithNodeGroup extends BlockPlacementPolicyDefau
|
||||||
* @return the chosen node
|
* @return the chosen node
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
protected DatanodeDescriptor chooseLocalNode(
|
protected DatanodeDescriptor chooseLocalNode(DatanodeDescriptor localMachine,
|
||||||
DatanodeDescriptor localMachine,
|
HashMap<Node, Node> excludedNodes, long blocksize, int maxNodesPerRack,
|
||||||
HashMap<Node, Node> excludedNodes,
|
List<DatanodeDescriptor> results, boolean avoidStaleNodes)
|
||||||
long blocksize,
|
|
||||||
int maxNodesPerRack,
|
|
||||||
List<DatanodeDescriptor> results)
|
|
||||||
throws NotEnoughReplicasException {
|
throws NotEnoughReplicasException {
|
||||||
// if no local machine, randomly choose one node
|
// if no local machine, randomly choose one node
|
||||||
if (localMachine == null)
|
if (localMachine == null)
|
||||||
return chooseRandom(NodeBase.ROOT, excludedNodes,
|
return chooseRandom(NodeBase.ROOT, excludedNodes,
|
||||||
blocksize, maxNodesPerRack, results);
|
blocksize, maxNodesPerRack, results, avoidStaleNodes);
|
||||||
|
|
||||||
// otherwise try local machine first
|
// otherwise try local machine first
|
||||||
Node oldNode = excludedNodes.put(localMachine, localMachine);
|
Node oldNode = excludedNodes.put(localMachine, localMachine);
|
||||||
if (oldNode == null) { // was not in the excluded list
|
if (oldNode == null) { // was not in the excluded list
|
||||||
if (isGoodTarget(localMachine, blocksize,
|
if (isGoodTarget(localMachine, blocksize,
|
||||||
maxNodesPerRack, false, results)) {
|
maxNodesPerRack, false, results, avoidStaleNodes)) {
|
||||||
results.add(localMachine);
|
results.add(localMachine);
|
||||||
// Nodes under same nodegroup should be excluded.
|
// Nodes under same nodegroup should be excluded.
|
||||||
addNodeGroupToExcludedNodes(excludedNodes,
|
addNodeGroupToExcludedNodes(excludedNodes,
|
||||||
|
@ -92,13 +89,13 @@ public class BlockPlacementPolicyWithNodeGroup extends BlockPlacementPolicyDefau
|
||||||
// try a node on local node group
|
// try a node on local node group
|
||||||
DatanodeDescriptor chosenNode = chooseLocalNodeGroup(
|
DatanodeDescriptor chosenNode = chooseLocalNodeGroup(
|
||||||
(NetworkTopologyWithNodeGroup)clusterMap, localMachine, excludedNodes,
|
(NetworkTopologyWithNodeGroup)clusterMap, localMachine, excludedNodes,
|
||||||
blocksize, maxNodesPerRack, results);
|
blocksize, maxNodesPerRack, results, avoidStaleNodes);
|
||||||
if (chosenNode != null) {
|
if (chosenNode != null) {
|
||||||
return chosenNode;
|
return chosenNode;
|
||||||
}
|
}
|
||||||
// try a node on local rack
|
// try a node on local rack
|
||||||
return chooseLocalRack(localMachine, excludedNodes,
|
return chooseLocalRack(localMachine, excludedNodes,
|
||||||
blocksize, maxNodesPerRack, results);
|
blocksize, maxNodesPerRack, results, avoidStaleNodes);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -119,17 +116,15 @@ public class BlockPlacementPolicyWithNodeGroup extends BlockPlacementPolicyDefau
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected DatanodeDescriptor chooseLocalRack(
|
protected DatanodeDescriptor chooseLocalRack(DatanodeDescriptor localMachine,
|
||||||
DatanodeDescriptor localMachine,
|
HashMap<Node, Node> excludedNodes, long blocksize, int maxNodesPerRack,
|
||||||
HashMap<Node, Node> excludedNodes,
|
List<DatanodeDescriptor> results, boolean avoidStaleNodes)
|
||||||
long blocksize,
|
|
||||||
int maxNodesPerRack,
|
|
||||||
List<DatanodeDescriptor> results)
|
|
||||||
throws NotEnoughReplicasException {
|
throws NotEnoughReplicasException {
|
||||||
// no local machine, so choose a random machine
|
// no local machine, so choose a random machine
|
||||||
if (localMachine == null) {
|
if (localMachine == null) {
|
||||||
return chooseRandom(NodeBase.ROOT, excludedNodes,
|
return chooseRandom(NodeBase.ROOT, excludedNodes,
|
||||||
blocksize, maxNodesPerRack, results);
|
blocksize, maxNodesPerRack, results,
|
||||||
|
avoidStaleNodes);
|
||||||
}
|
}
|
||||||
|
|
||||||
// choose one from the local rack, but off-nodegroup
|
// choose one from the local rack, but off-nodegroup
|
||||||
|
@ -137,7 +132,8 @@ public class BlockPlacementPolicyWithNodeGroup extends BlockPlacementPolicyDefau
|
||||||
return chooseRandom(NetworkTopology.getFirstHalf(
|
return chooseRandom(NetworkTopology.getFirstHalf(
|
||||||
localMachine.getNetworkLocation()),
|
localMachine.getNetworkLocation()),
|
||||||
excludedNodes, blocksize,
|
excludedNodes, blocksize,
|
||||||
maxNodesPerRack, results);
|
maxNodesPerRack, results,
|
||||||
|
avoidStaleNodes);
|
||||||
} catch (NotEnoughReplicasException e1) {
|
} catch (NotEnoughReplicasException e1) {
|
||||||
// find the second replica
|
// find the second replica
|
||||||
DatanodeDescriptor newLocal=null;
|
DatanodeDescriptor newLocal=null;
|
||||||
|
@ -151,39 +147,39 @@ public class BlockPlacementPolicyWithNodeGroup extends BlockPlacementPolicyDefau
|
||||||
}
|
}
|
||||||
if (newLocal != null) {
|
if (newLocal != null) {
|
||||||
try {
|
try {
|
||||||
return chooseRandom(clusterMap.getRack(newLocal.getNetworkLocation()),
|
return chooseRandom(
|
||||||
excludedNodes, blocksize, maxNodesPerRack, results);
|
clusterMap.getRack(newLocal.getNetworkLocation()), excludedNodes,
|
||||||
|
blocksize, maxNodesPerRack, results, avoidStaleNodes);
|
||||||
} catch(NotEnoughReplicasException e2) {
|
} catch(NotEnoughReplicasException e2) {
|
||||||
//otherwise randomly choose one from the network
|
//otherwise randomly choose one from the network
|
||||||
return chooseRandom(NodeBase.ROOT, excludedNodes,
|
return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
|
||||||
blocksize, maxNodesPerRack, results);
|
maxNodesPerRack, results, avoidStaleNodes);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
//otherwise randomly choose one from the network
|
//otherwise randomly choose one from the network
|
||||||
return chooseRandom(NodeBase.ROOT, excludedNodes,
|
return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
|
||||||
blocksize, maxNodesPerRack, results);
|
maxNodesPerRack, results, avoidStaleNodes);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected void chooseRemoteRack(int numOfReplicas,
|
protected void chooseRemoteRack(int numOfReplicas,
|
||||||
DatanodeDescriptor localMachine,
|
DatanodeDescriptor localMachine, HashMap<Node, Node> excludedNodes,
|
||||||
HashMap<Node, Node> excludedNodes,
|
long blocksize, int maxReplicasPerRack, List<DatanodeDescriptor> results,
|
||||||
long blocksize,
|
boolean avoidStaleNodes) throws NotEnoughReplicasException {
|
||||||
int maxReplicasPerRack,
|
|
||||||
List<DatanodeDescriptor> results)
|
|
||||||
throws NotEnoughReplicasException {
|
|
||||||
int oldNumOfReplicas = results.size();
|
int oldNumOfReplicas = results.size();
|
||||||
// randomly choose one node from remote racks
|
// randomly choose one node from remote racks
|
||||||
try {
|
try {
|
||||||
chooseRandom(numOfReplicas, "~"+NetworkTopology.getFirstHalf(
|
chooseRandom(
|
||||||
localMachine.getNetworkLocation()),
|
numOfReplicas,
|
||||||
excludedNodes, blocksize, maxReplicasPerRack, results);
|
"~" + NetworkTopology.getFirstHalf(localMachine.getNetworkLocation()),
|
||||||
|
excludedNodes, blocksize, maxReplicasPerRack, results,
|
||||||
|
avoidStaleNodes);
|
||||||
} catch (NotEnoughReplicasException e) {
|
} catch (NotEnoughReplicasException e) {
|
||||||
chooseRandom(numOfReplicas - (results.size() - oldNumOfReplicas),
|
chooseRandom(numOfReplicas - (results.size() - oldNumOfReplicas),
|
||||||
localMachine.getNetworkLocation(), excludedNodes, blocksize,
|
localMachine.getNetworkLocation(), excludedNodes, blocksize,
|
||||||
maxReplicasPerRack, results);
|
maxReplicasPerRack, results, avoidStaleNodes);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -193,19 +189,22 @@ public class BlockPlacementPolicyWithNodeGroup extends BlockPlacementPolicyDefau
|
||||||
* if still no such node is available, choose a random node in the cluster.
|
* if still no such node is available, choose a random node in the cluster.
|
||||||
* @return the chosen node
|
* @return the chosen node
|
||||||
*/
|
*/
|
||||||
private DatanodeDescriptor chooseLocalNodeGroup(NetworkTopologyWithNodeGroup clusterMap,
|
private DatanodeDescriptor chooseLocalNodeGroup(
|
||||||
DatanodeDescriptor localMachine, HashMap<Node, Node> excludedNodes, long blocksize,
|
NetworkTopologyWithNodeGroup clusterMap, DatanodeDescriptor localMachine,
|
||||||
int maxNodesPerRack, List<DatanodeDescriptor> results) throws NotEnoughReplicasException {
|
HashMap<Node, Node> excludedNodes, long blocksize, int maxNodesPerRack,
|
||||||
|
List<DatanodeDescriptor> results, boolean avoidStaleNodes)
|
||||||
|
throws NotEnoughReplicasException {
|
||||||
// no local machine, so choose a random machine
|
// no local machine, so choose a random machine
|
||||||
if (localMachine == null) {
|
if (localMachine == null) {
|
||||||
return chooseRandom(NodeBase.ROOT, excludedNodes,
|
return chooseRandom(NodeBase.ROOT, excludedNodes,
|
||||||
blocksize, maxNodesPerRack, results);
|
blocksize, maxNodesPerRack, results, avoidStaleNodes);
|
||||||
}
|
}
|
||||||
|
|
||||||
// choose one from the local node group
|
// choose one from the local node group
|
||||||
try {
|
try {
|
||||||
return chooseRandom(clusterMap.getNodeGroup(localMachine.getNetworkLocation()),
|
return chooseRandom(
|
||||||
excludedNodes, blocksize, maxNodesPerRack, results);
|
clusterMap.getNodeGroup(localMachine.getNetworkLocation()),
|
||||||
|
excludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes);
|
||||||
} catch (NotEnoughReplicasException e1) {
|
} catch (NotEnoughReplicasException e1) {
|
||||||
// find the second replica
|
// find the second replica
|
||||||
DatanodeDescriptor newLocal=null;
|
DatanodeDescriptor newLocal=null;
|
||||||
|
@ -219,17 +218,19 @@ public class BlockPlacementPolicyWithNodeGroup extends BlockPlacementPolicyDefau
|
||||||
}
|
}
|
||||||
if (newLocal != null) {
|
if (newLocal != null) {
|
||||||
try {
|
try {
|
||||||
return chooseRandom(clusterMap.getNodeGroup(newLocal.getNetworkLocation()),
|
return chooseRandom(
|
||||||
excludedNodes, blocksize, maxNodesPerRack, results);
|
clusterMap.getNodeGroup(newLocal.getNetworkLocation()),
|
||||||
|
excludedNodes, blocksize, maxNodesPerRack, results,
|
||||||
|
avoidStaleNodes);
|
||||||
} catch(NotEnoughReplicasException e2) {
|
} catch(NotEnoughReplicasException e2) {
|
||||||
//otherwise randomly choose one from the network
|
//otherwise randomly choose one from the network
|
||||||
return chooseRandom(NodeBase.ROOT, excludedNodes,
|
return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
|
||||||
blocksize, maxNodesPerRack, results);
|
maxNodesPerRack, results, avoidStaleNodes);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
//otherwise randomly choose one from the network
|
//otherwise randomly choose one from the network
|
||||||
return chooseRandom(NodeBase.ROOT, excludedNodes,
|
return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
|
||||||
blocksize, maxNodesPerRack, results);
|
maxNodesPerRack, results, avoidStaleNodes);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -76,6 +76,7 @@ import org.apache.hadoop.util.ReflectionUtils;
|
||||||
import org.apache.hadoop.util.Time;
|
import org.apache.hadoop.util.Time;
|
||||||
|
|
||||||
import com.google.common.annotations.VisibleForTesting;
|
import com.google.common.annotations.VisibleForTesting;
|
||||||
|
import com.google.common.base.Preconditions;
|
||||||
import com.google.common.net.InetAddresses;
|
import com.google.common.net.InetAddresses;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -88,8 +89,8 @@ public class DatanodeManager {
|
||||||
|
|
||||||
private final Namesystem namesystem;
|
private final Namesystem namesystem;
|
||||||
private final BlockManager blockManager;
|
private final BlockManager blockManager;
|
||||||
|
|
||||||
private final HeartbeatManager heartbeatManager;
|
private final HeartbeatManager heartbeatManager;
|
||||||
|
private Daemon decommissionthread = null;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Stores the datanode -> block map.
|
* Stores the datanode -> block map.
|
||||||
|
@ -127,28 +128,33 @@ public class DatanodeManager {
|
||||||
/** Ask Datanode only up to this many blocks to delete. */
|
/** Ask Datanode only up to this many blocks to delete. */
|
||||||
final int blockInvalidateLimit;
|
final int blockInvalidateLimit;
|
||||||
|
|
||||||
|
/** Whether or not to check stale DataNodes for read/write */
|
||||||
|
private final boolean checkForStaleDataNodes;
|
||||||
|
|
||||||
|
/** The interval for judging stale DataNodes for read/write */
|
||||||
|
private final long staleInterval;
|
||||||
|
|
||||||
|
/** Whether or not to avoid using stale DataNodes for writing */
|
||||||
|
private volatile boolean avoidStaleDataNodesForWrite;
|
||||||
|
|
||||||
|
/** The number of stale DataNodes */
|
||||||
|
private volatile int numStaleNodes;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Whether or not this cluster has ever consisted of more than 1 rack,
|
* Whether or not this cluster has ever consisted of more than 1 rack,
|
||||||
* according to the NetworkTopology.
|
* according to the NetworkTopology.
|
||||||
*/
|
*/
|
||||||
private boolean hasClusterEverBeenMultiRack = false;
|
private boolean hasClusterEverBeenMultiRack = false;
|
||||||
|
|
||||||
/** Whether or not to check the stale datanodes */
|
DatanodeManager(final BlockManager blockManager, final Namesystem namesystem,
|
||||||
private volatile boolean checkForStaleNodes;
|
final Configuration conf) throws IOException {
|
||||||
/** The time interval for detecting stale datanodes */
|
|
||||||
private volatile long staleInterval;
|
|
||||||
|
|
||||||
DatanodeManager(final BlockManager blockManager,
|
|
||||||
final Namesystem namesystem, final Configuration conf
|
|
||||||
) throws IOException {
|
|
||||||
this.namesystem = namesystem;
|
this.namesystem = namesystem;
|
||||||
this.blockManager = blockManager;
|
this.blockManager = blockManager;
|
||||||
|
|
||||||
Class<? extends NetworkTopology> networkTopologyClass =
|
Class<? extends NetworkTopology> networkTopologyClass =
|
||||||
conf.getClass(CommonConfigurationKeysPublic.NET_TOPOLOGY_IMPL_KEY,
|
conf.getClass(CommonConfigurationKeysPublic.NET_TOPOLOGY_IMPL_KEY,
|
||||||
NetworkTopology.class, NetworkTopology.class);
|
NetworkTopology.class, NetworkTopology.class);
|
||||||
networktopology = (NetworkTopology) ReflectionUtils.newInstance(
|
networktopology = ReflectionUtils.newInstance(networkTopologyClass, conf);
|
||||||
networkTopologyClass, conf);
|
|
||||||
|
|
||||||
this.heartbeatManager = new HeartbeatManager(namesystem, blockManager, conf);
|
this.heartbeatManager = new HeartbeatManager(namesystem, blockManager, conf);
|
||||||
|
|
||||||
|
@ -181,24 +187,68 @@ public class DatanodeManager {
|
||||||
DFSConfigKeys.DFS_BLOCK_INVALIDATE_LIMIT_KEY, blockInvalidateLimit);
|
DFSConfigKeys.DFS_BLOCK_INVALIDATE_LIMIT_KEY, blockInvalidateLimit);
|
||||||
LOG.info(DFSConfigKeys.DFS_BLOCK_INVALIDATE_LIMIT_KEY
|
LOG.info(DFSConfigKeys.DFS_BLOCK_INVALIDATE_LIMIT_KEY
|
||||||
+ "=" + this.blockInvalidateLimit);
|
+ "=" + this.blockInvalidateLimit);
|
||||||
// set the value of stale interval based on configuration
|
|
||||||
this.checkForStaleNodes = conf.getBoolean(
|
checkForStaleDataNodes = conf.getBoolean(
|
||||||
DFSConfigKeys.DFS_NAMENODE_CHECK_STALE_DATANODE_KEY,
|
DFSConfigKeys.DFS_NAMENODE_CHECK_STALE_DATANODE_KEY,
|
||||||
DFSConfigKeys.DFS_NAMENODE_CHECK_STALE_DATANODE_DEFAULT);
|
DFSConfigKeys.DFS_NAMENODE_CHECK_STALE_DATANODE_DEFAULT);
|
||||||
if (this.checkForStaleNodes) {
|
|
||||||
this.staleInterval = conf.getLong(
|
staleInterval = getStaleIntervalFromConf(conf, heartbeatExpireInterval);
|
||||||
DFSConfigKeys.DFS_NAMENODE_STALE_DATANODE_INTERVAL_KEY,
|
avoidStaleDataNodesForWrite = getAvoidStaleForWriteFromConf(conf,
|
||||||
DFSConfigKeys.DFS_NAMENODE_STALE_DATANODE_INTERVAL_MILLI_DEFAULT);
|
checkForStaleDataNodes);
|
||||||
if (this.staleInterval < DFSConfigKeys.DFS_NAMENODE_STALE_DATANODE_INTERVAL_MILLI_DEFAULT) {
|
|
||||||
LOG.warn("The given interval for marking stale datanode = "
|
|
||||||
+ this.staleInterval + ", which is smaller than the default value "
|
|
||||||
+ DFSConfigKeys.DFS_NAMENODE_STALE_DATANODE_INTERVAL_MILLI_DEFAULT
|
|
||||||
+ ".");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private Daemon decommissionthread = null;
|
private static long getStaleIntervalFromConf(Configuration conf,
|
||||||
|
long heartbeatExpireInterval) {
|
||||||
|
long staleInterval = conf.getLong(
|
||||||
|
DFSConfigKeys.DFS_NAMENODE_STALE_DATANODE_INTERVAL_KEY,
|
||||||
|
DFSConfigKeys.DFS_NAMENODE_STALE_DATANODE_INTERVAL_DEFAULT);
|
||||||
|
Preconditions.checkArgument(staleInterval > 0,
|
||||||
|
DFSConfigKeys.DFS_NAMENODE_STALE_DATANODE_INTERVAL_KEY +
|
||||||
|
" = '" + staleInterval + "' is invalid. " +
|
||||||
|
"It should be a positive non-zero value.");
|
||||||
|
|
||||||
|
final long heartbeatIntervalSeconds = conf.getLong(
|
||||||
|
DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY,
|
||||||
|
DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_DEFAULT);
|
||||||
|
// The stale interval value cannot be smaller than
|
||||||
|
// 3 times of heartbeat interval
|
||||||
|
final long minStaleInterval = conf.getInt(
|
||||||
|
DFSConfigKeys.DFS_NAMENODE_STALE_DATANODE_MINIMUM_INTERVAL_KEY,
|
||||||
|
DFSConfigKeys.DFS_NAMENODE_STALE_DATANODE_MINIMUM_INTERVAL_DEFAULT)
|
||||||
|
* heartbeatIntervalSeconds * 1000;
|
||||||
|
if (staleInterval < minStaleInterval) {
|
||||||
|
LOG.warn("The given interval for marking stale datanode = "
|
||||||
|
+ staleInterval + ", which is less than "
|
||||||
|
+ DFSConfigKeys.DFS_NAMENODE_STALE_DATANODE_MINIMUM_INTERVAL_DEFAULT
|
||||||
|
+ " heartbeat intervals. This may cause too frequent changes of "
|
||||||
|
+ "stale states of DataNodes since a heartbeat msg may be missing "
|
||||||
|
+ "due to temporary short-term failures. Reset stale interval to "
|
||||||
|
+ minStaleInterval + ".");
|
||||||
|
staleInterval = minStaleInterval;
|
||||||
|
}
|
||||||
|
if (staleInterval > heartbeatExpireInterval) {
|
||||||
|
LOG.warn("The given interval for marking stale datanode = "
|
||||||
|
+ staleInterval + ", which is larger than heartbeat expire interval "
|
||||||
|
+ heartbeatExpireInterval + ".");
|
||||||
|
}
|
||||||
|
return staleInterval;
|
||||||
|
}
|
||||||
|
|
||||||
|
static boolean getAvoidStaleForWriteFromConf(Configuration conf,
|
||||||
|
boolean checkForStale) {
|
||||||
|
boolean avoid = conf.getBoolean(
|
||||||
|
DFSConfigKeys.DFS_NAMENODE_AVOID_STALE_DATANODE_FOR_WRITE_KEY,
|
||||||
|
DFSConfigKeys.DFS_NAMENODE_AVOID_STALE_DATANODE_FOR_WRITE_DEFAULT);
|
||||||
|
boolean avoidStaleDataNodesForWrite = checkForStale && avoid;
|
||||||
|
if (!checkForStale && avoid) {
|
||||||
|
LOG.warn("Cannot set "
|
||||||
|
+ DFSConfigKeys.DFS_NAMENODE_CHECK_STALE_DATANODE_KEY
|
||||||
|
+ " as false while setting "
|
||||||
|
+ DFSConfigKeys.DFS_NAMENODE_AVOID_STALE_DATANODE_FOR_WRITE_KEY
|
||||||
|
+ " as true.");
|
||||||
|
}
|
||||||
|
return avoidStaleDataNodesForWrite;
|
||||||
|
}
|
||||||
|
|
||||||
void activate(final Configuration conf) {
|
void activate(final Configuration conf) {
|
||||||
final DecommissionManager dm = new DecommissionManager(namesystem, blockManager);
|
final DecommissionManager dm = new DecommissionManager(namesystem, blockManager);
|
||||||
|
@ -253,9 +303,10 @@ public class DatanodeManager {
|
||||||
client = new NodeBase(rName + NodeBase.PATH_SEPARATOR_STR + targethost);
|
client = new NodeBase(rName + NodeBase.PATH_SEPARATOR_STR + targethost);
|
||||||
}
|
}
|
||||||
|
|
||||||
Comparator<DatanodeInfo> comparator = checkForStaleNodes ?
|
Comparator<DatanodeInfo> comparator = checkForStaleDataNodes ?
|
||||||
new DFSUtil.DecomStaleComparator(staleInterval) :
|
new DFSUtil.DecomStaleComparator(staleInterval) :
|
||||||
DFSUtil.DECOM_COMPARATOR;
|
DFSUtil.DECOM_COMPARATOR;
|
||||||
|
|
||||||
for (LocatedBlock b : locatedblocks) {
|
for (LocatedBlock b : locatedblocks) {
|
||||||
networktopology.pseudoSortByDistance(client, b.getLocations());
|
networktopology.pseudoSortByDistance(client, b.getLocations());
|
||||||
// Move decommissioned/stale datanodes to the bottom
|
// Move decommissioned/stale datanodes to the bottom
|
||||||
|
@ -723,7 +774,7 @@ public class DatanodeManager {
|
||||||
* 3. Added to exclude --> start decommission.
|
* 3. Added to exclude --> start decommission.
|
||||||
* 4. Removed from exclude --> stop decommission.
|
* 4. Removed from exclude --> stop decommission.
|
||||||
*/
|
*/
|
||||||
private void refreshDatanodes() throws IOException {
|
private void refreshDatanodes() {
|
||||||
for(DatanodeDescriptor node : datanodeMap.values()) {
|
for(DatanodeDescriptor node : datanodeMap.values()) {
|
||||||
// Check if not include.
|
// Check if not include.
|
||||||
if (!inHostsList(node)) {
|
if (!inHostsList(node)) {
|
||||||
|
@ -783,6 +834,60 @@ public class DatanodeManager {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Getter and Setter for stale DataNodes related attributes */
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return whether or not to avoid writing to stale datanodes
|
||||||
|
*/
|
||||||
|
public boolean isAvoidingStaleDataNodesForWrite() {
|
||||||
|
return avoidStaleDataNodesForWrite;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set the value of {@link DatanodeManager#avoidStaleDataNodesForWrite}.
|
||||||
|
* The HeartbeatManager disable avoidStaleDataNodesForWrite when more than
|
||||||
|
* half of the DataNodes are marked as stale.
|
||||||
|
*
|
||||||
|
* @param avoidStaleDataNodesForWrite
|
||||||
|
* The value to set to
|
||||||
|
* {@link DatanodeManager#avoidStaleDataNodesForWrite}
|
||||||
|
*/
|
||||||
|
void setAvoidStaleDataNodesForWrite(boolean avoidStaleDataNodesForWrite) {
|
||||||
|
this.avoidStaleDataNodesForWrite = avoidStaleDataNodesForWrite;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return Whether or not to check stale DataNodes for R/W
|
||||||
|
*/
|
||||||
|
boolean isCheckingForStaleDataNodes() {
|
||||||
|
return checkForStaleDataNodes;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return The time interval used to mark DataNodes as stale.
|
||||||
|
*/
|
||||||
|
long getStaleInterval() {
|
||||||
|
return staleInterval;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set the number of current stale DataNodes. The HeartbeatManager got this
|
||||||
|
* number based on DataNodes' heartbeats.
|
||||||
|
*
|
||||||
|
* @param numStaleNodes
|
||||||
|
* The number of stale DataNodes to be set.
|
||||||
|
*/
|
||||||
|
void setNumStaleNodes(int numStaleNodes) {
|
||||||
|
this.numStaleNodes = numStaleNodes;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return Return the current number of stale DataNodes (detected by
|
||||||
|
* HeartbeatManager).
|
||||||
|
*/
|
||||||
|
int getNumStaleNodes() {
|
||||||
|
return this.numStaleNodes;
|
||||||
|
}
|
||||||
|
|
||||||
/** Fetch live and dead datanodes. */
|
/** Fetch live and dead datanodes. */
|
||||||
public void fetchDatanodes(final List<DatanodeDescriptor> live,
|
public void fetchDatanodes(final List<DatanodeDescriptor> live,
|
||||||
|
@ -961,7 +1066,7 @@ public class DatanodeManager {
|
||||||
return nodes;
|
return nodes;
|
||||||
}
|
}
|
||||||
|
|
||||||
private void setDatanodeDead(DatanodeDescriptor node) throws IOException {
|
private void setDatanodeDead(DatanodeDescriptor node) {
|
||||||
node.setLastUpdate(0);
|
node.setLastUpdate(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -30,6 +30,8 @@ import org.apache.hadoop.hdfs.server.namenode.Namesystem;
|
||||||
import org.apache.hadoop.util.Daemon;
|
import org.apache.hadoop.util.Daemon;
|
||||||
import org.apache.hadoop.util.Time;
|
import org.apache.hadoop.util.Time;
|
||||||
|
|
||||||
|
import com.google.common.base.Preconditions;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Manage the heartbeats received from datanodes.
|
* Manage the heartbeats received from datanodes.
|
||||||
* The datanode list and statistics are synchronized
|
* The datanode list and statistics are synchronized
|
||||||
|
@ -54,18 +56,48 @@ class HeartbeatManager implements DatanodeStatistics {
|
||||||
private final long heartbeatRecheckInterval;
|
private final long heartbeatRecheckInterval;
|
||||||
/** Heartbeat monitor thread */
|
/** Heartbeat monitor thread */
|
||||||
private final Daemon heartbeatThread = new Daemon(new Monitor());
|
private final Daemon heartbeatThread = new Daemon(new Monitor());
|
||||||
|
/**
|
||||||
|
* The initial setting of end user which indicates whether or not to avoid
|
||||||
|
* writing to stale datanodes.
|
||||||
|
*/
|
||||||
|
private final boolean initialAvoidWriteStaleNodes;
|
||||||
|
/**
|
||||||
|
* When the ratio of stale datanodes reaches this number, stop avoiding
|
||||||
|
* writing to stale datanodes, i.e., continue using stale nodes for writing.
|
||||||
|
*/
|
||||||
|
private final float ratioUseStaleDataNodesForWrite;
|
||||||
|
|
||||||
final Namesystem namesystem;
|
final Namesystem namesystem;
|
||||||
final BlockManager blockManager;
|
final BlockManager blockManager;
|
||||||
|
|
||||||
HeartbeatManager(final Namesystem namesystem, final BlockManager blockManager,
|
HeartbeatManager(final Namesystem namesystem,
|
||||||
final Configuration conf) {
|
final BlockManager blockManager, final Configuration conf) {
|
||||||
this.heartbeatRecheckInterval = conf.getInt(
|
|
||||||
DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_KEY,
|
|
||||||
DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_DEFAULT); // 5 minutes
|
|
||||||
|
|
||||||
this.namesystem = namesystem;
|
this.namesystem = namesystem;
|
||||||
this.blockManager = blockManager;
|
this.blockManager = blockManager;
|
||||||
|
boolean checkStaleNodes = conf.getBoolean(
|
||||||
|
DFSConfigKeys.DFS_NAMENODE_CHECK_STALE_DATANODE_KEY,
|
||||||
|
DFSConfigKeys.DFS_NAMENODE_CHECK_STALE_DATANODE_DEFAULT);
|
||||||
|
long recheckInterval = conf.getInt(
|
||||||
|
DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_KEY,
|
||||||
|
DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_DEFAULT); // 5 min
|
||||||
|
long staleInterval = conf.getLong(
|
||||||
|
DFSConfigKeys.DFS_NAMENODE_STALE_DATANODE_INTERVAL_KEY,
|
||||||
|
DFSConfigKeys.DFS_NAMENODE_STALE_DATANODE_INTERVAL_DEFAULT);// 30s
|
||||||
|
this.initialAvoidWriteStaleNodes = DatanodeManager
|
||||||
|
.getAvoidStaleForWriteFromConf(conf, checkStaleNodes);
|
||||||
|
this.ratioUseStaleDataNodesForWrite = conf.getFloat(
|
||||||
|
DFSConfigKeys.DFS_NAMENODE_USE_STALE_DATANODE_FOR_WRITE_RATIO_KEY,
|
||||||
|
DFSConfigKeys.DFS_NAMENODE_USE_STALE_DATANODE_FOR_WRITE_RATIO_DEFAULT);
|
||||||
|
Preconditions.checkArgument(
|
||||||
|
(ratioUseStaleDataNodesForWrite > 0 &&
|
||||||
|
ratioUseStaleDataNodesForWrite <= 1.0f),
|
||||||
|
DFSConfigKeys.DFS_NAMENODE_USE_STALE_DATANODE_FOR_WRITE_RATIO_KEY +
|
||||||
|
" = '" + ratioUseStaleDataNodesForWrite + "' is invalid. " +
|
||||||
|
"It should be a positive non-zero float value, not greater than 1.0f.");
|
||||||
|
|
||||||
|
this.heartbeatRecheckInterval = (checkStaleNodes
|
||||||
|
&& initialAvoidWriteStaleNodes
|
||||||
|
&& staleInterval < recheckInterval) ? staleInterval : recheckInterval;
|
||||||
}
|
}
|
||||||
|
|
||||||
void activate(Configuration conf) {
|
void activate(Configuration conf) {
|
||||||
|
@ -210,18 +242,41 @@ class HeartbeatManager implements DatanodeStatistics {
|
||||||
if (namesystem.isInSafeMode()) {
|
if (namesystem.isInSafeMode()) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
boolean checkStaleNodes = dm.isCheckingForStaleDataNodes();
|
||||||
boolean allAlive = false;
|
boolean allAlive = false;
|
||||||
while (!allAlive) {
|
while (!allAlive) {
|
||||||
// locate the first dead node.
|
// locate the first dead node.
|
||||||
DatanodeID dead = null;
|
DatanodeID dead = null;
|
||||||
|
// check the number of stale nodes
|
||||||
|
int numOfStaleNodes = 0;
|
||||||
synchronized(this) {
|
synchronized(this) {
|
||||||
for (DatanodeDescriptor d : datanodes) {
|
for (DatanodeDescriptor d : datanodes) {
|
||||||
if (dm.isDatanodeDead(d)) {
|
if (dead == null && dm.isDatanodeDead(d)) {
|
||||||
stats.incrExpiredHeartbeats();
|
stats.incrExpiredHeartbeats();
|
||||||
dead = d;
|
dead = d;
|
||||||
|
if (!checkStaleNodes) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (checkStaleNodes &&
|
||||||
|
d.isStale(dm.getStaleInterval())) {
|
||||||
|
numOfStaleNodes++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Change whether to avoid using stale datanodes for writing
|
||||||
|
// based on proportion of stale datanodes
|
||||||
|
if (checkStaleNodes) {
|
||||||
|
dm.setNumStaleNodes(numOfStaleNodes);
|
||||||
|
if (numOfStaleNodes >
|
||||||
|
datanodes.size() * ratioUseStaleDataNodesForWrite) {
|
||||||
|
dm.setAvoidStaleDataNodesForWrite(false);
|
||||||
|
} else {
|
||||||
|
if (this.initialAvoidWriteStaleNodes) {
|
||||||
|
dm.setAvoidStaleDataNodesForWrite(true);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
allAlive = dead == null;
|
allAlive = dead == null;
|
||||||
|
|
|
@ -251,7 +251,7 @@ public class DataNode extends Configured
|
||||||
Daemon dataXceiverServer = null;
|
Daemon dataXceiverServer = null;
|
||||||
ThreadGroup threadGroup = null;
|
ThreadGroup threadGroup = null;
|
||||||
private DNConf dnConf;
|
private DNConf dnConf;
|
||||||
private boolean heartbeatsDisabledForTests = false;
|
private volatile boolean heartbeatsDisabledForTests = false;
|
||||||
private DataStorage storage = null;
|
private DataStorage storage = null;
|
||||||
private HttpServer infoServer = null;
|
private HttpServer infoServer = null;
|
||||||
DataNodeMetrics metrics;
|
DataNodeMetrics metrics;
|
||||||
|
|
|
@ -32,8 +32,16 @@ public interface FSClusterStats {
|
||||||
* @return a count of the total number of block transfers and block
|
* @return a count of the total number of block transfers and block
|
||||||
* writes that are currently occuring on the cluster.
|
* writes that are currently occuring on the cluster.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
public int getTotalLoad();
|
public int getTotalLoad();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Indicate whether or not the cluster is now avoiding
|
||||||
|
* to use stale DataNodes for writing.
|
||||||
|
*
|
||||||
|
* @return True if the cluster is currently avoiding using stale DataNodes
|
||||||
|
* for writing targets, and false otherwise.
|
||||||
|
*/
|
||||||
|
public boolean isAvoidingStaleDataNodesForWrite();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -5539,4 +5539,10 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
|
||||||
public void setNNResourceChecker(NameNodeResourceChecker nnResourceChecker) {
|
public void setNNResourceChecker(NameNodeResourceChecker nnResourceChecker) {
|
||||||
this.nnResourceChecker = nnResourceChecker;
|
this.nnResourceChecker = nnResourceChecker;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean isAvoidingStaleDataNodesForWrite() {
|
||||||
|
return this.blockManager.getDatanodeManager()
|
||||||
|
.isAvoidingStaleDataNodesForWrite();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -991,9 +991,25 @@
|
||||||
Indicate whether or not to check "stale" datanodes whose
|
Indicate whether or not to check "stale" datanodes whose
|
||||||
heartbeat messages have not been received by the namenode
|
heartbeat messages have not been received by the namenode
|
||||||
for more than a specified time interval. If this configuration
|
for more than a specified time interval. If this configuration
|
||||||
parameter is set as true, the stale datanodes will be moved to
|
parameter is set as true, the system will keep track
|
||||||
the end of the target node list for reading. The writing will
|
of the number of stale datanodes. The stale datanodes will be
|
||||||
also try to avoid stale nodes.
|
moved to the end of the node list returned for reading. See
|
||||||
|
dfs.namenode.avoid.write.stale.datanode for details on how this
|
||||||
|
affects writes.
|
||||||
|
</description>
|
||||||
|
</property>
|
||||||
|
|
||||||
|
<property>
|
||||||
|
<name>dfs.namenode.avoid.write.stale.datanode</name>
|
||||||
|
<value>false</value>
|
||||||
|
<description>
|
||||||
|
Indicate whether or not to avoid writing to "stale" datanodes whose
|
||||||
|
heartbeat messages have not been received by the namenode
|
||||||
|
for more than a specified time interval. If this configuration
|
||||||
|
parameter and dfs.namenode.check.stale.datanode are both set as true,
|
||||||
|
the writing will avoid using stale datanodes unless a high number
|
||||||
|
of datanodes are marked as stale. See
|
||||||
|
dfs.namenode.write.stale.datanode.ratio for details.
|
||||||
</description>
|
</description>
|
||||||
</property>
|
</property>
|
||||||
|
|
||||||
|
@ -1004,7 +1020,21 @@
|
||||||
Default time interval for marking a datanode as "stale", i.e., if
|
Default time interval for marking a datanode as "stale", i.e., if
|
||||||
the namenode has not received heartbeat msg from a datanode for
|
the namenode has not received heartbeat msg from a datanode for
|
||||||
more than this time interval, the datanode will be marked and treated
|
more than this time interval, the datanode will be marked and treated
|
||||||
as "stale" by default.
|
as "stale" by default. The stale interval cannot be too small since
|
||||||
|
otherwise this may cause too frequent change of stale states.
|
||||||
|
We thus set a minimum stale interval value (the default value is 3 times
|
||||||
|
of heartbeat interval) and guarantee that the stale interval cannot be less
|
||||||
|
than the minimum value.
|
||||||
|
</description>
|
||||||
|
</property>
|
||||||
|
|
||||||
|
<property>
|
||||||
|
<name>dfs.namenode.write.stale.datanode.ratio</name>
|
||||||
|
<value>0.5f</value>
|
||||||
|
<description>
|
||||||
|
When the ratio of number stale datanodes to total datanodes marked
|
||||||
|
is greater than this ratio, stop avoiding writing to stale nodes so
|
||||||
|
as to prevent causing hotspots.
|
||||||
</description>
|
</description>
|
||||||
</property>
|
</property>
|
||||||
|
|
||||||
|
|
|
@ -38,9 +38,12 @@ import org.apache.hadoop.hdfs.HdfsConfiguration;
|
||||||
import org.apache.hadoop.hdfs.MiniDFSCluster;
|
import org.apache.hadoop.hdfs.MiniDFSCluster;
|
||||||
import org.apache.hadoop.hdfs.protocol.Block;
|
import org.apache.hadoop.hdfs.protocol.Block;
|
||||||
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
|
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
|
||||||
|
import org.apache.hadoop.hdfs.server.datanode.DataNode;
|
||||||
|
import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils;
|
||||||
import org.apache.hadoop.hdfs.server.namenode.NameNode;
|
import org.apache.hadoop.hdfs.server.namenode.NameNode;
|
||||||
import org.apache.hadoop.net.NetworkTopology;
|
import org.apache.hadoop.net.NetworkTopology;
|
||||||
import org.apache.hadoop.net.Node;
|
import org.apache.hadoop.net.Node;
|
||||||
|
import org.apache.hadoop.util.Time;
|
||||||
import org.junit.BeforeClass;
|
import org.junit.BeforeClass;
|
||||||
import org.junit.Rule;
|
import org.junit.Rule;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
@ -55,6 +58,9 @@ public class TestReplicationPolicy {
|
||||||
private static BlockPlacementPolicy replicator;
|
private static BlockPlacementPolicy replicator;
|
||||||
private static final String filename = "/dummyfile.txt";
|
private static final String filename = "/dummyfile.txt";
|
||||||
private static DatanodeDescriptor dataNodes[];
|
private static DatanodeDescriptor dataNodes[];
|
||||||
|
// The interval for marking a datanode as stale,
|
||||||
|
private static long staleInterval =
|
||||||
|
DFSConfigKeys.DFS_NAMENODE_STALE_DATANODE_INTERVAL_DEFAULT;
|
||||||
|
|
||||||
@Rule
|
@Rule
|
||||||
public ExpectedException exception = ExpectedException.none();
|
public ExpectedException exception = ExpectedException.none();
|
||||||
|
@ -77,6 +83,8 @@ public class TestReplicationPolicy {
|
||||||
"test.build.data", "build/test/data"), "dfs/");
|
"test.build.data", "build/test/data"), "dfs/");
|
||||||
conf.set(DFSConfigKeys.DFS_NAMENODE_NAME_DIR_KEY,
|
conf.set(DFSConfigKeys.DFS_NAMENODE_NAME_DIR_KEY,
|
||||||
new File(baseDir, "name").getPath());
|
new File(baseDir, "name").getPath());
|
||||||
|
// Enable the checking for stale datanodes in the beginning
|
||||||
|
conf.setBoolean(DFSConfigKeys.DFS_NAMENODE_CHECK_STALE_DATANODE_KEY, true);
|
||||||
|
|
||||||
DFSTestUtil.formatNameNode(conf);
|
DFSTestUtil.formatNameNode(conf);
|
||||||
namenode = new NameNode(conf);
|
namenode = new NameNode(conf);
|
||||||
|
@ -370,6 +378,202 @@ public class TestReplicationPolicy {
|
||||||
assertFalse(cluster.isOnSameRack(targets[0], targets[1]));
|
assertFalse(cluster.isOnSameRack(targets[0], targets[1]));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private boolean containsWithinRange(DatanodeDescriptor target,
|
||||||
|
DatanodeDescriptor[] nodes, int startIndex, int endIndex) {
|
||||||
|
assert startIndex >= 0 && startIndex < nodes.length;
|
||||||
|
assert endIndex >= startIndex && endIndex < nodes.length;
|
||||||
|
for (int i = startIndex; i <= endIndex; i++) {
|
||||||
|
if (nodes[i].equals(target)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testChooseTargetWithStaleNodes() throws Exception {
|
||||||
|
// Enable avoidng writing to stale datanodes
|
||||||
|
namenode.getNamesystem().getBlockManager().getDatanodeManager()
|
||||||
|
.setAvoidStaleDataNodesForWrite(true);
|
||||||
|
// Set dataNodes[0] as stale
|
||||||
|
dataNodes[0].setLastUpdate(Time.now() - staleInterval - 1);
|
||||||
|
|
||||||
|
DatanodeDescriptor[] targets;
|
||||||
|
// We set the datanode[0] as stale, thus should choose datanode[1] since
|
||||||
|
// datanode[1] is on the same rack with datanode[0] (writer)
|
||||||
|
targets = replicator.chooseTarget(filename, 1, dataNodes[0],
|
||||||
|
new ArrayList<DatanodeDescriptor>(), BLOCK_SIZE);
|
||||||
|
assertEquals(targets.length, 1);
|
||||||
|
assertEquals(targets[0], dataNodes[1]);
|
||||||
|
|
||||||
|
HashMap<Node, Node> excludedNodes = new HashMap<Node, Node>();
|
||||||
|
excludedNodes.put(dataNodes[1], dataNodes[1]);
|
||||||
|
List<DatanodeDescriptor> chosenNodes = new ArrayList<DatanodeDescriptor>();
|
||||||
|
BlockPlacementPolicyDefault repl = (BlockPlacementPolicyDefault)replicator;
|
||||||
|
targets = chooseTarget(repl, 1, dataNodes[0], chosenNodes, excludedNodes,
|
||||||
|
BLOCK_SIZE);
|
||||||
|
assertEquals(targets.length, 1);
|
||||||
|
assertFalse(cluster.isOnSameRack(targets[0], dataNodes[0]));
|
||||||
|
|
||||||
|
// reset
|
||||||
|
namenode.getNamesystem().getBlockManager().getDatanodeManager()
|
||||||
|
.setAvoidStaleDataNodesForWrite(false);
|
||||||
|
dataNodes[0].setLastUpdate(Time.now());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* In this testcase, we set 3 nodes (dataNodes[0] ~ dataNodes[2]) as stale,
|
||||||
|
* and when the number of replicas is less or equal to 3, all the healthy
|
||||||
|
* datanodes should be returned by the chooseTarget method. When the number
|
||||||
|
* of replicas is 4, a stale node should be included.
|
||||||
|
*
|
||||||
|
* @throws Exception
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
public void testChooseTargetWithHalfStaleNodes() throws Exception {
|
||||||
|
// Enable stale datanodes checking
|
||||||
|
namenode.getNamesystem().getBlockManager().getDatanodeManager()
|
||||||
|
.setAvoidStaleDataNodesForWrite(true);
|
||||||
|
// Set dataNodes[0], dataNodes[1], and dataNodes[2] as stale
|
||||||
|
for (int i = 0; i < 3; i++) {
|
||||||
|
dataNodes[i].setLastUpdate(Time.now() - staleInterval - 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
DatanodeDescriptor[] targets;
|
||||||
|
targets = replicator.chooseTarget(filename, 0, dataNodes[0],
|
||||||
|
new ArrayList<DatanodeDescriptor>(), BLOCK_SIZE);
|
||||||
|
assertEquals(targets.length, 0);
|
||||||
|
|
||||||
|
// We set the datanode[0] as stale, thus should choose datanode[1]
|
||||||
|
targets = replicator.chooseTarget(filename, 1, dataNodes[0],
|
||||||
|
new ArrayList<DatanodeDescriptor>(), BLOCK_SIZE);
|
||||||
|
assertEquals(targets.length, 1);
|
||||||
|
assertFalse(containsWithinRange(targets[0], dataNodes, 0, 2));
|
||||||
|
|
||||||
|
targets = replicator.chooseTarget(filename, 2, dataNodes[0],
|
||||||
|
new ArrayList<DatanodeDescriptor>(), BLOCK_SIZE);
|
||||||
|
assertEquals(targets.length, 2);
|
||||||
|
assertFalse(containsWithinRange(targets[0], dataNodes, 0, 2));
|
||||||
|
assertFalse(containsWithinRange(targets[1], dataNodes, 0, 2));
|
||||||
|
|
||||||
|
targets = replicator.chooseTarget(filename, 3, dataNodes[0],
|
||||||
|
new ArrayList<DatanodeDescriptor>(), BLOCK_SIZE);
|
||||||
|
assertEquals(targets.length, 3);
|
||||||
|
assertTrue(containsWithinRange(targets[0], dataNodes, 3, 5));
|
||||||
|
assertTrue(containsWithinRange(targets[1], dataNodes, 3, 5));
|
||||||
|
assertTrue(containsWithinRange(targets[2], dataNodes, 3, 5));
|
||||||
|
|
||||||
|
targets = replicator.chooseTarget(filename, 4, dataNodes[0],
|
||||||
|
new ArrayList<DatanodeDescriptor>(), BLOCK_SIZE);
|
||||||
|
assertEquals(targets.length, 4);
|
||||||
|
assertTrue(containsWithinRange(dataNodes[3], targets, 0, 3));
|
||||||
|
assertTrue(containsWithinRange(dataNodes[4], targets, 0, 3));
|
||||||
|
assertTrue(containsWithinRange(dataNodes[5], targets, 0, 3));
|
||||||
|
|
||||||
|
// reset
|
||||||
|
namenode.getNamesystem().getBlockManager().getDatanodeManager()
|
||||||
|
.setAvoidStaleDataNodesForWrite(false);
|
||||||
|
for (int i = 0; i < dataNodes.length; i++) {
|
||||||
|
dataNodes[i].setLastUpdate(Time.now());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testChooseTargetWithMoreThanHalfStaleNodes() throws Exception {
|
||||||
|
HdfsConfiguration conf = new HdfsConfiguration();
|
||||||
|
conf.setBoolean(DFSConfigKeys.DFS_NAMENODE_CHECK_STALE_DATANODE_KEY, true);
|
||||||
|
conf.setBoolean(
|
||||||
|
DFSConfigKeys.DFS_NAMENODE_AVOID_STALE_DATANODE_FOR_WRITE_KEY, true);
|
||||||
|
String[] hosts = new String[]{"host1", "host2", "host3",
|
||||||
|
"host4", "host5", "host6"};
|
||||||
|
String[] racks = new String[]{"/d1/r1", "/d1/r1", "/d1/r2",
|
||||||
|
"/d1/r2", "/d2/r3", "/d2/r3"};
|
||||||
|
MiniDFSCluster miniCluster = new MiniDFSCluster.Builder(conf).racks(racks)
|
||||||
|
.hosts(hosts).numDataNodes(hosts.length).build();
|
||||||
|
miniCluster.waitActive();
|
||||||
|
|
||||||
|
try {
|
||||||
|
// Step 1. Make two datanodes as stale, check whether the
|
||||||
|
// avoidStaleDataNodesForWrite calculation is correct.
|
||||||
|
// First stop the heartbeat of host1 and host2
|
||||||
|
for (int i = 0; i < 2; i++) {
|
||||||
|
DataNode dn = miniCluster.getDataNodes().get(i);
|
||||||
|
DataNodeTestUtils.setHeartbeatsDisabledForTests(dn, true);
|
||||||
|
miniCluster.getNameNode().getNamesystem().getBlockManager()
|
||||||
|
.getDatanodeManager().getDatanode(dn.getDatanodeId())
|
||||||
|
.setLastUpdate(Time.now() - staleInterval - 1);
|
||||||
|
}
|
||||||
|
// Instead of waiting, explicitly call heartbeatCheck to
|
||||||
|
// let heartbeat manager to detect stale nodes
|
||||||
|
miniCluster.getNameNode().getNamesystem().getBlockManager()
|
||||||
|
.getDatanodeManager().getHeartbeatManager().heartbeatCheck();
|
||||||
|
int numStaleNodes = miniCluster.getNameNode().getNamesystem()
|
||||||
|
.getBlockManager().getDatanodeManager().getNumStaleNodes();
|
||||||
|
assertEquals(numStaleNodes, 2);
|
||||||
|
assertTrue(miniCluster.getNameNode().getNamesystem().getBlockManager()
|
||||||
|
.getDatanodeManager().isAvoidingStaleDataNodesForWrite());
|
||||||
|
// Call chooseTarget
|
||||||
|
DatanodeDescriptor staleNodeInfo = miniCluster.getNameNode()
|
||||||
|
.getNamesystem().getBlockManager().getDatanodeManager()
|
||||||
|
.getDatanode(miniCluster.getDataNodes().get(0).getDatanodeId());
|
||||||
|
BlockPlacementPolicy replicator = miniCluster.getNameNode()
|
||||||
|
.getNamesystem().getBlockManager().getBlockPlacementPolicy();
|
||||||
|
DatanodeDescriptor[] targets = replicator.chooseTarget(filename, 3,
|
||||||
|
staleNodeInfo, new ArrayList<DatanodeDescriptor>(), BLOCK_SIZE);
|
||||||
|
assertEquals(targets.length, 3);
|
||||||
|
assertFalse(cluster.isOnSameRack(targets[0], staleNodeInfo));
|
||||||
|
|
||||||
|
// Step 2. Set more than half of the datanodes as stale
|
||||||
|
for (int i = 0; i < 4; i++) {
|
||||||
|
DataNode dn = miniCluster.getDataNodes().get(i);
|
||||||
|
DataNodeTestUtils.setHeartbeatsDisabledForTests(dn, true);
|
||||||
|
miniCluster.getNameNode().getNamesystem().getBlockManager()
|
||||||
|
.getDatanodeManager().getDatanode(dn.getDatanodeId())
|
||||||
|
.setLastUpdate(Time.now() - staleInterval - 1);
|
||||||
|
}
|
||||||
|
// Explicitly call heartbeatCheck
|
||||||
|
miniCluster.getNameNode().getNamesystem().getBlockManager()
|
||||||
|
.getDatanodeManager().getHeartbeatManager().heartbeatCheck();
|
||||||
|
numStaleNodes = miniCluster.getNameNode().getNamesystem()
|
||||||
|
.getBlockManager().getDatanodeManager().getNumStaleNodes();
|
||||||
|
assertEquals(numStaleNodes, 4);
|
||||||
|
// According to our strategy, stale datanodes will be included for writing
|
||||||
|
// to avoid hotspots
|
||||||
|
assertFalse(miniCluster.getNameNode().getNamesystem().getBlockManager()
|
||||||
|
.getDatanodeManager().isAvoidingStaleDataNodesForWrite());
|
||||||
|
// Call chooseTarget
|
||||||
|
targets = replicator.chooseTarget(filename, 3,
|
||||||
|
staleNodeInfo, new ArrayList<DatanodeDescriptor>(), BLOCK_SIZE);
|
||||||
|
assertEquals(targets.length, 3);
|
||||||
|
assertTrue(cluster.isOnSameRack(targets[0], staleNodeInfo));
|
||||||
|
|
||||||
|
// Step 3. Set 2 stale datanodes back to healthy nodes,
|
||||||
|
// still have 2 stale nodes
|
||||||
|
for (int i = 2; i < 4; i++) {
|
||||||
|
DataNode dn = miniCluster.getDataNodes().get(i);
|
||||||
|
DataNodeTestUtils.setHeartbeatsDisabledForTests(dn, false);
|
||||||
|
miniCluster.getNameNode().getNamesystem().getBlockManager()
|
||||||
|
.getDatanodeManager().getDatanode(dn.getDatanodeId())
|
||||||
|
.setLastUpdate(Time.now());
|
||||||
|
}
|
||||||
|
// Explicitly call heartbeatCheck
|
||||||
|
miniCluster.getNameNode().getNamesystem().getBlockManager()
|
||||||
|
.getDatanodeManager().getHeartbeatManager().heartbeatCheck();
|
||||||
|
numStaleNodes = miniCluster.getNameNode().getNamesystem()
|
||||||
|
.getBlockManager().getDatanodeManager().getNumStaleNodes();
|
||||||
|
assertEquals(numStaleNodes, 2);
|
||||||
|
assertTrue(miniCluster.getNameNode().getNamesystem().getBlockManager()
|
||||||
|
.getDatanodeManager().isAvoidingStaleDataNodesForWrite());
|
||||||
|
// Call chooseTarget
|
||||||
|
targets = replicator.chooseTarget(filename, 3,
|
||||||
|
staleNodeInfo, new ArrayList<DatanodeDescriptor>(), BLOCK_SIZE);
|
||||||
|
assertEquals(targets.length, 3);
|
||||||
|
assertFalse(cluster.isOnSameRack(targets[0], staleNodeInfo));
|
||||||
|
} finally {
|
||||||
|
miniCluster.shutdown();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This testcase tests re-replication, when dataNodes[0] is already chosen.
|
* This testcase tests re-replication, when dataNodes[0] is already chosen.
|
||||||
* So the 1st replica can be placed on random rack.
|
* So the 1st replica can be placed on random rack.
|
||||||
|
@ -490,8 +694,8 @@ public class TestReplicationPolicy {
|
||||||
.format(true).build();
|
.format(true).build();
|
||||||
try {
|
try {
|
||||||
cluster.waitActive();
|
cluster.waitActive();
|
||||||
final UnderReplicatedBlocks neededReplications = (UnderReplicatedBlocks) cluster
|
final UnderReplicatedBlocks neededReplications = cluster.getNameNode()
|
||||||
.getNameNode().getNamesystem().getBlockManager().neededReplications;
|
.getNamesystem().getBlockManager().neededReplications;
|
||||||
for (int i = 0; i < 100; i++) {
|
for (int i = 0; i < 100; i++) {
|
||||||
// Adding the blocks directly to normal priority
|
// Adding the blocks directly to normal priority
|
||||||
neededReplications.add(new Block(random.nextLong()), 2, 0, 3);
|
neededReplications.add(new Block(random.nextLong()), 2, 0, 3);
|
||||||
|
@ -529,10 +733,10 @@ public class TestReplicationPolicy {
|
||||||
// Adding QUEUE_VERY_UNDER_REPLICATED block
|
// Adding QUEUE_VERY_UNDER_REPLICATED block
|
||||||
underReplicatedBlocks.add(new Block(random.nextLong()), 2, 0, 7);
|
underReplicatedBlocks.add(new Block(random.nextLong()), 2, 0, 7);
|
||||||
|
|
||||||
// Adding QUEUE_UNDER_REPLICATED block
|
// Adding QUEUE_REPLICAS_BADLY_DISTRIBUTED block
|
||||||
underReplicatedBlocks.add(new Block(random.nextLong()), 6, 0, 6);
|
underReplicatedBlocks.add(new Block(random.nextLong()), 6, 0, 6);
|
||||||
|
|
||||||
// Adding QUEUE_REPLICAS_BADLY_DISTRIBUTED block
|
// Adding QUEUE_UNDER_REPLICATED block
|
||||||
underReplicatedBlocks.add(new Block(random.nextLong()), 5, 0, 6);
|
underReplicatedBlocks.add(new Block(random.nextLong()), 5, 0, 6);
|
||||||
|
|
||||||
// Adding QUEUE_WITH_CORRUPT_BLOCKS block
|
// Adding QUEUE_WITH_CORRUPT_BLOCKS block
|
||||||
|
@ -618,6 +822,11 @@ public class TestReplicationPolicy {
|
||||||
dataNodes[5].setRemaining(1*1024*1024);
|
dataNodes[5].setRemaining(1*1024*1024);
|
||||||
replicaNodeList.add(dataNodes[5]);
|
replicaNodeList.add(dataNodes[5]);
|
||||||
|
|
||||||
|
// Refresh the last update time for all the datanodes
|
||||||
|
for (int i = 0; i < dataNodes.length; i++) {
|
||||||
|
dataNodes[i].setLastUpdate(Time.now());
|
||||||
|
}
|
||||||
|
|
||||||
List<DatanodeDescriptor> first = new ArrayList<DatanodeDescriptor>();
|
List<DatanodeDescriptor> first = new ArrayList<DatanodeDescriptor>();
|
||||||
List<DatanodeDescriptor> second = new ArrayList<DatanodeDescriptor>();
|
List<DatanodeDescriptor> second = new ArrayList<DatanodeDescriptor>();
|
||||||
replicator.splitNodesWithRack(
|
replicator.splitNodesWithRack(
|
||||||
|
|
Loading…
Reference in New Issue