svn merge -c 1611731 from trunk for HDFS-6700. BlockPlacementPolicy should choose storage but not datanode for deletion.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-2@1611734 13f79535-47bb-0310-9956-ffa450edef68
Tsz-wo Sze 2014-07-18 17:51:39 +00:00
parent 325a7d5d94
commit 9462e9e0e9
9 changed files with 179 additions and 149 deletions
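The essence of the patch: a datanode may report several storages (one per volume), so the NameNode must pick a specific storage, not just a node, when invalidating an excess replica. Below is a minimal self-contained sketch of that distinction; Storage and Node are hypothetical stand-ins for DatanodeStorageInfo and DatanodeDescriptor, not the real HDFS classes.

import java.util.Comparator;
import java.util.List;

class StorageVsNodeSketch {
  // Hypothetical stand-ins for DatanodeStorageInfo / DatanodeDescriptor.
  record Storage(String id, long remaining) {}
  record Node(String host, List<Storage> storages) {}

  public static void main(String[] args) {
    // One datanode with two volumes, hence two storages.
    Node dn = new Node("dn1", List.of(
        new Storage("DS-1", 4L << 20),
        new Storage("DS-2", 1L << 20)));
    // Naming only the node leaves the deletion target ambiguous: the excess
    // replica could sit on either DS-1 or DS-2. Choosing a storage is exact.
    Storage victim = dn.storages().stream()
        .min(Comparator.comparingLong(Storage::remaining))
        .orElseThrow();
    System.out.println("invalidate on " + dn.host() + ":" + victim.id());
  }
}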


@@ -49,6 +49,9 @@ Release 2.6.0 - UNRELEASED
HDFS-6597. Add a new option to NN upgrade to terminate the process after
upgrade on NN is completed. (Danilo Vunjak via cnauroth)
HDFS-6700. BlockPlacementPolicy should choose storage but not datanode for
deletion. (szetszwo)
OPTIMIZATIONS
HDFS-6690. Deduplicate xattr names in memory. (wang)


@@ -725,7 +725,6 @@ public class BlockManager {
final List<DatanodeStorageInfo> locations
= new ArrayList<DatanodeStorageInfo>(blocksMap.numNodes(block));
for(DatanodeStorageInfo storage : blocksMap.getStorages(block)) {
final String storageID = storage.getStorageID();
// filter invalidate replicas
if(!invalidateBlocks.contains(storage.getDatanodeDescriptor(), block)) {
locations.add(storage);
@@ -2640,7 +2639,7 @@ public class BlockManager {
if (addedNode == delNodeHint) {
delNodeHint = null;
}
Collection<DatanodeDescriptor> nonExcess = new ArrayList<DatanodeDescriptor>();
Collection<DatanodeStorageInfo> nonExcess = new ArrayList<DatanodeStorageInfo>();
Collection<DatanodeDescriptor> corruptNodes = corruptReplicas
.getNodes(block);
for(DatanodeStorageInfo storage : blocksMap.getStorages(block, State.NORMAL)) {
@@ -2660,7 +2659,7 @@ public class BlockManager {
if (!cur.isDecommissionInProgress() && !cur.isDecommissioned()) {
// exclude corrupt replicas
if (corruptNodes == null || !corruptNodes.contains(cur)) {
nonExcess.add(cur);
nonExcess.add(storage);
}
}
}
@@ -2684,7 +2683,7 @@ public class BlockManager {
* If no such node is available,
* then pick a node with the least free space
*/
private void chooseExcessReplicates(Collection<DatanodeDescriptor> nonExcess,
private void chooseExcessReplicates(final Collection<DatanodeStorageInfo> nonExcess,
Block b, short replication,
DatanodeDescriptor addedNode,
DatanodeDescriptor delNodeHint,
@@ -2692,28 +2691,33 @@ public class BlockManager {
assert namesystem.hasWriteLock();
// first form a rack to storages map
BlockCollection bc = getBlockCollection(b);
final Map<String, List<DatanodeDescriptor>> rackMap
= new HashMap<String, List<DatanodeDescriptor>>();
final List<DatanodeDescriptor> moreThanOne = new ArrayList<DatanodeDescriptor>();
final List<DatanodeDescriptor> exactlyOne = new ArrayList<DatanodeDescriptor>();
final Map<String, List<DatanodeStorageInfo>> rackMap
= new HashMap<String, List<DatanodeStorageInfo>>();
final List<DatanodeStorageInfo> moreThanOne = new ArrayList<DatanodeStorageInfo>();
final List<DatanodeStorageInfo> exactlyOne = new ArrayList<DatanodeStorageInfo>();
// split nodes into two sets
// moreThanOne contains nodes on rack with more than one replica
// exactlyOne contains the remaining nodes
replicator.splitNodesWithRack(nonExcess, rackMap, moreThanOne,
exactlyOne);
replicator.splitNodesWithRack(nonExcess, rackMap, moreThanOne, exactlyOne);
// pick one node to delete that favors the delete hint
// otherwise pick one with least space from moreThanOne if it is not empty
// otherwise pick one with least space from exactlyOne
boolean firstOne = true;
final DatanodeStorageInfo delNodeHintStorage
= DatanodeStorageInfo.getDatanodeStorageInfo(nonExcess, delNodeHint);
final DatanodeStorageInfo addedNodeStorage
= DatanodeStorageInfo.getDatanodeStorageInfo(nonExcess, addedNode);
while (nonExcess.size() - replication > 0) {
// check if we can delete delNodeHint
final DatanodeInfo cur;
if (firstOne && delNodeHint !=null && nonExcess.contains(delNodeHint)
&& (moreThanOne.contains(delNodeHint)
|| (addedNode != null && !moreThanOne.contains(addedNode))) ) {
cur = delNodeHint;
final DatanodeStorageInfo cur;
if (firstOne && delNodeHintStorage != null
&& (moreThanOne.contains(delNodeHintStorage)
|| (addedNodeStorage != null
&& !moreThanOne.contains(addedNodeStorage)))) {
cur = delNodeHintStorage;
} else { // regular excessive replica removal
cur = replicator.chooseReplicaToDelete(bc, b, replication,
moreThanOne, exactlyOne);
@@ -2725,7 +2729,7 @@ public class BlockManager {
exactlyOne, cur);
nonExcess.remove(cur);
addToExcessReplicate(cur, b);
addToExcessReplicate(cur.getDatanodeDescriptor(), b);
//
// The 'excessblocks' tracks blocks until we get confirmation
@@ -2736,7 +2740,7 @@ public class BlockManager {
// should be deleted. Items are removed from the invalidate list
// upon giving instructions to the namenode.
//
addToInvalidates(b, cur);
addToInvalidates(b, cur.getDatanodeDescriptor());
blockLog.info("BLOCK* chooseExcessReplicates: "
+"("+cur+", "+b+") is added to invalidated blocks set");
}
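The rewritten loop above keeps a simple invariant: one storage is invalidated per iteration until exactly replication replicas remain, and delNodeHintStorage is honored only on the first pass and only when removing it cannot break rack diversity. A rough self-contained sketch of that loop shape follows; the strings and the last-element fallback are stand-ins, not the real chooseReplicaToDelete logic.

import java.util.ArrayList;
import java.util.List;

class ExcessReplicaLoopSketch {
  public static void main(String[] args) {
    List<String> nonExcess = new ArrayList<>(List.of("s0", "s1", "s2", "s3"));
    int replication = 2;
    String delNodeHintStorage = "s2"; // honored on the first pass only
    boolean firstOne = true;
    while (nonExcess.size() - replication > 0) {
      // Stand-in for the policy's chooseReplicaToDelete(): take the hint
      // first, then simply the last remaining storage.
      String cur = (firstOne && nonExcess.contains(delNodeHintStorage))
          ? delNodeHintStorage
          : nonExcess.get(nonExcess.size() - 1);
      firstOne = false;
      nonExcess.remove(cur);
      System.out.println("invalidate " + cur); // cf. addToInvalidates(b, ...)
    }
  }
}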


@@ -124,11 +124,12 @@ public abstract class BlockPlacementPolicy {
* listed in the previous parameter.
* @return the replica that is the best candidate for deletion
*/
abstract public DatanodeDescriptor chooseReplicaToDelete(BlockCollection srcBC,
Block block,
short replicationFactor,
Collection<DatanodeDescriptor> existingReplicas,
Collection<DatanodeDescriptor> moreExistingReplicas);
abstract public DatanodeStorageInfo chooseReplicaToDelete(
BlockCollection srcBC,
Block block,
short replicationFactor,
Collection<DatanodeStorageInfo> existingReplicas,
Collection<DatanodeStorageInfo> moreExistingReplicas);
/**
* Used to setup a BlockPlacementPolicy object. This should be defined by
@@ -175,21 +176,23 @@ public abstract class BlockPlacementPolicy {
* @param exactlyOne The List of replica nodes on rack with only one replica
* @param cur current replica to remove
*/
public void adjustSetsWithChosenReplica(final Map<String,
List<DatanodeDescriptor>> rackMap,
final List<DatanodeDescriptor> moreThanOne,
final List<DatanodeDescriptor> exactlyOne, final DatanodeInfo cur) {
public void adjustSetsWithChosenReplica(
final Map<String, List<DatanodeStorageInfo>> rackMap,
final List<DatanodeStorageInfo> moreThanOne,
final List<DatanodeStorageInfo> exactlyOne,
final DatanodeStorageInfo cur) {
String rack = getRack(cur);
final List<DatanodeDescriptor> datanodes = rackMap.get(rack);
datanodes.remove(cur);
if (datanodes.isEmpty()) {
final String rack = getRack(cur.getDatanodeDescriptor());
final List<DatanodeStorageInfo> storages = rackMap.get(rack);
storages.remove(cur);
if (storages.isEmpty()) {
rackMap.remove(rack);
}
if (moreThanOne.remove(cur)) {
if (datanodes.size() == 1) {
moreThanOne.remove(datanodes.get(0));
exactlyOne.add(datanodes.get(0));
if (storages.size() == 1) {
final DatanodeStorageInfo remaining = storages.get(0);
moreThanOne.remove(remaining);
exactlyOne.add(remaining);
}
} else {
exactlyOne.remove(cur);
@@ -214,28 +217,28 @@ public abstract class BlockPlacementPolicy {
* @param exactlyOne contains the remaining nodes
*/
public void splitNodesWithRack(
Collection<DatanodeDescriptor> dataNodes,
final Map<String, List<DatanodeDescriptor>> rackMap,
final List<DatanodeDescriptor> moreThanOne,
final List<DatanodeDescriptor> exactlyOne) {
for(DatanodeDescriptor node : dataNodes) {
final String rackName = getRack(node);
List<DatanodeDescriptor> datanodeList = rackMap.get(rackName);
if (datanodeList == null) {
datanodeList = new ArrayList<DatanodeDescriptor>();
rackMap.put(rackName, datanodeList);
final Iterable<DatanodeStorageInfo> storages,
final Map<String, List<DatanodeStorageInfo>> rackMap,
final List<DatanodeStorageInfo> moreThanOne,
final List<DatanodeStorageInfo> exactlyOne) {
for(DatanodeStorageInfo s: storages) {
final String rackName = getRack(s.getDatanodeDescriptor());
List<DatanodeStorageInfo> storageList = rackMap.get(rackName);
if (storageList == null) {
storageList = new ArrayList<DatanodeStorageInfo>();
rackMap.put(rackName, storageList);
}
datanodeList.add(node);
storageList.add(s);
}
// split nodes into two sets
for(List<DatanodeDescriptor> datanodeList : rackMap.values()) {
if (datanodeList.size() == 1) {
for(List<DatanodeStorageInfo> storageList : rackMap.values()) {
if (storageList.size() == 1) {
// exactlyOne contains nodes on rack with only one replica
exactlyOne.add(datanodeList.get(0));
exactlyOne.add(storageList.get(0));
} else {
// moreThanOne contains nodes on rack with more than one replica
moreThanOne.addAll(datanodeList);
moreThanOne.addAll(storageList);
}
}
}
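splitNodesWithRack is what makes the deletion rack-safe: any storage landing in moreThanOne shares a rack with another replica, so removing it cannot reduce the number of racks holding the block. A small illustration with plain strings standing in for DatanodeStorageInfo; the rack assignments are hypothetical but chosen to match the updated TestReplicationPolicy further down.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

class SplitNodesWithRackSketch {
  public static void main(String[] args) {
    // storage id -> rack, standing in for getRack(s.getDatanodeDescriptor()).
    Map<String, String> rackOf = new LinkedHashMap<>();
    rackOf.put("s0", "/rack1");
    rackOf.put("s1", "/rack1");
    rackOf.put("s2", "/rack2");
    rackOf.put("s5", "/rack3");

    // Group by rack, then split by group size, as in the method above.
    Map<String, List<String>> rackMap = new HashMap<>();
    for (Map.Entry<String, String> e : rackOf.entrySet()) {
      rackMap.computeIfAbsent(e.getValue(), r -> new ArrayList<>()).add(e.getKey());
    }
    List<String> moreThanOne = new ArrayList<>();
    List<String> exactlyOne = new ArrayList<>();
    for (List<String> storages : rackMap.values()) {
      (storages.size() == 1 ? exactlyOne : moreThanOne).addAll(storages);
    }
    // Prints moreThanOne=[s0, s1] and exactlyOne=[s2, s5] (order may vary).
    System.out.println("moreThanOne=" + moreThanOne + " exactlyOne=" + exactlyOne);
  }
}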


@@ -727,31 +727,34 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
}
@Override
public DatanodeDescriptor chooseReplicaToDelete(BlockCollection bc,
public DatanodeStorageInfo chooseReplicaToDelete(BlockCollection bc,
Block block, short replicationFactor,
Collection<DatanodeDescriptor> first,
Collection<DatanodeDescriptor> second) {
Collection<DatanodeStorageInfo> first,
Collection<DatanodeStorageInfo> second) {
long oldestHeartbeat =
now() - heartbeatInterval * tolerateHeartbeatMultiplier;
DatanodeDescriptor oldestHeartbeatNode = null;
DatanodeStorageInfo oldestHeartbeatStorage = null;
long minSpace = Long.MAX_VALUE;
DatanodeDescriptor minSpaceNode = null;
DatanodeStorageInfo minSpaceStorage = null;
// Pick the node with the oldest heartbeat or with the least free space,
// if all heartbeats are within the tolerable heartbeat interval
for(DatanodeDescriptor node : pickupReplicaSet(first, second)) {
for(DatanodeStorageInfo storage : pickupReplicaSet(first, second)) {
final DatanodeDescriptor node = storage.getDatanodeDescriptor();
long free = node.getRemaining();
long lastHeartbeat = node.getLastUpdate();
if(lastHeartbeat < oldestHeartbeat) {
oldestHeartbeat = lastHeartbeat;
oldestHeartbeatNode = node;
oldestHeartbeatStorage = storage;
}
if (minSpace > free) {
minSpace = free;
minSpaceNode = node;
minSpaceStorage = storage;
}
}
return oldestHeartbeatNode != null ? oldestHeartbeatNode : minSpaceNode;
return oldestHeartbeatStorage != null? oldestHeartbeatStorage
: minSpaceStorage;
}
/**
@@ -760,9 +763,9 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
* replica while second set contains remaining replica nodes.
* So pick up first set if not empty. If first is empty, then pick second.
*/
protected Collection<DatanodeDescriptor> pickupReplicaSet(
Collection<DatanodeDescriptor> first,
Collection<DatanodeDescriptor> second) {
protected Collection<DatanodeStorageInfo> pickupReplicaSet(
Collection<DatanodeStorageInfo> first,
Collection<DatanodeStorageInfo> second) {
return first.isEmpty() ? second : first;
}
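The default policy's rule is two-tiered: a storage whose node has not heartbeated within heartbeatInterval * tolerateHeartbeatMultiplier is preferred (oldest such heartbeat first), and only when all heartbeats are recent does least remaining space decide. The sketch below restates that rule over hypothetical Candidate records; it mirrors, but is not, the code above.

import java.util.List;

class ChooseReplicaToDeleteSketch {
  record Candidate(String id, long lastHeartbeat, long remaining) {}

  static Candidate choose(List<Candidate> candidates, long staleCutoff) {
    Candidate oldest = null;   // node with the oldest heartbeat past the cutoff
    Candidate minSpace = null; // fallback: node with the least free space
    long oldestHeartbeat = staleCutoff;
    long minRemaining = Long.MAX_VALUE;
    for (Candidate c : candidates) {
      if (c.lastHeartbeat() < oldestHeartbeat) {
        oldestHeartbeat = c.lastHeartbeat();
        oldest = c;
      }
      if (c.remaining() < minRemaining) {
        minRemaining = c.remaining();
        minSpace = c;
      }
    }
    return oldest != null ? oldest : minSpace;
  }

  public static void main(String[] args) {
    // Both heartbeats are at or above the cutoff of 1000, so free space
    // decides and s1 (3 MB remaining vs 4 MB) is chosen.
    System.out.println(choose(List.of(
        new Candidate("s0", 1500, 4L << 20),
        new Candidate("s1", 1600, 3L << 20)), 1000).id());
  }
}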


@@ -288,9 +288,9 @@ public class BlockPlacementPolicyWithNodeGroup extends BlockPlacementPolicyDefault
* If first is empty, then pick second.
*/
@Override
public Collection<DatanodeDescriptor> pickupReplicaSet(
Collection<DatanodeDescriptor> first,
Collection<DatanodeDescriptor> second) {
public Collection<DatanodeStorageInfo> pickupReplicaSet(
Collection<DatanodeStorageInfo> first,
Collection<DatanodeStorageInfo> second) {
// If no replica within same rack, return directly.
if (first.isEmpty()) {
return second;
@@ -298,25 +298,24 @@ public class BlockPlacementPolicyWithNodeGroup extends BlockPlacementPolicyDefault
// Split data nodes in the first set into two sets,
// moreThanOne contains nodes on nodegroup with more than one replica
// exactlyOne contains the remaining nodes
Map<String, List<DatanodeDescriptor>> nodeGroupMap =
new HashMap<String, List<DatanodeDescriptor>>();
Map<String, List<DatanodeStorageInfo>> nodeGroupMap =
new HashMap<String, List<DatanodeStorageInfo>>();
for(DatanodeDescriptor node : first) {
final String nodeGroupName =
NetworkTopology.getLastHalf(node.getNetworkLocation());
List<DatanodeDescriptor> datanodeList =
nodeGroupMap.get(nodeGroupName);
if (datanodeList == null) {
datanodeList = new ArrayList<DatanodeDescriptor>();
nodeGroupMap.put(nodeGroupName, datanodeList);
for(DatanodeStorageInfo storage : first) {
final String nodeGroupName = NetworkTopology.getLastHalf(
storage.getDatanodeDescriptor().getNetworkLocation());
List<DatanodeStorageInfo> storageList = nodeGroupMap.get(nodeGroupName);
if (storageList == null) {
storageList = new ArrayList<DatanodeStorageInfo>();
nodeGroupMap.put(nodeGroupName, storageList);
}
datanodeList.add(node);
storageList.add(storage);
}
final List<DatanodeDescriptor> moreThanOne = new ArrayList<DatanodeDescriptor>();
final List<DatanodeDescriptor> exactlyOne = new ArrayList<DatanodeDescriptor>();
final List<DatanodeStorageInfo> moreThanOne = new ArrayList<DatanodeStorageInfo>();
final List<DatanodeStorageInfo> exactlyOne = new ArrayList<DatanodeStorageInfo>();
// split nodes into two sets
for(List<DatanodeDescriptor> datanodeList : nodeGroupMap.values()) {
for(List<DatanodeStorageInfo> datanodeList : nodeGroupMap.values()) {
if (datanodeList.size() == 1 ) {
// exactlyOne contains nodes on nodegroup with exactly one replica
exactlyOne.add(datanodeList.get(0));


@@ -22,6 +22,7 @@ import java.util.Iterator;
import java.util.List;
import com.google.common.annotations.VisibleForTesting;
import org.apache.hadoop.hdfs.StorageType;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.server.protocol.DatanodeStorage;
@@ -290,4 +291,21 @@ public class DatanodeStorageInfo {
public String toString() {
return "[" + storageType + "]" + storageID + ":" + state;
}
/** @return the first {@link DatanodeStorageInfo} corresponding to
* the given datanode
*/
static DatanodeStorageInfo getDatanodeStorageInfo(
final Iterable<DatanodeStorageInfo> infos,
final DatanodeDescriptor datanode) {
if (datanode == null) {
return null;
}
for(DatanodeStorageInfo storage : infos) {
if (storage.getDatanodeDescriptor() == datanode) {
return storage;
}
}
return null;
}
}
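Note that getDatanodeStorageInfo matches descriptors with ==, not equals(). That should be safe here since the NameNode keeps a single canonical DatanodeDescriptor per datanode, so delNodeHint and the descriptors behind the storages are the same instances. A toy version of the identity-based lookup, again with hypothetical stand-in types:

import java.util.List;

class IdentityLookupSketch {
  static final class Node {
    final String host;
    Node(String host) { this.host = host; }
  }
  record Storage(String id, Node node) {}

  // Return the first storage attached to exactly this node instance.
  static Storage firstStorageOf(Iterable<Storage> storages, Node datanode) {
    if (datanode == null) {
      return null;
    }
    for (Storage s : storages) {
      if (s.node() == datanode) { // identity comparison, as in the patch
        return s;
      }
    }
    return null;
  }

  public static void main(String[] args) {
    Node dn = new Node("dn1");
    List<Storage> infos = List.of(new Storage("DS-1", dn), new Storage("DS-2", dn));
    System.out.println(firstStorageOf(infos, dn).id()); // DS-1: first match wins
  }
}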


@@ -893,6 +893,54 @@ public class TestReplicationPolicy {
UnderReplicatedBlocks.QUEUE_WITH_CORRUPT_BLOCKS).size());
}
/**
* Test that chooseReplicaToDelete picks replicas for removal based on
* block locality and free space
*/
@Test
public void testChooseReplicaToDelete() throws Exception {
List<DatanodeStorageInfo> replicaList = new ArrayList<DatanodeStorageInfo>();
final Map<String, List<DatanodeStorageInfo>> rackMap
= new HashMap<String, List<DatanodeStorageInfo>>();
dataNodes[0].setRemaining(4*1024*1024);
replicaList.add(storages[0]);
dataNodes[1].setRemaining(3*1024*1024);
replicaList.add(storages[1]);
dataNodes[2].setRemaining(2*1024*1024);
replicaList.add(storages[2]);
dataNodes[5].setRemaining(1*1024*1024);
replicaList.add(storages[5]);
// Refresh the last update time for all the datanodes
for (int i = 0; i < dataNodes.length; i++) {
dataNodes[i].setLastUpdate(Time.now());
}
List<DatanodeStorageInfo> first = new ArrayList<DatanodeStorageInfo>();
List<DatanodeStorageInfo> second = new ArrayList<DatanodeStorageInfo>();
replicator.splitNodesWithRack(replicaList, rackMap, first, second);
// storages[0] and storages[1] are in the first set as their rack has two
// replica nodes, while storages[2] and storages[5] are in the second set.
assertEquals(2, first.size());
assertEquals(2, second.size());
DatanodeStorageInfo chosen = replicator.chooseReplicaToDelete(
null, null, (short)3, first, second);
// Within the first set, storages[1] has the least free space
assertEquals(chosen, storages[1]);
replicator.adjustSetsWithChosenReplica(rackMap, first, second, chosen);
assertEquals(0, first.size());
assertEquals(3, second.size());
// Within the second set, storages[5] has the least free space
chosen = replicator.chooseReplicaToDelete(
null, null, (short)2, first, second);
assertEquals(chosen, storages[5]);
}
/**
* This testcase tests whether the default value returned by
* DFSUtil.getInvalidateWorkPctPerIteration() is positive,
@@ -980,50 +1028,4 @@ public class TestReplicationPolicy {
exception.expect(IllegalArgumentException.class);
blocksReplWorkMultiplier = DFSUtil.getReplWorkMultiplier(conf);
}
/**
* Test for the chooseReplicaToDelete are processed based on
* block locality and free space
*/
@Test
public void testChooseReplicaToDelete() throws Exception {
List<DatanodeDescriptor> replicaNodeList = new
ArrayList<DatanodeDescriptor>();
final Map<String, List<DatanodeDescriptor>> rackMap
= new HashMap<String, List<DatanodeDescriptor>>();
dataNodes[0].setRemaining(4*1024*1024);
replicaNodeList.add(dataNodes[0]);
dataNodes[1].setRemaining(3*1024*1024);
replicaNodeList.add(dataNodes[1]);
dataNodes[2].setRemaining(2*1024*1024);
replicaNodeList.add(dataNodes[2]);
dataNodes[5].setRemaining(1*1024*1024);
replicaNodeList.add(dataNodes[5]);
List<DatanodeDescriptor> first = new ArrayList<DatanodeDescriptor>();
List<DatanodeDescriptor> second = new ArrayList<DatanodeDescriptor>();
replicator.splitNodesWithRack(
replicaNodeList, rackMap, first, second);
// dataNodes[0] and dataNodes[1] are in first set as their rack has two
// replica nodes, while datanodes[2] and dataNodes[5] are in second set.
assertEquals(2, first.size());
assertEquals(2, second.size());
DatanodeDescriptor chosenNode = replicator.chooseReplicaToDelete(
null, null, (short)3, first, second);
// Within first set, dataNodes[1] with less free space
assertEquals(chosenNode, dataNodes[1]);
replicator.adjustSetsWithChosenReplica(
rackMap, first, second, chosenNode);
assertEquals(0, first.size());
assertEquals(3, second.size());
// Within second set, dataNodes[5] with less free space
chosenNode = replicator.chooseReplicaToDelete(
null, null, (short)2, first, second);
assertEquals(chosenNode, dataNodes[5]);
}
}


@@ -591,51 +591,50 @@ public class TestReplicationPolicyWithNodeGroup {
*/
@Test
public void testChooseReplicaToDelete() throws Exception {
List<DatanodeDescriptor> replicaNodeList =
new ArrayList<DatanodeDescriptor>();
final Map<String, List<DatanodeDescriptor>> rackMap =
new HashMap<String, List<DatanodeDescriptor>>();
List<DatanodeStorageInfo> replicaList = new ArrayList<DatanodeStorageInfo>();
final Map<String, List<DatanodeStorageInfo>> rackMap
= new HashMap<String, List<DatanodeStorageInfo>>();
dataNodes[0].setRemaining(4*1024*1024);
replicaNodeList.add(dataNodes[0]);
replicaList.add(storages[0]);
dataNodes[1].setRemaining(3*1024*1024);
replicaNodeList.add(dataNodes[1]);
replicaList.add(storages[1]);
dataNodes[2].setRemaining(2*1024*1024);
replicaNodeList.add(dataNodes[2]);
replicaList.add(storages[2]);
dataNodes[5].setRemaining(1*1024*1024);
replicaNodeList.add(dataNodes[5]);
replicaList.add(storages[5]);
List<DatanodeDescriptor> first = new ArrayList<DatanodeDescriptor>();
List<DatanodeDescriptor> second = new ArrayList<DatanodeDescriptor>();
List<DatanodeStorageInfo> first = new ArrayList<DatanodeStorageInfo>();
List<DatanodeStorageInfo> second = new ArrayList<DatanodeStorageInfo>();
replicator.splitNodesWithRack(
replicaNodeList, rackMap, first, second);
replicaList, rackMap, first, second);
assertEquals(3, first.size());
assertEquals(1, second.size());
DatanodeDescriptor chosenNode = replicator.chooseReplicaToDelete(
DatanodeStorageInfo chosen = replicator.chooseReplicaToDelete(
null, null, (short)3, first, second);
// Within first set {dataNodes[0], dataNodes[1], dataNodes[2]},
// dataNodes[0] and dataNodes[1] are in the same nodegroup,
// but dataNodes[1] is chosen since it has less free space
assertEquals(chosenNode, dataNodes[1]);
assertEquals(chosen, storages[1]);
replicator.adjustSetsWithChosenReplica(rackMap, first, second, chosenNode);
replicator.adjustSetsWithChosenReplica(rackMap, first, second, chosen);
assertEquals(2, first.size());
assertEquals(1, second.size());
// Within the first set {dataNodes[0], dataNodes[2]}, dataNodes[2] is chosen
// since it has less free space
chosenNode = replicator.chooseReplicaToDelete(
chosen = replicator.chooseReplicaToDelete(
null, null, (short)2, first, second);
assertEquals(chosenNode, dataNodes[2]);
assertEquals(chosen, storages[2]);
replicator.adjustSetsWithChosenReplica(rackMap, first, second, chosenNode);
replicator.adjustSetsWithChosenReplica(rackMap, first, second, chosen);
assertEquals(0, first.size());
assertEquals(2, second.size());
// Within the second set, dataNodes[5] has the least free space
chosenNode = replicator.chooseReplicaToDelete(
chosen = replicator.chooseReplicaToDelete(
null, null, (short)1, first, second);
assertEquals(chosenNode, dataNodes[5]);
assertEquals(chosen, storages[5]);
}
/**


@@ -46,7 +46,7 @@ import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
import org.apache.hadoop.hdfs.server.blockmanagement.BlockManagerTestUtil;
import org.apache.hadoop.hdfs.server.blockmanagement.BlockPlacementPolicy;
import org.apache.hadoop.hdfs.server.blockmanagement.BlockPlacementPolicyDefault;
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor;
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeStorageInfo;
import org.apache.hadoop.hdfs.server.datanode.DataNode;
import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils;
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
@@ -585,15 +585,14 @@ public class TestDNFencing {
}
@Override
public DatanodeDescriptor chooseReplicaToDelete(BlockCollection inode,
public DatanodeStorageInfo chooseReplicaToDelete(BlockCollection inode,
Block block, short replicationFactor,
Collection<DatanodeDescriptor> first,
Collection<DatanodeDescriptor> second) {
Collection<DatanodeStorageInfo> first,
Collection<DatanodeStorageInfo> second) {
Collection<DatanodeDescriptor> chooseFrom =
!first.isEmpty() ? first : second;
Collection<DatanodeStorageInfo> chooseFrom = !first.isEmpty() ? first : second;
List<DatanodeDescriptor> l = Lists.newArrayList(chooseFrom);
List<DatanodeStorageInfo> l = Lists.newArrayList(chooseFrom);
return l.get(DFSUtil.getRandom().nextInt(l.size()));
}
}