HADOOP-8468. Add NetworkTopologyWithNodeGroup, a 4-layer implementation of NetworkTopology. Contributed by Junping Du

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1351163 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Tsz-wo Sze 2012-06-17 21:12:25 +00:00
parent 5572f73d45
commit 19ac5c4e8b
4 changed files with 575 additions and 3 deletions

View File

@ -12,6 +12,9 @@ Trunk (unreleased changes)
HADOOP-8469. Make NetworkTopology class pluggable. (Junping Du via
szetszwo)
HADOOP-8468. Add NetworkTopologyWithNodeGroup, a 4-layer implementation
of NetworkTopology. (Junping Du via szetszwo)
IMPROVEMENTS
HADOOP-8017. Configure hadoop-main pom to get rid of M2E plugin execution

View File

@ -0,0 +1,398 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.net;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
/**
* The class extends NetworkTopology to represents a cluster of computer with
* a 4-layers hierarchical network topology.
* In this network topology, leaves represent data nodes (computers) and inner
* nodes represent switches/routers that manage traffic in/out of data centers,
* racks or physical host (with virtual switch).
*
* @see NetworkTopology
*/
@InterfaceAudience.LimitedPrivate({"HDFS", "MapReduce"})
@InterfaceStability.Unstable
public class NetworkTopologyWithNodeGroup extends NetworkTopology {
public final static String DEFAULT_NODEGROUP = "/default-nodegroup";
public NetworkTopologyWithNodeGroup() {
clusterMap = new InnerNodeWithNodeGroup(InnerNode.ROOT);
}
@Override
protected Node getNodeForNetworkLocation(Node node) {
// if node only with default rack info, here we need to add default
// nodegroup info
if (NetworkTopology.DEFAULT_RACK.equals(node.getNetworkLocation())) {
node.setNetworkLocation(node.getNetworkLocation()
+ DEFAULT_NODEGROUP);
}
Node nodeGroup = getNode(node.getNetworkLocation());
if (nodeGroup == null) {
nodeGroup = new InnerNode(node.getNetworkLocation());
}
return getNode(nodeGroup.getNetworkLocation());
}
@Override
public String getRack(String loc) {
netlock.readLock().lock();
try {
loc = InnerNode.normalize(loc);
Node locNode = getNode(loc);
if (locNode instanceof InnerNodeWithNodeGroup) {
InnerNodeWithNodeGroup node = (InnerNodeWithNodeGroup) locNode;
if (node.isRack()) {
return loc;
} else if (node.isNodeGroup()) {
return node.getNetworkLocation();
} else {
// may be a data center
return null;
}
} else {
// not in cluster map, don't handle it
return loc;
}
} finally {
netlock.readLock().unlock();
}
}
/**
* Given a string representation of a node group for a specific network
* location
*
* @param loc
* a path-like string representation of a network location
* @return a node group string
*/
public String getNodeGroup(String loc) {
netlock.readLock().lock();
try {
loc = InnerNode.normalize(loc);
Node locNode = getNode(loc);
if (locNode instanceof InnerNodeWithNodeGroup) {
InnerNodeWithNodeGroup node = (InnerNodeWithNodeGroup) locNode;
if (node.isNodeGroup()) {
return loc;
} else if (node.isRack()) {
// not sure the node group for a rack
return null;
} else {
// may be a leaf node
return getNodeGroup(node.getNetworkLocation());
}
} else {
// not in cluster map, don't handle it
return loc;
}
} finally {
netlock.readLock().unlock();
}
}
@Override
public boolean isOnSameRack( Node node1, Node node2) {
if (node1 == null || node2 == null ||
node1.getParent() == null || node2.getParent() == null) {
return false;
}
netlock.readLock().lock();
try {
return isSameParents(node1.getParent(), node2.getParent());
} finally {
netlock.readLock().unlock();
}
}
/**
* Check if two nodes are on the same node group (hypervisor) The
* assumption here is: each nodes are leaf nodes.
*
* @param node1
* one node (can be null)
* @param node2
* another node (can be null)
* @return true if node1 and node2 are on the same node group; false
* otherwise
* @exception IllegalArgumentException
* when either node1 or node2 is null, or node1 or node2 do
* not belong to the cluster
*/
@Override
public boolean isOnSameNodeGroup(Node node1, Node node2) {
if (node1 == null || node2 == null) {
return false;
}
netlock.readLock().lock();
try {
return isSameParents(node1, node2);
} finally {
netlock.readLock().unlock();
}
}
/**
* Check if network topology is aware of NodeGroup
*/
@Override
public boolean isNodeGroupAware() {
return true;
}
/** Add a leaf node
* Update node counter & rack counter if necessary
* @param node node to be added; can be null
* @exception IllegalArgumentException if add a node to a leave
* or node to be added is not a leaf
*/
@Override
public void add(Node node) {
if (node==null) return;
if( node instanceof InnerNode ) {
throw new IllegalArgumentException(
"Not allow to add an inner node: "+NodeBase.getPath(node));
}
netlock.writeLock().lock();
try {
Node rack = null;
// if node only with default rack info, here we need to add default
// nodegroup info
if (NetworkTopology.DEFAULT_RACK.equals(node.getNetworkLocation())) {
node.setNetworkLocation(node.getNetworkLocation() +
NetworkTopologyWithNodeGroup.DEFAULT_NODEGROUP);
}
Node nodeGroup = getNode(node.getNetworkLocation());
if (nodeGroup == null) {
nodeGroup = new InnerNodeWithNodeGroup(node.getNetworkLocation());
}
rack = getNode(nodeGroup.getNetworkLocation());
if (rack != null && !(rack instanceof InnerNode)) {
throw new IllegalArgumentException("Unexpected data node "
+ node.toString()
+ " at an illegal network location");
}
if (clusterMap.add(node)) {
LOG.info("Adding a new node: " + NodeBase.getPath(node));
if (rack == null) {
// We only track rack number here
numOfRacks++;
}
}
if(LOG.isDebugEnabled()) {
LOG.debug("NetworkTopology became:\n" + this.toString());
}
} finally {
netlock.writeLock().unlock();
}
}
/** Remove a node
* Update node counter and rack counter if necessary
* @param node node to be removed; can be null
*/
@Override
public void remove(Node node) {
if (node==null) return;
if( node instanceof InnerNode ) {
throw new IllegalArgumentException(
"Not allow to remove an inner node: "+NodeBase.getPath(node));
}
LOG.info("Removing a node: "+NodeBase.getPath(node));
netlock.writeLock().lock();
try {
if (clusterMap.remove(node)) {
Node nodeGroup = getNode(node.getNetworkLocation());
if (nodeGroup == null) {
nodeGroup = new InnerNode(node.getNetworkLocation());
}
InnerNode rack = (InnerNode)getNode(nodeGroup.getNetworkLocation());
if (rack == null) {
numOfRacks--;
}
}
if(LOG.isDebugEnabled()) {
LOG.debug("NetworkTopology became:\n" + this.toString());
}
} finally {
netlock.writeLock().unlock();
}
}
/** Sort nodes array by their distances to <i>reader</i>
* It linearly scans the array, if a local node is found, swap it with
* the first element of the array.
* If a local node group node is found, swap it with the first element
* following the local node.
* If a local rack node is found, swap it with the first element following
* the local node group node.
* If neither local node, node group node or local rack node is found, put a
* random replica location at position 0.
* It leaves the rest nodes untouched.
* @param reader the node that wishes to read a block from one of the nodes
* @param nodes the list of nodes containing data for the reader
*/
@Override
public void pseudoSortByDistance( Node reader, Node[] nodes ) {
if (reader != null && !this.contains(reader)) {
// if reader is not a datanode (not in NetworkTopology tree), we will
// replace this reader with a sibling leaf node in tree.
Node nodeGroup = getNode(reader.getNetworkLocation());
if (nodeGroup != null && nodeGroup instanceof InnerNode) {
InnerNode parentNode = (InnerNode) nodeGroup;
// replace reader with the first children of its parent in tree
reader = parentNode.getLeaf(0, null);
} else {
return;
}
}
int tempIndex = 0;
int localRackNode = -1;
int localNodeGroupNode = -1;
if (reader != null) {
//scan the array to find the local node & local rack node
for (int i = 0; i < nodes.length; i++) {
if (tempIndex == 0 && reader == nodes[i]) { //local node
//swap the local node and the node at position 0
if (i != 0) {
swap(nodes, tempIndex, i);
}
tempIndex=1;
if (localRackNode != -1 && (localNodeGroupNode !=-1)) {
if (localRackNode == 0) {
localRackNode = i;
}
if (localNodeGroupNode == 0) {
localNodeGroupNode = i;
}
break;
}
} else if (localNodeGroupNode == -1 && isOnSameNodeGroup(reader,
nodes[i])) {
//local node group
localNodeGroupNode = i;
// node local and rack local are already found
if(tempIndex != 0 && localRackNode != -1) break;
} else if (localRackNode == -1 && isOnSameRack(reader, nodes[i])) {
localRackNode = i;
if (tempIndex != 0 && localNodeGroupNode != -1) break;
}
}
// swap the local nodegroup node and the node at position tempIndex
if(localNodeGroupNode != -1 && localNodeGroupNode != tempIndex) {
swap(nodes, tempIndex, localNodeGroupNode);
if (localRackNode == tempIndex) {
localRackNode = localNodeGroupNode;
}
tempIndex++;
}
// swap the local rack node and the node at position tempIndex
if(localRackNode != -1 && localRackNode != tempIndex) {
swap(nodes, tempIndex, localRackNode);
tempIndex++;
}
}
// put a random node at position 0 if there is not a local/local-nodegroup/
// local-rack node
if (tempIndex == 0 && localNodeGroupNode == -1 && localRackNode == -1
&& nodes.length != 0) {
swap(nodes, 0, r.nextInt(nodes.length));
}
}
/** InnerNodeWithNodeGroup represents a switch/router of a data center, rack
* or physical host. Different from a leaf node, it has non-null children.
*/
static class InnerNodeWithNodeGroup extends InnerNode {
public InnerNodeWithNodeGroup(String name, String location,
InnerNode parent, int level) {
super(name, location, parent, level);
}
public InnerNodeWithNodeGroup(String name, String location) {
super(name, location);
}
public InnerNodeWithNodeGroup(String path) {
super(path);
}
@Override
boolean isRack() {
// it is node group
if (getChildren().isEmpty()) {
return false;
}
Node firstChild = children.get(0);
if (firstChild instanceof InnerNode) {
Node firstGrandChild = (((InnerNode) firstChild).children).get(0);
if (firstGrandChild instanceof InnerNode) {
// it is datacenter
return false;
} else {
return true;
}
}
return false;
}
/**
* Judge if this node represents a node group
*
* @return true if it has no child or its children are not InnerNodes
*/
boolean isNodeGroup() {
if (children.isEmpty()) {
return true;
}
Node firstChild = children.get(0);
if (firstChild instanceof InnerNode) {
// it is rack or datacenter
return false;
}
return true;
}
@Override
protected InnerNode createParentNode(String parentName) {
return new InnerNodeWithNodeGroup(parentName, getPath(this), this,
this.getLevel() + 1);
}
@Override
protected boolean areChildrenLeaves() {
return isNodeGroup();
}
}
}

View File

@ -599,10 +599,9 @@
</description>
</property>
<!-- Rack Configuration -->
<!-- Topology Configuration -->
<property>
<name>net.topology.node.switch.mapping.impl</name>
<name>net.topology.node.switch.mapping.impl</name>
<value>org.apache.hadoop.net.ScriptBasedMapping</value>
<description> The default implementation of the DNSToSwitchMapping. It
invokes a script specified in net.topology.script.file.name to resolve
@ -611,6 +610,13 @@
</description>
</property>
<property>
<name>net.topology.impl</name>
<value>org.apache.hadoop.net.NetworkTopology</value>
<description> The default implementation of NetworkTopology which is classic three layer one.
</description>
</property>
<property>
<name>net.topology.script.file.name</name>
<value></value>

View File

@ -0,0 +1,165 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.net;
import java.util.HashMap;
import java.util.Map;
import junit.framework.TestCase;
import org.apache.hadoop.hdfs.DFSTestUtil;
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor;
public class TestNetworkTopologyWithNodeGroup extends TestCase {
private final static NetworkTopologyWithNodeGroup cluster = new
NetworkTopologyWithNodeGroup();
private final static DatanodeDescriptor dataNodes[] = new DatanodeDescriptor[] {
DFSTestUtil.getDatanodeDescriptor("1.1.1.1", "/d1/r1/s1"),
DFSTestUtil.getDatanodeDescriptor("2.2.2.2", "/d1/r1/s1"),
DFSTestUtil.getDatanodeDescriptor("3.3.3.3", "/d1/r1/s2"),
DFSTestUtil.getDatanodeDescriptor("4.4.4.4", "/d1/r2/s3"),
DFSTestUtil.getDatanodeDescriptor("5.5.5.5", "/d1/r2/s3"),
DFSTestUtil.getDatanodeDescriptor("6.6.6.6", "/d1/r2/s4"),
DFSTestUtil.getDatanodeDescriptor("7.7.7.7", "/d2/r3/s5"),
DFSTestUtil.getDatanodeDescriptor("8.8.8.8", "/d2/r3/s6")
};
private final static NodeBase computeNode = new NodeBase("/d1/r1/s1/h9");
static {
for(int i=0; i<dataNodes.length; i++) {
cluster.add(dataNodes[i]);
}
}
public void testNumOfChildren() throws Exception {
assertEquals(cluster.getNumOfLeaves(), dataNodes.length);
}
public void testNumOfRacks() throws Exception {
assertEquals(cluster.getNumOfRacks(), 3);
}
public void testRacks() throws Exception {
assertEquals(cluster.getNumOfRacks(), 3);
assertTrue(cluster.isOnSameRack(dataNodes[0], dataNodes[1]));
assertTrue(cluster.isOnSameRack(dataNodes[1], dataNodes[2]));
assertFalse(cluster.isOnSameRack(dataNodes[2], dataNodes[3]));
assertTrue(cluster.isOnSameRack(dataNodes[3], dataNodes[4]));
assertTrue(cluster.isOnSameRack(dataNodes[4], dataNodes[5]));
assertFalse(cluster.isOnSameRack(dataNodes[5], dataNodes[6]));
assertTrue(cluster.isOnSameRack(dataNodes[6], dataNodes[7]));
}
public void testNodeGroups() throws Exception {
assertEquals(cluster.getNumOfRacks(), 3);
assertTrue(cluster.isOnSameNodeGroup(dataNodes[0], dataNodes[1]));
assertFalse(cluster.isOnSameNodeGroup(dataNodes[1], dataNodes[2]));
assertFalse(cluster.isOnSameNodeGroup(dataNodes[2], dataNodes[3]));
assertTrue(cluster.isOnSameNodeGroup(dataNodes[3], dataNodes[4]));
assertFalse(cluster.isOnSameNodeGroup(dataNodes[4], dataNodes[5]));
assertFalse(cluster.isOnSameNodeGroup(dataNodes[5], dataNodes[6]));
assertFalse(cluster.isOnSameNodeGroup(dataNodes[6], dataNodes[7]));
}
public void testGetDistance() throws Exception {
assertEquals(cluster.getDistance(dataNodes[0], dataNodes[0]), 0);
assertEquals(cluster.getDistance(dataNodes[0], dataNodes[1]), 2);
assertEquals(cluster.getDistance(dataNodes[0], dataNodes[2]), 4);
assertEquals(cluster.getDistance(dataNodes[0], dataNodes[3]), 6);
assertEquals(cluster.getDistance(dataNodes[0], dataNodes[6]), 8);
}
public void testPseudoSortByDistance() throws Exception {
DatanodeDescriptor[] testNodes = new DatanodeDescriptor[4];
// array contains both local node, local node group & local rack node
testNodes[0] = dataNodes[1];
testNodes[1] = dataNodes[2];
testNodes[2] = dataNodes[3];
testNodes[3] = dataNodes[0];
cluster.pseudoSortByDistance(dataNodes[0], testNodes );
assertTrue(testNodes[0] == dataNodes[0]);
assertTrue(testNodes[1] == dataNodes[1]);
assertTrue(testNodes[2] == dataNodes[2]);
assertTrue(testNodes[3] == dataNodes[3]);
// array contains local node & local node group
testNodes[0] = dataNodes[3];
testNodes[1] = dataNodes[4];
testNodes[2] = dataNodes[1];
testNodes[3] = dataNodes[0];
cluster.pseudoSortByDistance(dataNodes[0], testNodes );
assertTrue(testNodes[0] == dataNodes[0]);
assertTrue(testNodes[1] == dataNodes[1]);
// array contains local node & rack node
testNodes[0] = dataNodes[5];
testNodes[1] = dataNodes[3];
testNodes[2] = dataNodes[2];
testNodes[3] = dataNodes[0];
cluster.pseudoSortByDistance(dataNodes[0], testNodes );
assertTrue(testNodes[0] == dataNodes[0]);
assertTrue(testNodes[1] == dataNodes[2]);
// array contains local-nodegroup node (not a data node also) & rack node
testNodes[0] = dataNodes[6];
testNodes[1] = dataNodes[7];
testNodes[2] = dataNodes[2];
testNodes[3] = dataNodes[0];
cluster.pseudoSortByDistance(computeNode, testNodes );
assertTrue(testNodes[0] == dataNodes[0]);
assertTrue(testNodes[1] == dataNodes[2]);
}
/**
* This picks a large number of nodes at random in order to ensure coverage
*
* @param numNodes the number of nodes
* @param excludedScope the excluded scope
* @return the frequency that nodes were chosen
*/
private Map<Node, Integer> pickNodesAtRandom(int numNodes,
String excludedScope) {
Map<Node, Integer> frequency = new HashMap<Node, Integer>();
for (DatanodeDescriptor dnd : dataNodes) {
frequency.put(dnd, 0);
}
for (int j = 0; j < numNodes; j++) {
Node random = cluster.chooseRandom(excludedScope);
frequency.put(random, frequency.get(random) + 1);
}
return frequency;
}
/**
* This test checks that chooseRandom works for an excluded node.
*/
public void testChooseRandomExcludedNode() {
String scope = "~" + NodeBase.getPath(dataNodes[0]);
Map<Node, Integer> frequency = pickNodesAtRandom(100, scope);
for (Node key : dataNodes) {
// all nodes except the first should be more than zero
assertTrue(frequency.get(key) > 0 || key == dataNodes[0]);
}
}
}