From bb87b70e37be5cae399645b559f93c44c439c243 Mon Sep 17 00:00:00 2001
From: Andrew Wang
Date: Tue, 3 Jun 2014 22:06:11 +0000
Subject: [PATCH] Revert bad HDFS-6268 commit from branch-2

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-2@1599807 13f79535-47bb-0310-9956-ffa450edef68
---
 .../apache/hadoop/net/NetworkTopology.java    | 145 +++++++-----
 .../net/NetworkTopologyWithNodeGroup.java     | 105 +++++++----
 .../net/TestNetworkTopologyWithNodeGroup.java |  10 +-
 hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt   |   3 -
 .../blockmanagement/DatanodeManager.java      |   3 +-
 .../hdfs/server/namenode/FSNamesystem.java    |   4 +-
 .../org/apache/hadoop/hdfs/TestGetBlocks.java |   3 -
 .../snapshot/TestSnapshotFileLength.java      |  18 +--
 .../hadoop/net/TestNetworkTopology.java       |  46 +-----
 9 files changed, 148 insertions(+), 189 deletions(-)

diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/NetworkTopology.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/NetworkTopology.java
index 9ba0b43d985..daf7dc735e4 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/NetworkTopology.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/NetworkTopology.java
@@ -20,10 +20,8 @@ package org.apache.hadoop.net;
 
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Collection;
-import java.util.Collections;
 import java.util.List;
 import java.util.Random;
-import java.util.TreeMap;
 import java.util.concurrent.locks.ReadWriteLock;
 import java.util.concurrent.locks.ReentrantReadWriteLock;
 
@@ -35,9 +33,6 @@ import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
 import org.apache.hadoop.util.ReflectionUtils;
 
-import com.google.common.base.Preconditions;
-import com.google.common.collect.Lists;
-
 /** The class represents a cluster of computer with a tree hierarchical
  * network topology.
  * For example, a cluster may be consists of many data centers filled
@@ -673,23 +668,7 @@ public class NetworkTopology {
     return node1.getParent()==node2.getParent();
   }
 
-  private static final ThreadLocal<Random> r = new ThreadLocal<Random>();
-
-  /**
-   * Getter for thread-local Random, which provides better performance than
-   * a shared Random (even though Random is thread-safe).
-   *
-   * @return Thread-local Random.
-   */
-  protected Random getRandom() {
-    Random rand = r.get();
-    if (rand == null) {
-      rand = new Random();
-      r.set(rand);
-    }
-    return rand;
-  }
-
+  final protected static Random r = new Random();
   /** randomly choose one node from scope
    * if scope starts with ~, choose one from the all nodes except for the
    * ones in scope; otherwise, choose one from scope
@@ -739,7 +718,7 @@ public class NetworkTopology {
           "Failed to find datanode (scope=\"" + String.valueOf(scope) +
           "\" excludedScope=\"" + String.valueOf(excludedScope) + "\").");
     }
-    int leaveIndex = getRandom().nextInt(numOfDatanodes);
+    int leaveIndex = r.nextInt(numOfDatanodes);
     return innerNode.getLeaf(leaveIndex, node);
   }
 
@@ -846,79 +825,61 @@ public class NetworkTopology {
     return networkLocation.substring(index);
   }
 
-  /**
-   * Returns an integer weight which specifies how far away {node} is away from
-   * {reader}. A lower value signifies that a node is closer.
-   *
-   * @param reader Node where data will be read
-   * @param node Replica of data
-   * @return weight
-   */
-  protected int getWeight(Node reader, Node node) {
-    // 0 is local, 1 is same rack, 2 is off rack
-    // Start off by initializing to off rack
-    int weight = 2;
-    if (reader != null) {
-      if (reader == node) {
-        weight = 0;
-      } else if (isOnSameRack(reader, node)) {
-        weight = 1;
-      }
-    }
-    return weight;
+  /** swap two array items */
+  static protected void swap(Node[] nodes, int i, int j) {
+    Node tempNode;
+    tempNode = nodes[j];
+    nodes[j] = nodes[i];
+    nodes[i] = tempNode;
   }
-
-  /**
-   * Sort nodes array by network distance to reader.
-   * <p/>
-   * In a three-level topology, a node can be either local, on the same rack, or
-   * on a different rack from the reader. Sorting the nodes based on network
-   * distance from the reader reduces network traffic and improves performance.
-   * <p/>
-   * As an additional twist, we also randomize the nodes at each network
-   * distance using the provided random seed. This helps with load balancing
-   * when there is data skew.
-   *
-   * @param reader Node where data will be read
-   * @param nodes Available replicas with the requested data
-   * @param seed Used to seed the pseudo-random generator that randomizes the
-   * set of nodes at each network distance.
+
+  /** Sort nodes array by their distances to reader
+   * It linearly scans the array, if a local node is found, swap it with
+   * the first element of the array.
+   * If a local rack node is found, swap it with the first element following
+   * the local node.
+   * If neither local node or local rack node is found, put a random replica
+   * location at position 0.
+   * It leaves the rest nodes untouched.
+   * @param reader the node that wishes to read a block from one of the nodes
+   * @param nodes the list of nodes containing data for the reader
    */
-  public void sortByDistance(Node reader, Node[] nodes, long seed) {
-    /** Sort weights for the nodes array */
-    int[] weights = new int[nodes.length];
-    for (int i=0; i<nodes.length; i++) {
-      weights[i] = getWeight(reader, nodes[i]);
-    }
-    // Add weight/node pairs to a TreeMap to sort
-    TreeMap<Integer, List<Node>> tree = new TreeMap<Integer, List<Node>>();
-    for (int i=0; i<nodes.length; i++) {
-      int weight = weights[i];
-      Node node = nodes[i];
-      List<Node> list = tree.get(weight);
-      if (list == null) {
-        list = Lists.newArrayListWithExpectedSize(1);
-        tree.put(weight, list);
-      }
-      list.add(node);
-    }
-
-    // Seed is normally the block id
-    // This means we use the same pseudo-random order for each block, for
-    // potentially better page cache usage.
-    Random rand = getRandom();
-    rand.setSeed(seed);
-    int idx = 0;
-    for (List<Node> list: tree.values()) {
-      if (list != null) {
-        Collections.shuffle(list, rand);
-        for (Node n: list) {
-          nodes[idx] = n;
-          idx++;
+  public void pseudoSortByDistance( Node reader, Node[] nodes ) {
+    int tempIndex = 0;
+    int localRackNode = -1;
+    if (reader != null ) {
+      //scan the array to find the local node & local rack node
+      for(int i=0; i<nodes.length; i++) {
-   * Sort nodes array by network distance to reader.
-   * <p/>
-   * This is the same as
-   * {@link NetworkTopology#sortByDistance(Node, Node[], long)} except with a
-   * four-level network topology which contains the additional network distance
-   * of a "node group" which is between local and same rack.
-   *
-   * @param reader Node where data will be read
-   * @param nodes Available replicas with the requested data
-   * @param seed Used to seed the pseudo-random generator that randomizes the
-   * set of nodes at each network distance.
+  /** Sort nodes array by their distances to reader
+   * It linearly scans the array, if a local node is found, swap it with
+   * the first element of the array.
+   * If a local node group node is found, swap it with the first element
+   * following the local node.
+   * If a local rack node is found, swap it with the first element following
+   * the local node group node.
+   * If neither local node, node group node or local rack node is found, put a
+   * random replica location at position 0.
+   * It leaves the rest nodes untouched.
+   * @param reader the node that wishes to read a block from one of the nodes
+   * @param nodes the list of nodes containing data for the reader
    */
   @Override
-  public void sortByDistance( Node reader, Node[] nodes, long seed) {
-    // If reader is not a datanode (not in NetworkTopology tree), we need to
-    // replace this reader with a sibling leaf node in tree.
+  public void pseudoSortByDistance( Node reader, Node[] nodes ) {
+    if (reader != null && !this.contains(reader)) {
+      // if reader is not a datanode (not in NetworkTopology tree), we will
+      // replace this reader with a sibling leaf node in tree.
       Node nodeGroup = getNode(reader.getNetworkLocation());
       if (nodeGroup != null && nodeGroup instanceof InnerNode) {
         InnerNode parentNode = (InnerNode) nodeGroup;
@@ -292,7 +276,62 @@ public class NetworkTopologyWithNodeGroup extends NetworkTopology {
         return;
       }
     }
-    super.sortByDistance(reader, nodes, seed);
+    int tempIndex = 0;
+    int localRackNode = -1;
+    int localNodeGroupNode = -1;
+    if (reader != null) {
+      //scan the array to find the local node & local rack node
+      for (int i = 0; i < nodes.length; i++) {
+        if (tempIndex == 0 && reader == nodes[i]) { //local node
+          //swap the local node and the node at position 0
+          if (i != 0) {
+            swap(nodes, tempIndex, i);
+          }
+          tempIndex=1;
+
+          if (localRackNode != -1 && (localNodeGroupNode !=-1)) {
+            if (localRackNode == 0) {
+              localRackNode = i;
+            }
+            if (localNodeGroupNode == 0) {
+              localNodeGroupNode = i;
+            }
+            break;
+          }
+        } else if (localNodeGroupNode == -1 && isOnSameNodeGroup(reader,
+            nodes[i])) {
+          //local node group
+          localNodeGroupNode = i;
+          // node local and rack local are already found
+          if(tempIndex != 0 && localRackNode != -1) break;
+        } else if (localRackNode == -1 && isOnSameRack(reader, nodes[i])) {
+          localRackNode = i;
+          if (tempIndex != 0 && localNodeGroupNode != -1) break;
+        }
+      }
+
+      // swap the local nodegroup node and the node at position tempIndex
+      if(localNodeGroupNode != -1 && localNodeGroupNode != tempIndex) {
+        swap(nodes, tempIndex, localNodeGroupNode);
+        if (localRackNode == tempIndex) {
+          localRackNode = localNodeGroupNode;
+        }
+        tempIndex++;
+      }
+
+      // swap the local rack node and the node at position tempIndex
+      if(localRackNode != -1 && localRackNode != tempIndex) {
+        swap(nodes, tempIndex, localRackNode);
+        tempIndex++;
+      }
+    }
+
+    // put a random node at position 0 if there is not a local/local-nodegroup/
+    // local-rack node
+    if (tempIndex == 0 && localNodeGroupNode == -1 && localRackNode == -1
+        && nodes.length != 0) {
+      swap(nodes, 0, r.nextInt(nodes.length));
+    }
   }
 
   /** InnerNodeWithNodeGroup represents a switch/router of a data center, rack
diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/net/TestNetworkTopologyWithNodeGroup.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/net/TestNetworkTopologyWithNodeGroup.java
index c082cce14a4..5fa2e14748b 100644
--- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/net/TestNetworkTopologyWithNodeGroup.java
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/net/TestNetworkTopologyWithNodeGroup.java
@@ -96,7 +96,7 @@ public class TestNetworkTopologyWithNodeGroup {
   }
 
   @Test
-  public void testSortByDistance() throws Exception {
+  public void testPseudoSortByDistance() throws Exception {
     NodeBase[] testNodes = new NodeBase[4];
 
     // array contains both local node, local node group & local rack node
@@ -104,7 +104,7 @@ public class TestNetworkTopologyWithNodeGroup {
     testNodes[1] = dataNodes[2];
     testNodes[2] = dataNodes[3];
     testNodes[3] = dataNodes[0];
-    cluster.sortByDistance(dataNodes[0], testNodes, 0xDEADBEEF);
+    cluster.pseudoSortByDistance(dataNodes[0], testNodes );
     assertTrue(testNodes[0] == dataNodes[0]);
     assertTrue(testNodes[1] == dataNodes[1]);
     assertTrue(testNodes[2] == dataNodes[2]);
@@ -115,7 +115,7 @@ public class TestNetworkTopologyWithNodeGroup {
     testNodes[1] = dataNodes[4];
     testNodes[2] = dataNodes[1];
     testNodes[3] = dataNodes[0];
-    cluster.sortByDistance(dataNodes[0], testNodes, 0xDEADBEEF);
+    cluster.pseudoSortByDistance(dataNodes[0], testNodes );
     assertTrue(testNodes[0] == dataNodes[0]);
     assertTrue(testNodes[1] == dataNodes[1]);
 
@@ -124,7 +124,7 @@ public class TestNetworkTopologyWithNodeGroup {
     testNodes[1] = dataNodes[3];
     testNodes[2] = dataNodes[2];
     testNodes[3] = dataNodes[0];
-    cluster.sortByDistance(dataNodes[0], testNodes, 0xDEADBEEF);
+    cluster.pseudoSortByDistance(dataNodes[0], testNodes );
     assertTrue(testNodes[0] == dataNodes[0]);
     assertTrue(testNodes[1] == dataNodes[2]);
 
@@ -133,7 +133,7 @@ public class TestNetworkTopologyWithNodeGroup {
     testNodes[1] = dataNodes[7];
     testNodes[2] = dataNodes[2];
     testNodes[3] = dataNodes[0];
-    cluster.sortByDistance(computeNode, testNodes, 0xDEADBEEF);
+    cluster.pseudoSortByDistance(computeNode, testNodes );
     assertTrue(testNodes[0] == dataNodes[0]);
     assertTrue(testNodes[1] == dataNodes[2]);
   }
diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
index 6dc2e39739a..71b48c81fd7 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
+++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
@@ -147,9 +147,6 @@ Release 2.5.0 - UNRELEASED
     HDFS-6109 let sync_file_range() system call run in background (Liang Xie
     via stack)
 
-    HDFS-6268. Better sorting in NetworkTopology#pseudoSortByDistance when
-    no local node is found. (wang)
-
   OPTIMIZATIONS
 
    HDFS-6214. Webhdfs has poor throughput for files >2GB (daryn)
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java
index b9f20906994..bcc8b50dadf 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java
@@ -351,8 +351,7 @@ public class DatanodeManager {
         DFSUtil.DECOM_COMPARATOR;
 
     for (LocatedBlock b : locatedblocks) {
-      networktopology.sortByDistance(client, b.getLocations(), b
-          .getBlock().getBlockId());
+      networktopology.pseudoSortByDistance(client, b.getLocations());
       // Move decommissioned/stale datanodes to the bottom
       Arrays.sort(b.getLocations(), comparator);
     }
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
index 7676f53a742..fbde862789a 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
@@ -1618,11 +1618,9 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
     blockManager.getDatanodeManager().sortLocatedBlocks(
         clientMachine, blocks.getLocatedBlocks());
 
-    // lastBlock is not part of getLocatedBlocks(), might need to sort it too
     LocatedBlock lastBlock = blocks.getLastLocatedBlock();
     if (lastBlock != null) {
-      ArrayList<LocatedBlock> lastBlockList =
-          Lists.newArrayListWithCapacity(1);
+      ArrayList<LocatedBlock> lastBlockList = new ArrayList<LocatedBlock>();
       lastBlockList.add(lastBlock);
       blockManager.getDatanodeManager().sortLocatedBlocks(
           clientMachine, lastBlockList);
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestGetBlocks.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestGetBlocks.java
index d64c50c5ffe..3b21e73c956 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestGetBlocks.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestGetBlocks.java
@@ -169,9 +169,6 @@ public class TestGetBlocks {
       if (stm != null) {
         stm.close();
       }
-      if (client != null) {
-        client.close();
-      }
       cluster.shutdown();
     }
   }
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/snapshot/TestSnapshotFileLength.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/snapshot/TestSnapshotFileLength.java
index 32534f05264..7c1d6352eaa 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/snapshot/TestSnapshotFileLength.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/snapshot/TestSnapshotFileLength.java
@@ -143,10 +143,10 @@ public class TestSnapshotFileLength {
 
     // Make sure we can read the entire file via its non-snapshot path.
     fileStatus = hdfs.getFileStatus(file1);
-    assertEquals("Unexpected file length", BLOCKSIZE * 2, fileStatus.getLen());
+    assertEquals(fileStatus.getLen(), BLOCKSIZE * 2);
     fis = hdfs.open(file1);
     bytesRead = fis.read(buffer, 0, buffer.length);
-    assertEquals("Unexpected # bytes read", BLOCKSIZE * 2, bytesRead);
+    assertEquals(bytesRead, BLOCKSIZE * 2);
     fis.close();
 
     Path file1snap1 =
@@ -156,23 +156,21 @@ public class TestSnapshotFileLength {
     assertEquals(fileStatus.getLen(), BLOCKSIZE);
     // Make sure we can only read up to the snapshot length.
     bytesRead = fis.read(buffer, 0, buffer.length);
-    assertEquals("Unexpected # bytes read", BLOCKSIZE, bytesRead);
+    assertEquals(bytesRead, BLOCKSIZE);
     fis.close();
 
-    PrintStream outBackup = System.out;
-    PrintStream errBackup = System.err;
+    PrintStream psBackup = System.out;
     ByteArrayOutputStream bao = new ByteArrayOutputStream();
     System.setOut(new PrintStream(bao));
     System.setErr(new PrintStream(bao));
     // Make sure we can cat the file upto to snapshot length
     FsShell shell = new FsShell();
-    try {
+    try{
       ToolRunner.run(conf, shell, new String[] { "-cat",
       "/TestSnapshotFileLength/sub1/.snapshot/snapshot1/file1" });
-      assertEquals("Unexpected # bytes from -cat", BLOCKSIZE, bao.size());
-    } finally {
-      System.setOut(outBackup);
-      System.setErr(errBackup);
+      assertEquals(bao.size(), BLOCKSIZE);
+    }finally{
+      System.setOut(psBackup);
     }
   }
 }
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/net/TestNetworkTopology.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/net/TestNetworkTopology.java
index 4f98ae4a221..154014180a4 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/net/TestNetworkTopology.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/net/TestNetworkTopology.java
@@ -54,8 +54,7 @@ public class TestNetworkTopology {
         DFSTestUtil.getDatanodeDescriptor("4.4.4.4", "/d1/r2"),
         DFSTestUtil.getDatanodeDescriptor("5.5.5.5", "/d1/r2"),
         DFSTestUtil.getDatanodeDescriptor("6.6.6.6", "/d2/r3"),
-        DFSTestUtil.getDatanodeDescriptor("7.7.7.7", "/d2/r3"),
-        DFSTestUtil.getDatanodeDescriptor("8.8.8.8", "/d2/r3")
+        DFSTestUtil.getDatanodeDescriptor("7.7.7.7", "/d2/r3")
     };
     for (int i = 0; i < dataNodes.length; i++) {
       cluster.add(dataNodes[i]);
@@ -118,14 +117,14 @@ public class TestNetworkTopology {
   }
 
   @Test
-  public void testSortByDistance() throws Exception {
+  public void testPseudoSortByDistance() throws Exception {
     DatanodeDescriptor[] testNodes = new DatanodeDescriptor[3];
 
     // array contains both local node & local rack node
    testNodes[0] = dataNodes[1];
    testNodes[1] = dataNodes[2];
    testNodes[2] = dataNodes[0];
-    cluster.sortByDistance(dataNodes[0], testNodes, 0xDEADBEEF);
+    cluster.pseudoSortByDistance(dataNodes[0], testNodes );
    assertTrue(testNodes[0] == dataNodes[0]);
    assertTrue(testNodes[1] == dataNodes[1]);
    assertTrue(testNodes[2] == dataNodes[2]);
@@ -134,7 +133,7 @@ public class TestNetworkTopology {
    testNodes[0] = dataNodes[1];
    testNodes[1] = dataNodes[3];
    testNodes[2] = dataNodes[0];
-    cluster.sortByDistance(dataNodes[0], testNodes, 0xDEADBEEF);
+    cluster.pseudoSortByDistance(dataNodes[0], testNodes );
    assertTrue(testNodes[0] == dataNodes[0]);
    assertTrue(testNodes[1] == dataNodes[1]);
    assertTrue(testNodes[2] == dataNodes[3]);
@@ -143,50 +142,21 @@ public class TestNetworkTopology {
    // array contains local rack node & rack node
    testNodes[0] = dataNodes[5];
    testNodes[1] = dataNodes[3];
    testNodes[2] = dataNodes[1];
-    cluster.sortByDistance(dataNodes[0], testNodes, 0xDEADBEEF);
+    cluster.pseudoSortByDistance(dataNodes[0], testNodes );
    assertTrue(testNodes[0] == dataNodes[1]);
    assertTrue(testNodes[1] == dataNodes[3]);
    assertTrue(testNodes[2] == dataNodes[5]);
-
+
    // array contains local rack node which happens to be in position 0
    testNodes[0] = dataNodes[1];
    testNodes[1] = dataNodes[5];
    testNodes[2] = dataNodes[3];
-    cluster.sortByDistance(dataNodes[0], testNodes, 0xDEADBEEF);
-    assertTrue(testNodes[0] == dataNodes[1]);
-    assertTrue(testNodes[1] == dataNodes[3]);
-    assertTrue(testNodes[2] == dataNodes[5]);
-
-    // Same as previous, but with a different random seed to test randomization
-    testNodes[0] = dataNodes[1];
-    testNodes[1] = dataNodes[5];
-    testNodes[2] = dataNodes[3];
-    cluster.sortByDistance(dataNodes[0], testNodes, 0xDEAD);
-    // sortByDistance does not take the "data center" layer into consideration
+    cluster.pseudoSortByDistance(dataNodes[0], testNodes );
+    // peudoSortByDistance does not take the "data center" layer into consideration
    // and it doesn't sort by getDistance, so 1, 5, 3 is also valid here
    assertTrue(testNodes[0] == dataNodes[1]);
    assertTrue(testNodes[1] == dataNodes[5]);
    assertTrue(testNodes[2] == dataNodes[3]);
-
-    // Array is just local rack nodes
-    // Expect a random first node depending on the seed (normally the block ID).
-    DatanodeDescriptor first = null;
-    boolean foundRandom = false;
-    for (int i=5; i<=7; i++) {
-      testNodes[0] = dataNodes[5];
-      testNodes[1] = dataNodes[6];
-      testNodes[2] = dataNodes[7];
-      cluster.sortByDistance(dataNodes[i], testNodes, 0xBEADED+i);
-      if (first == null) {
-        first = testNodes[0];
-      } else {
-        if (first != testNodes[0]) {
-          foundRandom = true;
-          break;
-        }
-      }
-    }
-    assertTrue("Expected to find a different first location", foundRandom);
   }
 
   @Test