diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/NetworkTopology.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/NetworkTopology.java
index 6ee6db769ad..50d511611ec 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/NetworkTopology.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/NetworkTopology.java
@@ -26,6 +26,7 @@ import java.util.TreeMap;
 import java.util.concurrent.locks.ReadWriteLock;
 import java.util.concurrent.locks.ReentrantReadWriteLock;
 
+import com.google.common.annotations.VisibleForTesting;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.classification.InterfaceAudience;
@@ -689,6 +690,12 @@ public class NetworkTopology {
     return rand;
   }
 
+  @VisibleForTesting
+  void setRandomSeed(long seed) {
+    Random rand = getRandom();
+    rand.setSeed(seed);
+  }
+
   /** randomly choose one node from scope
    * if scope starts with ~, choose one from the all nodes except for the
    * ones in scope; otherwise, choose one from scope
@@ -870,21 +877,19 @@ public class NetworkTopology {
   /**
    * Sort nodes array by network distance to reader.
   *
-   * In a three-level topology, a node can be either local, on the same rack, or
-   * on a different rack from the reader. Sorting the nodes based on network
-   * distance from the reader reduces network traffic and improves performance.
+   * In a three-level topology, a node can be either local, on the same rack,
+   * or on a different rack from the reader. Sorting the nodes based on network
+   * distance from the reader reduces network traffic and improves
+   * performance.
   *
   * As an additional twist, we also randomize the nodes at each network
-   * distance using the provided random seed. This helps with load balancing
-   * when there is data skew.
-   *
-   * @param reader Node where data will be read
-   * @param nodes Available replicas with the requested data
-   * @param seed Used to seed the pseudo-random generator that randomizes the
-   *          set of nodes at each network distance.
+   * distance. This helps with load balancing when there is data skew.
+   *
+   * @param reader Node where data will be read
+   * @param nodes Available replicas with the requested data
+   * @param activeLen Number of active nodes at the front of the array
    */
-  public void sortByDistance(Node reader, Node[] nodes, int activeLen,
-      long seed, boolean randomizeBlockLocationsPerBlock) {
+  public void sortByDistance(Node reader, Node[] nodes, int activeLen) {
     /** Sort weights for the nodes array */
     int[] weights = new int[activeLen];
     for (int i=0; i list: tree.values()) {
       if (list != null) {
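
The reworked NetworkTopology#sortByDistance above buckets the replicas by a distance weight (local, rack-local, off-rack) and then shuffles each bucket with the topology's shared Random, so repeated lookups of the same block no longer pin every off-rack reader to the same replica. The standalone sketch below illustrates that bucket-and-shuffle idea; the class, node-id format, and helper names are invented for illustration and are not part of Hadoop.

    import java.util.ArrayList;
    import java.util.Collections;
    import java.util.List;
    import java.util.Random;
    import java.util.TreeMap;

    // Illustrative sketch of the bucket-and-shuffle idea; not the Hadoop class itself.
    public class DistanceSortSketch {
      private static final Random RAND = new Random();    // shared, like NetworkTopology's Random

      // Hypothetical node id format "rack/host"; weight 0 = local, 1 = same rack, 2 = off rack.
      static int weight(String reader, String node) {
        if (reader.equals(node)) {
          return 0;
        }
        return rack(reader).equals(rack(node)) ? 1 : 2;
      }

      static String rack(String node) {
        return node.substring(0, node.indexOf('/'));       // "rack1/dn2" -> "rack1"
      }

      // Sort the first activeLen nodes by weight, shuffling ties so load spreads out.
      static void sortByDistance(String reader, String[] nodes, int activeLen) {
        TreeMap<Integer, List<String>> buckets = new TreeMap<>();
        for (int i = 0; i < activeLen; i++) {
          buckets.computeIfAbsent(weight(reader, nodes[i]), w -> new ArrayList<>())
              .add(nodes[i]);
        }
        int idx = 0;
        for (List<String> bucket : buckets.values()) {
          Collections.shuffle(bucket, RAND);               // no per-block seed any more
          for (String n : bucket) {
            nodes[idx++] = n;
          }
        }
      }

      public static void main(String[] args) {
        String[] replicas = { "rack2/dn7", "rack1/dn2", "rack1/dn1" };
        sortByDistance("rack1/dn1", replicas, replicas.length);
        // The local replica sorts first and the off-rack replica last, whatever the shuffle does.
        System.out.println(String.join(", ", replicas));
      }
    }

Tests that need a stable ordering can first pin the shared Random through the package-private setRandomSeed(long) hook added above, which is how the updated TestNetworkTopology assertions stay deterministic.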

-   * This is the same as
-   * {@link NetworkTopology#sortByDistance(Node, Node[], long)} except with a
-   * four-level network topology which contains the additional network distance
-   * of a "node group" which is between local and same rack.
-   *
-   * @param reader Node where data will be read
-   * @param nodes Available replicas with the requested data
-   * @param seed Used to seed the pseudo-random generator that randomizes the
-   *          set of nodes at each network distance.
+   * This is the same as {@link NetworkTopology#sortByDistance(Node, Node[],
+   * int)} except with a four-level network topology which contains the
+   * additional network distance of a "node group" which is between local and
+   * same rack.
+   *
+   * @param reader Node where data will be read
+   * @param nodes Available replicas with the requested data
+   * @param activeLen Number of active nodes at the front of the array
    */
   @Override
-  public void sortByDistance(Node reader, Node[] nodes, int activeLen,
-      long seed, boolean randomizeBlockLocationsPerBlock) {
+  public void sortByDistance(Node reader, Node[] nodes, int activeLen) {
     // If reader is not a datanode (not in NetworkTopology tree), we need to
     // replace this reader with a sibling leaf node in tree.
     if (reader != null && !this.contains(reader)) {
@@ -293,8 +291,7 @@ public class NetworkTopologyWithNodeGroup extends NetworkTopology {
         return;
       }
     }
-    super.sortByDistance(reader, nodes, activeLen, seed,
-        randomizeBlockLocationsPerBlock);
+    super.sortByDistance(reader, nodes, activeLen);
   }
 
   /** InnerNodeWithNodeGroup represents a switch/router of a data center, rack
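
With node groups, the only difference is one extra distance class between local and rack-local. A rough sketch of such a four-level weight function is shown below; the "/rack/nodegroup/host" location format and the helpers are assumptions for illustration, not the actual NetworkTopologyWithNodeGroup code.

    // Illustrative only: a four-level distance weight in the spirit of
    // NetworkTopologyWithNodeGroup. Locations are assumed to look like
    // "/rack/nodegroup/host"; these helpers are not part of Hadoop.
    public class NodeGroupWeightSketch {

      static String parent(String location) {
        return location.substring(0, location.lastIndexOf('/'));
      }

      /** 0 = local, 1 = same node group, 2 = same rack, 3 = off rack. */
      static int weight(String readerLoc, String nodeLoc) {
        if (readerLoc.equals(nodeLoc)) {
          return 0;
        }
        if (parent(readerLoc).equals(parent(nodeLoc))) {
          return 1;                                            // same node group
        }
        return parent(parent(readerLoc)).equals(parent(parent(nodeLoc))) ? 2 : 3;
      }

      public static void main(String[] args) {
        String reader = "/rack1/ng1/host1";
        System.out.println(weight(reader, "/rack1/ng1/host1")); // 0: local
        System.out.println(weight(reader, "/rack1/ng1/host2")); // 1: same node group
        System.out.println(weight(reader, "/rack1/ng2/host3")); // 2: same rack
        System.out.println(weight(reader, "/rack2/ng3/host4")); // 3: off rack
      }
    }

Sorting then proceeds exactly as in the parent class, which is why the override above can simply delegate to super.sortByDistance once a non-datanode reader has been swapped for a sibling leaf node.
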
diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/net/TestNetworkTopologyWithNodeGroup.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/net/TestNetworkTopologyWithNodeGroup.java
index 657fae3f526..15bd9fe4924 100644
--- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/net/TestNetworkTopologyWithNodeGroup.java
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/net/TestNetworkTopologyWithNodeGroup.java
@@ -104,8 +104,7 @@ public class TestNetworkTopologyWithNodeGroup {
     testNodes[1] = dataNodes[2];
     testNodes[2] = dataNodes[3];
     testNodes[3] = dataNodes[0];
-    cluster.sortByDistance(dataNodes[0], testNodes,
-        testNodes.length, 0xDEADBEEF, false);
+    cluster.sortByDistance(dataNodes[0], testNodes, testNodes.length);
     assertTrue(testNodes[0] == dataNodes[0]);
     assertTrue(testNodes[1] == dataNodes[1]);
     assertTrue(testNodes[2] == dataNodes[2]);
@@ -116,8 +115,7 @@ public class TestNetworkTopologyWithNodeGroup {
     testNodes[1] = dataNodes[4];
     testNodes[2] = dataNodes[1];
     testNodes[3] = dataNodes[0];
-    cluster.sortByDistance(dataNodes[0], testNodes,
-        testNodes.length, 0xDEADBEEF, false);
+    cluster.sortByDistance(dataNodes[0], testNodes, testNodes.length);
     assertTrue(testNodes[0] == dataNodes[0]);
     assertTrue(testNodes[1] == dataNodes[1]);
 
@@ -126,8 +124,7 @@ public class TestNetworkTopologyWithNodeGroup {
     testNodes[1] = dataNodes[3];
     testNodes[2] = dataNodes[2];
     testNodes[3] = dataNodes[0];
-    cluster.sortByDistance(dataNodes[0], testNodes,
-        testNodes.length, 0xDEADBEEF, false);
+    cluster.sortByDistance(dataNodes[0], testNodes, testNodes.length);
     assertTrue(testNodes[0] == dataNodes[0]);
     assertTrue(testNodes[1] == dataNodes[2]);
 
@@ -136,8 +133,7 @@ public class TestNetworkTopologyWithNodeGroup {
     testNodes[1] = dataNodes[7];
     testNodes[2] = dataNodes[2];
     testNodes[3] = dataNodes[0];
-    cluster.sortByDistance(computeNode, testNodes,
-        testNodes.length, 0xDEADBEEF, false);
+    cluster.sortByDistance(computeNode, testNodes, testNodes.length);
     assertTrue(testNodes[0] == dataNodes[0]);
     assertTrue(testNodes[1] == dataNodes[2]);
   }
diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
index c7339f080b9..436d2f0b318 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
+++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
@@ -771,6 +771,9 @@ Release 2.6.0 - UNRELEASED
 
     HDFS-7078. Fix listEZs to work correctly with snapshots. (wang)
 
+    HDFS-6840. Clients are always sent to the same datanode when read
+    is off rack. (wang)
+
   BREAKDOWN OF HDFS-6134 AND HADOOP-10150 SUBTASKS AND RELATED JIRAS
 
     HDFS-6387. HDFS CLI admin tool for creating & deleting an
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java
index 7f04fc2b677..609b4c6025f 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java
@@ -221,9 +221,6 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
   public static final String DFS_NAMENODE_MIN_SUPPORTED_DATANODE_VERSION_KEY = "dfs.namenode.min.supported.datanode.version";
   public static final String DFS_NAMENODE_MIN_SUPPORTED_DATANODE_VERSION_DEFAULT = "3.0.0-SNAPSHOT";
 
-  public static final String DFS_NAMENODE_RANDOMIZE_BLOCK_LOCATIONS_PER_BLOCK = "dfs.namenode.randomize-block-locations-per-block";
-  public static final boolean DFS_NAMENODE_RANDOMIZE_BLOCK_LOCATIONS_PER_BLOCK_DEFAULT = false;
-
   public static final String DFS_NAMENODE_EDITS_DIR_MINIMUM_KEY = "dfs.namenode.edits.dir.minimum";
   public static final int DFS_NAMENODE_EDITS_DIR_MINIMUM_DEFAULT = 1;
 
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java
index 709f060d237..5314f09a3fd 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java
@@ -348,8 +348,7 @@ public class DatanodeManager {
 
   /** Sort the located blocks by the distance to the target host. */
   public void sortLocatedBlocks(final String targethost,
-      final List<LocatedBlock> locatedblocks,
-      boolean randomizeBlockLocationsPerBlock) {
+      final List<LocatedBlock> locatedblocks) {
     //sort the blocks
     // As it is possible for the separation of node manager and datanode,
     // here we should get node but not datanode only .
@@ -376,8 +375,7 @@ public class DatanodeManager {
         --lastActiveIndex;
       }
       int activeLen = lastActiveIndex + 1;
-      networktopology.sortByDistance(client, b.getLocations(), activeLen, b
-          .getBlock().getBlockId(), randomizeBlockLocationsPerBlock);
+      networktopology.sortByDistance(client, b.getLocations(), activeLen);
     }
   }
 
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
index d0a1af65add..2b033628d04 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
@@ -65,8 +65,6 @@ import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ENABLE_RETRY_CAC
 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_MAX_OBJECTS_DEFAULT;
 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_MAX_OBJECTS_KEY;
 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_NAME_DIR_KEY;
-import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RANDOMIZE_BLOCK_LOCATIONS_PER_BLOCK;
-import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RANDOMIZE_BLOCK_LOCATIONS_PER_BLOCK_DEFAULT;
 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPLICATION_MIN_DEFAULT;
 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPLICATION_MIN_KEY;
 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPL_QUEUE_THRESHOLD_PCT_KEY;
@@ -547,8 +545,6 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
 
   private final FSImage fsImage;
 
-  private boolean randomizeBlockLocationsPerBlock;
-
   /**
    * Notify that loading of this FSDirectory is complete, and
    * it is imageLoaded for use
@@ -865,10 +861,6 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
           DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY,
           DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_DEFAULT);
 
-      this.randomizeBlockLocationsPerBlock = conf.getBoolean(
-          DFS_NAMENODE_RANDOMIZE_BLOCK_LOCATIONS_PER_BLOCK,
-          DFS_NAMENODE_RANDOMIZE_BLOCK_LOCATIONS_PER_BLOCK_DEFAULT);
-
       this.dtSecretManager = createDelegationTokenSecretManager(conf);
       this.dir = new FSDirectory(this, conf);
       this.snapshotManager = new SnapshotManager(dir);
@@ -1739,7 +1731,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
         true);
     if (blocks != null) {
       blockManager.getDatanodeManager().sortLocatedBlocks(clientMachine,
-          blocks.getLocatedBlocks(), randomizeBlockLocationsPerBlock);
+          blocks.getLocatedBlocks());
 
       // lastBlock is not part of getLocatedBlocks(), might need to sort it too
       LocatedBlock lastBlock = blocks.getLastLocatedBlock();
@@ -1748,7 +1740,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
             Lists.newArrayListWithCapacity(1);
         lastBlockList.add(lastBlock);
         blockManager.getDatanodeManager().sortLocatedBlocks(clientMachine,
-            lastBlockList, randomizeBlockLocationsPerBlock);
+            lastBlockList);
       }
     }
     return blocks;
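
For intuition about the bug being fixed: the old path seeded the Random with the block ID before shuffling, so every client asking for the same block saw the same "shuffled" order and all off-rack readers landed on one replica. The small self-contained demonstration below uses only plain java.util classes (nothing Hadoop-specific) to show the difference between reseeding per block and reusing one Random.

    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.Collections;
    import java.util.List;
    import java.util.Random;

    public class SeededShuffleDemo {
      public static void main(String[] args) {
        long blockId = 1073741825L;                 // same block requested by many clients
        List<String> replicas = Arrays.asList("dn1", "dn2", "dn3");

        Random rand = new Random();
        for (int client = 0; client < 3; client++) {
          List<String> order = new ArrayList<>(replicas);
          rand.setSeed(blockId);                    // old behavior: reseed per block
          Collections.shuffle(order, rand);
          // Every client prints the same first replica.
          System.out.println("client " + client + " reads from " + order.get(0));
        }

        for (int client = 0; client < 3; client++) {
          List<String> order = new ArrayList<>(replicas);
          Collections.shuffle(order, rand);         // new behavior: no reseeding
          // The first replica can now vary from request to request.
          System.out.println("client " + client + " reads from " + order.get(0));
        }
      }
    }
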
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml b/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml
index b28b2169755..d404c1c1e0a 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml
@@ -2069,19 +2069,6 @@
 </property>
 
-<property>
-  <name>dfs.namenode.randomize-block-locations-per-block</name>
-  <value>false</value>
-  <description>When fetching replica locations of a block, the replicas
-  are sorted based on network distance. This configuration parameter
-  determines whether the replicas at the same network distance are randomly
-  shuffled. By default, this is false, such that repeated requests for a block's
-  replicas always result in the same order. This potentially improves page cache
-  behavior. However, for some network topologies, it is desirable to shuffle this
-  order for better load balancing.
-  </description>
-</property>
-
 <property>
   <name>dfs.datanode.block.id.layout.upgrade.threads</name>
   <value>12</value>
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/net/TestNetworkTopology.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/net/TestNetworkTopology.java
index faf946004ac..1758807b84a 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/net/TestNetworkTopology.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/net/TestNetworkTopology.java
@@ -139,8 +139,8 @@ public class TestNetworkTopology {
     testNodes[0] = dataNodes[1];
     testNodes[1] = dataNodes[2];
     testNodes[2] = dataNodes[0];
-    cluster.sortByDistance(dataNodes[0], testNodes,
-        testNodes.length, 0xDEADBEEF, false);
+    cluster.setRandomSeed(0xDEADBEEF);
+    cluster.sortByDistance(dataNodes[0], testNodes, testNodes.length);
     assertTrue(testNodes[0] == dataNodes[0]);
     assertTrue(testNodes[1] == dataNodes[1]);
     assertTrue(testNodes[2] == dataNodes[2]);
@@ -152,8 +152,8 @@ public class TestNetworkTopology {
     dtestNodes[2] = dataNodes[11];
     dtestNodes[3] = dataNodes[9];
     dtestNodes[4] = dataNodes[10];
-    cluster.sortByDistance(dataNodes[8], dtestNodes,
-        dtestNodes.length - 2, 0xDEADBEEF, false);
+    cluster.setRandomSeed(0xDEADBEEF);
+    cluster.sortByDistance(dataNodes[8], dtestNodes, dtestNodes.length - 2);
     assertTrue(dtestNodes[0] == dataNodes[8]);
     assertTrue(dtestNodes[1] == dataNodes[11]);
     assertTrue(dtestNodes[2] == dataNodes[12]);
@@ -164,8 +164,8 @@ public class TestNetworkTopology {
     testNodes[0] = dataNodes[1];
     testNodes[1] = dataNodes[3];
     testNodes[2] = dataNodes[0];
-    cluster.sortByDistance(dataNodes[0], testNodes,
-        testNodes.length, 0xDEADBEEF, false);
+    cluster.setRandomSeed(0xDEADBEEF);
+    cluster.sortByDistance(dataNodes[0], testNodes, testNodes.length);
     assertTrue(testNodes[0] == dataNodes[0]);
     assertTrue(testNodes[1] == dataNodes[1]);
     assertTrue(testNodes[2] == dataNodes[3]);
@@ -174,8 +174,8 @@ public class TestNetworkTopology {
     testNodes[0] = dataNodes[5];
     testNodes[1] = dataNodes[3];
     testNodes[2] = dataNodes[1];
-    cluster.sortByDistance(dataNodes[0], testNodes,
-        testNodes.length, 0xDEADBEEF, false);
+    cluster.setRandomSeed(0xDEADBEEF);
+    cluster.sortByDistance(dataNodes[0], testNodes, testNodes.length);
     assertTrue(testNodes[0] == dataNodes[1]);
     assertTrue(testNodes[1] == dataNodes[3]);
     assertTrue(testNodes[2] == dataNodes[5]);
@@ -184,8 +184,8 @@ public class TestNetworkTopology {
     testNodes[0] = dataNodes[1];
     testNodes[1] = dataNodes[5];
     testNodes[2] = dataNodes[3];
-    cluster.sortByDistance(dataNodes[0], testNodes,
-        testNodes.length, 0xDEADBEEF, false);
+    cluster.setRandomSeed(0xDEADBEEF);
+    cluster.sortByDistance(dataNodes[0], testNodes, testNodes.length);
     assertTrue(testNodes[0] == dataNodes[1]);
     assertTrue(testNodes[1] == dataNodes[3]);
     assertTrue(testNodes[2] == dataNodes[5]);
@@ -194,24 +194,23 @@ public class TestNetworkTopology {
     testNodes[0] = dataNodes[1];
     testNodes[1] = dataNodes[5];
     testNodes[2] = dataNodes[3];
-    cluster.sortByDistance(dataNodes[0], testNodes,
-        testNodes.length, 0xDEAD, false);
+    cluster.setRandomSeed(0xDEAD);
+    cluster.sortByDistance(dataNodes[0], testNodes, testNodes.length);
     // sortByDistance does not take the "data center" layer into consideration
     // and it doesn't sort by getDistance, so 1, 5, 3 is also valid here
     assertTrue(testNodes[0] == dataNodes[1]);
     assertTrue(testNodes[1] == dataNodes[5]);
     assertTrue(testNodes[2] == dataNodes[3]);
 
-    // Array is just local rack nodes
-    // Expect a random first node depending on the seed (normally the block ID).
+    // Array of just rack-local nodes
+    // Expect a random first node
     DatanodeDescriptor first = null;
     boolean foundRandom = false;
     for (int i=5; i<=7; i++) {
       testNodes[0] = dataNodes[5];
       testNodes[1] = dataNodes[6];
       testNodes[2] = dataNodes[7];
-      cluster.sortByDistance(dataNodes[i], testNodes,
-          testNodes.length, 0xBEADED+i, false);
+      cluster.sortByDistance(dataNodes[i], testNodes, testNodes.length);
       if (first == null) {
         first = testNodes[0];
       } else {
@@ -222,16 +221,15 @@ public class TestNetworkTopology {
       }
     }
     assertTrue("Expected to find a different first location", foundRandom);
-    // Array of rack local nodes with randomizeBlockLocationsPerBlock set to
-    // true
-    // Expect random order of block locations for same block
+
+    // Array of just remote nodes
+    // Expect random first node
     first = null;
     for (int i = 1; i <= 4; i++) {
       testNodes[0] = dataNodes[13];
       testNodes[1] = dataNodes[14];
       testNodes[2] = dataNodes[15];
-      cluster.sortByDistance(dataNodes[15 + i], testNodes, testNodes.length,
-          0xBEADED, true);
+      cluster.sortByDistance(dataNodes[i], testNodes, testNodes.length);
       if (first == null) {
         first = testNodes[0];
       } else {
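
As a usage note, callers that previously passed a fixed seed into sortByDistance now seed the topology's Random up front. A minimal sketch in the spirit of the updated tests is shown below; the node setup is illustrative only, and it assumes the class lives in the org.apache.hadoop.net package so it can reach the package-private setRandomSeed hook.

    package org.apache.hadoop.net;

    // Illustrative sketch only; the real coverage lives in TestNetworkTopology.
    public class SortByDistanceExample {
      public static void main(String[] args) {
        NetworkTopology cluster = new NetworkTopology();
        Node local = new NodeBase("host1", "/rack1");
        Node rackLocal = new NodeBase("host2", "/rack1");
        Node offRack = new NodeBase("host3", "/rack2");
        cluster.add(local);
        cluster.add(rackLocal);
        cluster.add(offRack);

        Node[] replicas = { offRack, rackLocal, local };

        // Test hook added by this patch: pin the shared Random so the shuffle
        // within each distance class is repeatable across runs.
        cluster.setRandomSeed(0xDEADBEEF);
        cluster.sortByDistance(local, replicas, replicas.length);

        // Expected order: local replica, then rack-local, then off-rack.
        for (Node n : replicas) {
          System.out.println(n.getName() + " @ " + n.getNetworkLocation());
        }
      }
    }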