MAPREDUCE-4892. Modify CombineFileInputFormat to not skew input splits' allocation on small clusters. Contributed by Bikas Saha.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1450912 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Vinod Kumar Vavilapalli 2013-02-27 18:49:37 +00:00
parent 0057600a82
commit 0b9ed2364a
3 changed files with 223 additions and 111 deletions

View File

@ -166,6 +166,9 @@ Release 2.0.4-beta - UNRELEASED
MAPREDUCE-5033. mapred shell script should respect usage flags MAPREDUCE-5033. mapred shell script should respect usage flags
(--help -help -h). (Andrew Wang via atm) (--help -help -h). (Andrew Wang via atm)
MAPREDUCE-4892. Modify CombineFileInputFormat to not skew input splits'
allocation on small clusters. (Bikas Saha via vinodkv)
OPTIMIZATIONS OPTIMIZATIONS
BUG FIXES BUG FIXES

View File

@ -49,6 +49,8 @@ import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.net.NodeBase; import org.apache.hadoop.net.NodeBase;
import org.apache.hadoop.net.NetworkTopology; import org.apache.hadoop.net.NetworkTopology;
import com.google.common.annotations.VisibleForTesting;
/** /**
* An abstract {@link InputFormat} that returns {@link CombineFileSplit}'s in * An abstract {@link InputFormat} that returns {@link CombineFileSplit}'s in
* {@link InputFormat#getSplits(JobContext)} method. * {@link InputFormat#getSplits(JobContext)} method.
@ -76,7 +78,7 @@ import org.apache.hadoop.net.NetworkTopology;
@InterfaceStability.Stable @InterfaceStability.Stable
public abstract class CombineFileInputFormat<K, V> public abstract class CombineFileInputFormat<K, V>
extends FileInputFormat<K, V> { extends FileInputFormat<K, V> {
public static final String SPLIT_MINSIZE_PERNODE = public static final String SPLIT_MINSIZE_PERNODE =
"mapreduce.input.fileinputformat.split.minsize.per.node"; "mapreduce.input.fileinputformat.split.minsize.per.node";
public static final String SPLIT_MINSIZE_PERRACK = public static final String SPLIT_MINSIZE_PERRACK =
@ -163,7 +165,6 @@ public abstract class CombineFileInputFormat<K, V>
@Override @Override
public List<InputSplit> getSplits(JobContext job) public List<InputSplit> getSplits(JobContext job)
throws IOException { throws IOException {
long minSizeNode = 0; long minSizeNode = 0;
long minSizeRack = 0; long minSizeRack = 0;
long maxSize = 0; long maxSize = 0;
@ -286,56 +287,100 @@ public abstract class CombineFileInputFormat<K, V>
rackToNodes, maxSize); rackToNodes, maxSize);
totLength += files[i].getLength(); totLength += files[i].getLength();
} }
createSplits(nodeToBlocks, blockToNodes, rackToBlocks, totLength,
maxSize, minSizeNode, minSizeRack, splits);
}
@VisibleForTesting
void createSplits(HashMap<String, List<OneBlockInfo>> nodeToBlocks,
HashMap<OneBlockInfo, String[]> blockToNodes,
HashMap<String, List<OneBlockInfo>> rackToBlocks,
long totLength,
long maxSize,
long minSizeNode,
long minSizeRack,
List<InputSplit> splits
) {
ArrayList<OneBlockInfo> validBlocks = new ArrayList<OneBlockInfo>(); ArrayList<OneBlockInfo> validBlocks = new ArrayList<OneBlockInfo>();
Set<String> nodes = new HashSet<String>(); Set<String> nodes = new HashSet<String>();
long curSplitSize = 0; long curSplitSize = 0;
int numNodes = nodeToBlocks.size();
long totalLength = totLength;
// process all nodes and create splits that are local while(true) {
// to a node. // it is allowed for maxSize to be 0. Disable smoothing load for such cases
for (Iterator<Map.Entry<String, int avgSplitsPerNode = maxSize > 0 && numNodes > 0 ?
List<OneBlockInfo>>> iter = nodeToBlocks.entrySet().iterator(); ((int) (totalLength/maxSize))/numNodes
iter.hasNext();) { : Integer.MAX_VALUE;
int maxSplitsByNodeOnly = (avgSplitsPerNode > 0) ? avgSplitsPerNode : 1;
numNodes = 0;
Map.Entry<String, List<OneBlockInfo>> one = iter.next(); // process all nodes and create splits that are local to a node.
nodes.add(one.getKey()); for (Iterator<Map.Entry<String, List<OneBlockInfo>>> iter = nodeToBlocks
List<OneBlockInfo> blocksInNode = one.getValue(); .entrySet().iterator(); iter.hasNext();) {
Map.Entry<String, List<OneBlockInfo>> one = iter.next();
nodes.add(one.getKey());
List<OneBlockInfo> blocksInNode = one.getValue();
// for each block, copy it into validBlocks. Delete it from // for each block, copy it into validBlocks. Delete it from
// blockToNodes so that the same block does not appear in // blockToNodes so that the same block does not appear in
// two different splits. // two different splits.
for (OneBlockInfo oneblock : blocksInNode) { int splitsInNode = 0;
if (blockToNodes.containsKey(oneblock)) { for (OneBlockInfo oneblock : blocksInNode) {
validBlocks.add(oneblock); if (blockToNodes.containsKey(oneblock)) {
blockToNodes.remove(oneblock); validBlocks.add(oneblock);
curSplitSize += oneblock.length; blockToNodes.remove(oneblock);
curSplitSize += oneblock.length;
// if the accumulated split size exceeds the maximum, then // if the accumulated split size exceeds the maximum, then
// create this split. // create this split.
if (maxSize != 0 && curSplitSize >= maxSize) { if (maxSize != 0 && curSplitSize >= maxSize) {
// create an input split and add it to the splits array // create an input split and add it to the splits array
addCreatedSplit(splits, nodes, validBlocks); addCreatedSplit(splits, nodes, validBlocks);
curSplitSize = 0; totalLength -= curSplitSize;
validBlocks.clear(); curSplitSize = 0;
validBlocks.clear();
splitsInNode++;
if (splitsInNode == maxSplitsByNodeOnly) {
// stop grouping on a node so as not to create
// disproportionately more splits on a node because it happens
// to have many blocks
// consider only these nodes in next round of grouping because
// they have leftover blocks that may need to be grouped
numNodes++;
break;
}
}
} }
} }
} // if there were any blocks left over and their combined size is
// if there were any blocks left over and their combined size is // larger than minSplitNode, then combine them into one split.
// larger than minSplitNode, then combine them into one split. // Otherwise add them back to the unprocessed pool. It is likely
// Otherwise add them back to the unprocessed pool. It is likely // that they will be combined with other blocks from the
// that they will be combined with other blocks from the // same rack later on.
// same rack later on. if (minSizeNode != 0 && curSplitSize >= minSizeNode
if (minSizeNode != 0 && curSplitSize >= minSizeNode) { && splitsInNode == 0) {
// create an input split and add it to the splits array // haven't created any split on this machine. so its ok to add a
addCreatedSplit(splits, nodes, validBlocks); // smaller
} else { // one for parallelism. Otherwise group it in the rack for balanced
for (OneBlockInfo oneblock : validBlocks) { // size
blockToNodes.put(oneblock, oneblock.hosts); // create an input split and add it to the splits array
addCreatedSplit(splits, nodes, validBlocks);
totalLength -= curSplitSize;
} else {
for (OneBlockInfo oneblock : validBlocks) {
blockToNodes.put(oneblock, oneblock.hosts);
}
} }
validBlocks.clear();
nodes.clear();
curSplitSize = 0;
}
if(!(numNodes>0 && totalLength>0)) {
break;
} }
validBlocks.clear();
nodes.clear();
curSplitSize = 0;
} }
// if blocks in a rack are below the specified minimum size, then keep them // if blocks in a rack are below the specified minimum size, then keep them
@ -458,7 +503,6 @@ public abstract class CombineFileInputFormat<K, V>
offset[i] = validBlocks.get(i).offset; offset[i] = validBlocks.get(i).offset;
length[i] = validBlocks.get(i).length; length[i] = validBlocks.get(i).length;
} }
// add this split to the list that is returned // add this split to the list that is returned
CombineFileSplit thissplit = new CombineFileSplit(fl, offset, CombineFileSplit thissplit = new CombineFileSplit(fl, offset,
length, locations.toArray(new String[0])); length, locations.toArray(new String[0]));
@ -474,7 +518,8 @@ public abstract class CombineFileInputFormat<K, V>
/** /**
* information about one file from the File System * information about one file from the File System
*/ */
private static class OneFileInfo { @VisibleForTesting
static class OneFileInfo {
private long fileSize; // size of the file private long fileSize; // size of the file
private OneBlockInfo[] blocks; // all blocks in this file private OneBlockInfo[] blocks; // all blocks in this file
@ -545,45 +590,55 @@ public abstract class CombineFileInputFormat<K, V>
} }
blocks = blocksList.toArray(new OneBlockInfo[blocksList.size()]); blocks = blocksList.toArray(new OneBlockInfo[blocksList.size()]);
} }
populateBlockInfo(blocks, rackToBlocks, blockToNodes,
nodeToBlocks, rackToNodes);
}
}
@VisibleForTesting
static void populateBlockInfo(OneBlockInfo[] blocks,
HashMap<String, List<OneBlockInfo>> rackToBlocks,
HashMap<OneBlockInfo, String[]> blockToNodes,
HashMap<String, List<OneBlockInfo>> nodeToBlocks,
HashMap<String, Set<String>> rackToNodes) {
for (OneBlockInfo oneblock : blocks) {
// add this block to the block --> node locations map
blockToNodes.put(oneblock, oneblock.hosts);
for (OneBlockInfo oneblock : blocks) { // For blocks that do not have host/rack information,
// add this block to the block --> node locations map // assign to default rack.
blockToNodes.put(oneblock, oneblock.hosts); String[] racks = null;
if (oneblock.hosts.length == 0) {
racks = new String[]{NetworkTopology.DEFAULT_RACK};
} else {
racks = oneblock.racks;
}
// For blocks that do not have host/rack information, // add this block to the rack --> block map
// assign to default rack. for (int j = 0; j < racks.length; j++) {
String[] racks = null; String rack = racks[j];
if (oneblock.hosts.length == 0) { List<OneBlockInfo> blklist = rackToBlocks.get(rack);
racks = new String[]{NetworkTopology.DEFAULT_RACK}; if (blklist == null) {
} else { blklist = new ArrayList<OneBlockInfo>();
racks = oneblock.racks; rackToBlocks.put(rack, blklist);
} }
blklist.add(oneblock);
// add this block to the rack --> block map if (!racks[j].equals(NetworkTopology.DEFAULT_RACK)) {
for (int j = 0; j < racks.length; j++) { // Add this host to rackToNodes map
String rack = racks[j]; addHostToRack(rackToNodes, racks[j], oneblock.hosts[j]);
List<OneBlockInfo> blklist = rackToBlocks.get(rack);
if (blklist == null) {
blklist = new ArrayList<OneBlockInfo>();
rackToBlocks.put(rack, blklist);
}
blklist.add(oneblock);
if (!racks[j].equals(NetworkTopology.DEFAULT_RACK)) {
// Add this host to rackToNodes map
addHostToRack(rackToNodes, racks[j], oneblock.hosts[j]);
}
} }
}
// add this block to the node --> block map // add this block to the node --> block map
for (int j = 0; j < oneblock.hosts.length; j++) { for (int j = 0; j < oneblock.hosts.length; j++) {
String node = oneblock.hosts[j]; String node = oneblock.hosts[j];
List<OneBlockInfo> blklist = nodeToBlocks.get(node); List<OneBlockInfo> blklist = nodeToBlocks.get(node);
if (blklist == null) { if (blklist == null) {
blklist = new ArrayList<OneBlockInfo>(); blklist = new ArrayList<OneBlockInfo>();
nodeToBlocks.put(node, blklist); nodeToBlocks.put(node, blklist);
}
blklist.add(oneblock);
} }
blklist.add(oneblock);
} }
} }
} }
@ -600,7 +655,8 @@ public abstract class CombineFileInputFormat<K, V>
/** /**
* information about one block from the File System * information about one block from the File System
*/ */
private static class OneBlockInfo { @VisibleForTesting
static class OneBlockInfo {
Path onepath; // name of this file Path onepath; // name of this file
long offset; // offset in file long offset; // offset in file
long length; // length of this block long length; // length of this block

View File

@ -20,11 +20,14 @@ package org.apache.hadoop.mapreduce.lib.input;
import java.io.IOException; import java.io.IOException;
import java.io.OutputStream; import java.io.OutputStream;
import java.net.URI; import java.net.URI;
import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Set;
import java.util.zip.GZIPOutputStream; import java.util.zip.GZIPOutputStream;
import java.util.concurrent.TimeoutException; import java.util.concurrent.TimeoutException;
import junit.framework.Assert;
import junit.framework.TestCase; import junit.framework.TestCase;
import org.apache.hadoop.fs.*; import org.apache.hadoop.fs.*;
@ -42,9 +45,13 @@ import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID; import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.TaskType; import org.apache.hadoop.mapreduce.TaskType;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat.OneBlockInfo;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat.OneFileInfo;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl; import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
import org.junit.Test; import org.junit.Test;
import com.google.common.collect.HashMultiset;
public class TestCombineFileInputFormat extends TestCase { public class TestCombineFileInputFormat extends TestCase {
private static final String rack1[] = new String[] { private static final String rack1[] = new String[] {
@ -476,23 +483,23 @@ public class TestCombineFileInputFormat extends TestCase {
assertEquals(BLOCKSIZE, fileSplit.getLength(1)); assertEquals(BLOCKSIZE, fileSplit.getLength(1));
assertEquals("host3.rack3.com", fileSplit.getLocations()[0]); assertEquals("host3.rack3.com", fileSplit.getLocations()[0]);
fileSplit = (CombineFileSplit) splits.get(1); fileSplit = (CombineFileSplit) splits.get(1);
assertEquals(file3.getName(), fileSplit.getPath(0).getName()); assertEquals(file2.getName(), fileSplit.getPath(0).getName());
assertEquals(2 * BLOCKSIZE, fileSplit.getOffset(0)); assertEquals(0, fileSplit.getOffset(0));
assertEquals(BLOCKSIZE, fileSplit.getLength(0)); assertEquals(BLOCKSIZE, fileSplit.getLength(0));
assertEquals(file4.getName(), fileSplit.getPath(1).getName()); assertEquals(file2.getName(), fileSplit.getPath(1).getName());
assertEquals(0, fileSplit.getOffset(1)); assertEquals(BLOCKSIZE, fileSplit.getOffset(1));
assertEquals(BLOCKSIZE, fileSplit.getLength(1)); assertEquals(BLOCKSIZE, fileSplit.getLength(1));
assertEquals("host3.rack3.com", fileSplit.getLocations()[0]); assertEquals("host2.rack2.com", fileSplit.getLocations()[0]);
fileSplit = (CombineFileSplit) splits.get(2); fileSplit = (CombineFileSplit) splits.get(2);
assertEquals(2, fileSplit.getNumPaths()); assertEquals(2, fileSplit.getNumPaths());
assertEquals(1, fileSplit.getLocations().length); assertEquals(1, fileSplit.getLocations().length);
assertEquals(file4.getName(), fileSplit.getPath(0).getName()); assertEquals(file1.getName(), fileSplit.getPath(0).getName());
assertEquals(BLOCKSIZE, fileSplit.getOffset(0)); assertEquals(0, fileSplit.getOffset(0));
assertEquals(BLOCKSIZE, fileSplit.getLength(0)); assertEquals(BLOCKSIZE, fileSplit.getLength(0));
assertEquals(file4.getName(), fileSplit.getPath(1).getName()); assertEquals(file3.getName(), fileSplit.getPath(1).getName());
assertEquals(2 * BLOCKSIZE, fileSplit.getOffset(1)); assertEquals(2 * BLOCKSIZE, fileSplit.getOffset(1));
assertEquals(BLOCKSIZE, fileSplit.getLength(1)); assertEquals(BLOCKSIZE, fileSplit.getLength(1));
assertEquals("host3.rack3.com", fileSplit.getLocations()[0]); assertEquals("host1.rack1.com", fileSplit.getLocations()[0]);
// maximum split size is 3 blocks // maximum split size is 3 blocks
inFormat = new DummyInputFormat(); inFormat = new DummyInputFormat();
@ -504,7 +511,7 @@ public class TestCombineFileInputFormat extends TestCase {
for (InputSplit split : splits) { for (InputSplit split : splits) {
System.out.println("File split(Test5): " + split); System.out.println("File split(Test5): " + split);
} }
assertEquals(4, splits.size()); assertEquals(3, splits.size());
fileSplit = (CombineFileSplit) splits.get(0); fileSplit = (CombineFileSplit) splits.get(0);
assertEquals(3, fileSplit.getNumPaths()); assertEquals(3, fileSplit.getNumPaths());
assertEquals(1, fileSplit.getLocations().length); assertEquals(1, fileSplit.getLocations().length);
@ -519,32 +526,28 @@ public class TestCombineFileInputFormat extends TestCase {
assertEquals(BLOCKSIZE, fileSplit.getLength(2)); assertEquals(BLOCKSIZE, fileSplit.getLength(2));
assertEquals("host3.rack3.com", fileSplit.getLocations()[0]); assertEquals("host3.rack3.com", fileSplit.getLocations()[0]);
fileSplit = (CombineFileSplit) splits.get(1); fileSplit = (CombineFileSplit) splits.get(1);
assertEquals(file4.getName(), fileSplit.getPath(0).getName());
assertEquals(0, fileSplit.getOffset(0));
assertEquals(BLOCKSIZE, fileSplit.getLength(0));
assertEquals(file4.getName(), fileSplit.getPath(1).getName());
assertEquals(BLOCKSIZE, fileSplit.getOffset(1));
assertEquals(BLOCKSIZE, fileSplit.getLength(1));
assertEquals(file4.getName(), fileSplit.getPath(2).getName());
assertEquals( 2 * BLOCKSIZE, fileSplit.getOffset(2));
assertEquals(BLOCKSIZE, fileSplit.getLength(2));
assertEquals("host3.rack3.com", fileSplit.getLocations()[0]);
fileSplit = (CombineFileSplit) splits.get(2);
assertEquals(2, fileSplit.getNumPaths());
assertEquals(1, fileSplit.getLocations().length);
assertEquals(file2.getName(), fileSplit.getPath(0).getName()); assertEquals(file2.getName(), fileSplit.getPath(0).getName());
assertEquals(0, fileSplit.getOffset(0)); assertEquals(0, fileSplit.getOffset(0));
assertEquals(BLOCKSIZE, fileSplit.getLength(0)); assertEquals(BLOCKSIZE, fileSplit.getLength(0));
assertEquals(file2.getName(), fileSplit.getPath(1).getName()); assertEquals(file2.getName(), fileSplit.getPath(1).getName());
assertEquals(BLOCKSIZE, fileSplit.getOffset(1)); assertEquals(BLOCKSIZE, fileSplit.getOffset(1));
assertEquals(BLOCKSIZE, fileSplit.getLength(1)); assertEquals(BLOCKSIZE, fileSplit.getLength(1));
assertEquals(file4.getName(), fileSplit.getPath(2).getName());
assertEquals(0, fileSplit.getOffset(2));
assertEquals(BLOCKSIZE, fileSplit.getLength(2));
assertEquals("host2.rack2.com", fileSplit.getLocations()[0]); assertEquals("host2.rack2.com", fileSplit.getLocations()[0]);
fileSplit = (CombineFileSplit) splits.get(3); fileSplit = (CombineFileSplit) splits.get(2);
assertEquals(1, fileSplit.getNumPaths()); assertEquals(3, fileSplit.getNumPaths());
assertEquals(1, fileSplit.getLocations().length); assertEquals(1, fileSplit.getLocations().length);
assertEquals(file1.getName(), fileSplit.getPath(0).getName()); assertEquals(file1.getName(), fileSplit.getPath(0).getName());
assertEquals(0, fileSplit.getOffset(0)); assertEquals(0, fileSplit.getOffset(0));
assertEquals(BLOCKSIZE, fileSplit.getLength(0)); assertEquals(BLOCKSIZE, fileSplit.getLength(0));
assertEquals(file4.getName(), fileSplit.getPath(1).getName());
assertEquals(BLOCKSIZE, fileSplit.getOffset(1));
assertEquals(BLOCKSIZE, fileSplit.getLength(1));
assertEquals(file4.getName(), fileSplit.getPath(2).getName());
assertEquals(2*BLOCKSIZE, fileSplit.getOffset(2));
assertEquals(BLOCKSIZE, fileSplit.getLength(2));
assertEquals("host1.rack1.com", fileSplit.getLocations()[0]); assertEquals("host1.rack1.com", fileSplit.getLocations()[0]);
// maximum split size is 4 blocks // maximum split size is 4 blocks
@ -713,6 +716,56 @@ public class TestCombineFileInputFormat extends TestCase {
DFSTestUtil.waitReplication(fileSys, name, replication); DFSTestUtil.waitReplication(fileSys, name, replication);
} }
/**
 * Regression test for MAPREDUCE-4892.
 *
 * Builds 12 blocks of 100 bytes that are all hosted on the same two nodes
 * ("h1" and "h2", no rack information) and groups them with a maximum split
 * size of 200 bytes. Expects totLength / maxSize splits, each of exactly the
 * maximum size with a single location, and three splits reported for each of
 * the two nodes rather than all of them landing on one node.
 */
public void testNodeInputSplit() throws IOException, InterruptedException {
  DummyInputFormat format = new DummyInputFormat();

  final int blockCount = 12;
  final long blockLen = 100;
  final long maxSplitLen = 200;
  final long minLenPerNode = 50;
  final long minLenPerRack = 50;
  final String[] hosts = { "h1", "h2" };
  final String[] noRacks = new String[0];
  final Path file = new Path("hdfs://file");

  // Lay the blocks out back to back in one file; every block lists both hosts.
  OneBlockInfo[] allBlocks = new OneBlockInfo[blockCount];
  long totalBytes = 0;
  for (int idx = 0; idx < allBlocks.length; idx++) {
    allBlocks[idx] =
        new OneBlockInfo(file, idx * blockLen, blockLen, hosts, noRacks);
    totalBytes += blockLen;
  }

  // Lookup tables filled by populateBlockInfo and consumed by createSplits.
  HashMap<String, Set<String>> nodesByRack =
      new HashMap<String, Set<String>>();
  HashMap<String, List<OneBlockInfo>> blocksByRack =
      new HashMap<String, List<OneBlockInfo>>();
  HashMap<OneBlockInfo, String[]> nodesByBlock =
      new HashMap<OneBlockInfo, String[]>();
  HashMap<String, List<OneBlockInfo>> blocksByNode =
      new HashMap<String, List<OneBlockInfo>>();
  OneFileInfo.populateBlockInfo(allBlocks, blocksByRack, nodesByBlock,
      blocksByNode, nodesByRack);

  List<InputSplit> produced = new ArrayList<InputSplit>();
  format.createSplits(blocksByNode, nodesByBlock, blocksByRack, totalBytes,
      maxSplitLen, minLenPerNode, minLenPerRack, produced);

  int wantedSplits = (int) (totalBytes / maxSplitLen);
  Assert.assertEquals(wantedSplits, produced.size());

  // Count how many splits name each node as their (only) location.
  HashMultiset<String> splitsPerNode = HashMultiset.create();
  for (InputSplit current : produced) {
    Assert.assertEquals(maxSplitLen, current.getLength());
    String[] where = current.getLocations();
    Assert.assertEquals(1, where.length);
    splitsPerNode.add(where[0]);
  }
  Assert.assertEquals(3, splitsPerNode.count(hosts[0]));
  Assert.assertEquals(3, splitsPerNode.count(hosts[1]));
}
public void testSplitPlacementForCompressedFiles() throws Exception { public void testSplitPlacementForCompressedFiles() throws Exception {
MiniDFSCluster dfs = null; MiniDFSCluster dfs = null;
FileSystem fileSys = null; FileSystem fileSys = null;
@ -889,24 +942,24 @@ public class TestCombineFileInputFormat extends TestCase {
assertEquals(f3.getLen(), fileSplit.getLength(0)); assertEquals(f3.getLen(), fileSplit.getLength(0));
assertEquals(hosts3[0], fileSplit.getLocations()[0]); // should be on r3 assertEquals(hosts3[0], fileSplit.getLocations()[0]); // should be on r3
fileSplit = (CombineFileSplit) splits.get(1); fileSplit = (CombineFileSplit) splits.get(1);
assertEquals(file4.getName(), fileSplit.getPath(0).getName());
assertEquals(0, fileSplit.getOffset(0));
assertEquals(f4.getLen(), fileSplit.getLength(0));
assertEquals(hosts3[0], fileSplit.getLocations()[0]); // should be on r3
fileSplit = (CombineFileSplit) splits.get(2);
assertEquals(1, fileSplit.getNumPaths());
assertEquals(1, fileSplit.getLocations().length);
assertEquals(file2.getName(), fileSplit.getPath(0).getName()); assertEquals(file2.getName(), fileSplit.getPath(0).getName());
assertEquals(0, fileSplit.getOffset(0)); assertEquals(0, fileSplit.getOffset(0));
assertEquals(f2.getLen(), fileSplit.getLength(0)); assertEquals(f2.getLen(), fileSplit.getLength(0));
assertEquals(hosts2[0], fileSplit.getLocations()[0]); // should be on r2 assertEquals(hosts2[0], fileSplit.getLocations()[0]); // should be on r2
fileSplit = (CombineFileSplit) splits.get(3); fileSplit = (CombineFileSplit) splits.get(2);
assertEquals(1, fileSplit.getNumPaths()); assertEquals(1, fileSplit.getNumPaths());
assertEquals(1, fileSplit.getLocations().length); assertEquals(1, fileSplit.getLocations().length);
assertEquals(file1.getName(), fileSplit.getPath(0).getName()); assertEquals(file1.getName(), fileSplit.getPath(0).getName());
assertEquals(0, fileSplit.getOffset(0)); assertEquals(0, fileSplit.getOffset(0));
assertEquals(f1.getLen(), fileSplit.getLength(0)); assertEquals(f1.getLen(), fileSplit.getLength(0));
assertEquals(hosts1[0], fileSplit.getLocations()[0]); // should be on r1 assertEquals(hosts1[0], fileSplit.getLocations()[0]); // should be on r1
fileSplit = (CombineFileSplit) splits.get(3);
assertEquals(1, fileSplit.getNumPaths());
assertEquals(1, fileSplit.getLocations().length);
assertEquals(file4.getName(), fileSplit.getPath(0).getName());
assertEquals(0, fileSplit.getOffset(0));
assertEquals(f4.getLen(), fileSplit.getLength(0));
assertEquals(hosts3[0], fileSplit.getLocations()[0]); // should be on r3
// maximum split size is twice file1's length // maximum split size is twice file1's length
inFormat = new DummyInputFormat(); inFormat = new DummyInputFormat();