HBASE-11562 CopyTable should provide an option to shuffle the mapper tasks (Jean-Marc Spaggiari)

This commit is contained in:
stack 2014-10-28 11:28:54 -07:00
parent bb81b9fde5
commit 64b6109ce9
2 changed files with 56 additions and 22 deletions

View File

@@ -65,6 +65,7 @@ public class CopyTable extends Configured implements Tool {
String peerAddress = null;
String families = null;
boolean allCells = false;
static boolean shuffle = false;
boolean bulkload = false;
Path bulkloadDir = null;
@@ -98,6 +99,9 @@ public class CopyTable extends Configured implements Tool {
if (allCells) {
scan.setRaw(true);
}
if (shuffle) {
job.getConfiguration().set(TableInputFormat.SHUFFLE_MAPS, "true");
}
if (versions >= 0) {
scan.setMaxVersions(versions);
}
@@ -286,6 +290,11 @@ public class CopyTable extends Configured implements Tool {
continue;
}
if (cmd.startsWith("--shuffle")) {
shuffle = true;
continue;
}
if (i == args.length-1) {
tableName = cmd;
} else {

View File

@@ -19,6 +19,8 @@
package org.apache.hadoop.hbase.mapreduce;
import java.io.IOException;
import java.util.Collections;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -33,6 +35,8 @@ import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.RegionLocator;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.StringUtils;
@@ -80,6 +84,8 @@ implements Configurable {
public static final String SCAN_CACHEDROWS = "hbase.mapreduce.scan.cachedrows";
/** Set the maximum number of values to return for each call to next(). */
public static final String SCAN_BATCHSIZE = "hbase.mapreduce.scan.batchsize";
/** Specify if we have to shuffle the map tasks. */
public static final String SHUFFLE_MAPS = "hbase.mapreduce.inputtable.shufflemaps";
/** The configuration. */
private Configuration conf = null;
@@ -210,6 +216,25 @@ implements Configurable {
}
}
/**
 * Calculates the splits that will serve as input for the map tasks. The
 * number of splits matches the number of regions in a table. Splits are shuffled if
 * required.
 * @param context The current job context.
 * @return The list of input splits.
 * @throws IOException When creating the list of splits fails.
 * @see org.apache.hadoop.mapreduce.InputFormat#getSplits(
 *   org.apache.hadoop.mapreduce.JobContext)
 */
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException {
  List<InputSplit> splits = super.getSplits(context);
  // Single, null-safe config lookup. equalsIgnoreCase is locale-independent,
  // unlike the previous toLowerCase() comparison which breaks under locales
  // such as Turkish (where "TRUE".toLowerCase() != "true").
  if ("true".equalsIgnoreCase(conf.get(SHUFFLE_MAPS))) {
    Collections.shuffle(splits);
  }
  return splits;
}
/**
* Convenience method to parse a string representation of an array of column specifiers.
*