HBASE-11562 CopyTable should provide an option to shuffle the mapper tasks (Jean-Marc Spaggiari)

This commit is contained in:
stack 2014-10-28 11:28:54 -07:00
parent bb81b9fde5
commit 64b6109ce9
2 changed files with 56 additions and 22 deletions

View File

@@ -65,6 +65,7 @@ public class CopyTable extends Configured implements Tool {
String peerAddress = null;
String families = null;
boolean allCells = false;
static boolean shuffle = false;
boolean bulkload = false;
Path bulkloadDir = null;
@@ -98,6 +99,9 @@ public class CopyTable extends Configured implements Tool {
if (allCells) {
scan.setRaw(true);
}
if (shuffle) {
job.getConfiguration().set(TableInputFormat.SHUFFLE_MAPS, "true");
}
if (versions >= 0) {
scan.setMaxVersions(versions);
}
@@ -286,6 +290,11 @@ public class CopyTable extends Configured implements Tool {
continue;
}
if (cmd.startsWith("--shuffle")) {
shuffle = true;
continue;
}
if (i == args.length-1) {
tableName = cmd;
} else {

View File

@@ -19,6 +19,8 @@
package org.apache.hadoop.hbase.mapreduce;
import java.io.IOException;
import java.util.Collections;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -33,6 +35,8 @@ import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.RegionLocator;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.StringUtils;
@@ -80,6 +84,8 @@ implements Configurable {
public static final String SCAN_CACHEDROWS = "hbase.mapreduce.scan.cachedrows";
/** Set the maximum number of values to return for each call to next(). */
public static final String SCAN_BATCHSIZE = "hbase.mapreduce.scan.batchsize";
/** Specify if we have to shuffle the map tasks. */
public static final String SHUFFLE_MAPS = "hbase.mapreduce.inputtable.shufflemaps";
/** The configuration. */
private Configuration conf = null;
@@ -210,6 +216,25 @@ implements Configurable {
}
}
/**
 * Calculates the splits that will serve as input for the map tasks. The
 * number of splits matches the number of regions in a table. Splits are shuffled if
 * required.
 * @param context The current job context.
 * @return The list of input splits.
 * @throws IOException When creating the list of splits fails.
 * @see org.apache.hadoop.mapreduce.InputFormat#getSplits(
 *   org.apache.hadoop.mapreduce.JobContext)
 */
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException {
  List<InputSplit> splits = super.getSplits(context);
  // Single, null-safe config lookup. equalsIgnoreCase is locale-independent,
  // unlike the previous toLowerCase() comparison which breaks under locales
  // such as Turkish (where "TRUE".toLowerCase() != "true").
  if ("true".equalsIgnoreCase(conf.get(SHUFFLE_MAPS))) {
    Collections.shuffle(splits);
  }
  return splits;
}
/**
* Convenience method to parse a string representation of an array of column specifiers.
*