MAPREDUCE-5807. Print usage for TeraSort job. Contributed by Rohith.

2015-03-18 15:34:44 +05:30 · 2015-03-18 15:34:44 +05:30 · 9d72f93975
parent 34117325b2
commit 9d72f93975
8 changed files with 123 additions and 20 deletions
--- a/hadoop-mapreduce-project/CHANGES.txt
+++ b/hadoop-mapreduce-project/CHANGES.txt
@ -253,6 +253,8 @@ Release 2.8.0 - UNRELEASED

  IMPROVEMENTS

+    MAPREDUCE-5807. Print usage by TeraSort job. (Rohith via harsh)
+
    MAPREDUCE-4653. TestRandomAlgorithm has an unused "import" statement.
    (Amir Sanjar via harsh)

--- a/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/terasort/TeraGen.java
+++ b/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/terasort/TeraGen.java
@ -70,7 +70,6 @@ public class TeraGen extends Configured implements Tool {

  public static enum Counters {CHECKSUM}

-  public static final String NUM_ROWS = "mapreduce.terasort.num-rows";
  /**
   * An input format that assigns ranges of longs to each mapper.
   */
@ -189,11 +188,12 @@ public class TeraGen extends Configured implements Tool {
  }
  
  static long getNumberOfRows(JobContext job) {
-    return job.getConfiguration().getLong(NUM_ROWS, 0);
+    return job.getConfiguration().getLong(TeraSortConfigKeys.NUM_ROWS.key(),
+        TeraSortConfigKeys.DEFAULT_NUM_ROWS);
  }
  
  static void setNumberOfRows(Job job, long numRows) {
-    job.getConfiguration().setLong(NUM_ROWS, numRows);
+    job.getConfiguration().setLong(TeraSortConfigKeys.NUM_ROWS.key(), numRows);
  }

  /**
--- a/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/terasort/TeraInputFormat.java
+++ b/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/terasort/TeraInputFormat.java
@ -50,10 +50,6 @@ import org.apache.hadoop.util.StringUtils;
 public class TeraInputFormat extends FileInputFormat<Text,Text> {

  static final String PARTITION_FILENAME = "_partition.lst";
-  private static final String NUM_PARTITIONS = 
-    "mapreduce.terasort.num.partitions";
-  private static final String SAMPLE_SIZE = 
-    "mapreduce.terasort.partitions.sample";
  static final int KEY_LENGTH = 10;
  static final int VALUE_LENGTH = 90;
  static final int RECORD_LENGTH = KEY_LENGTH + VALUE_LENGTH;
@ -123,11 +119,16 @@ public class TeraInputFormat extends FileInputFormat<Text,Text> {
    final TeraInputFormat inFormat = new TeraInputFormat();
    final TextSampler sampler = new TextSampler();
    int partitions = job.getNumReduceTasks();
-    long sampleSize = conf.getLong(SAMPLE_SIZE, 100000);
+    long sampleSize =
+        conf.getLong(TeraSortConfigKeys.SAMPLE_SIZE.key(),
+            TeraSortConfigKeys.DEFAULT_SAMPLE_SIZE);
    final List<InputSplit> splits = inFormat.getSplits(job);
    long t2 = System.currentTimeMillis();
    System.out.println("Computing input splits took " + (t2 - t1) + "ms");
-    int samples = Math.min(conf.getInt(NUM_PARTITIONS, 10), splits.size());
+    int samples =
+        Math.min(conf.getInt(TeraSortConfigKeys.NUM_PARTITIONS.key(),
+                             TeraSortConfigKeys.DEFAULT_NUM_PARTITIONS),
+            splits.size());
    System.out.println("Sampling " + samples + " splits of " + splits.size());
    final long recordsPerSample = sampleSize / samples;
    final int sampleStep = splits.size() / samples;
@ -294,7 +295,8 @@ public class TeraInputFormat extends FileInputFormat<Text,Text> {
    lastResult = super.getSplits(job);
    t2 = System.currentTimeMillis();
    System.out.println("Spent " + (t2 - t1) + "ms computing base-splits.");
-    if (job.getConfiguration().getBoolean(TeraScheduler.USE, true)) {
+    if (job.getConfiguration().getBoolean(TeraSortConfigKeys.USE_TERA_SCHEDULER.key(),
+                                          TeraSortConfigKeys.DEFAULT_USE_TERA_SCHEDULER)) {
      TeraScheduler scheduler = new TeraScheduler(
        lastResult.toArray(new FileSplit[0]), job.getConfiguration());
      lastResult = scheduler.getNewFileSplits();
--- a/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/terasort/TeraOutputFormat.java
+++ b/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/terasort/TeraOutputFormat.java
@ -40,21 +40,23 @@ import org.apache.hadoop.mapreduce.security.TokenCache;
 * An output format that writes the key and value appended together.
 */
 public class TeraOutputFormat extends FileOutputFormat<Text,Text> {
-  static final String FINAL_SYNC_ATTRIBUTE = "mapreduce.terasort.final.sync";
  private OutputCommitter committer = null;

  /**
   * Set the requirement for a final sync before the stream is closed.
   */
  static void setFinalSync(JobContext job, boolean newValue) {
-    job.getConfiguration().setBoolean(FINAL_SYNC_ATTRIBUTE, newValue);
+    job.getConfiguration().setBoolean(
+        TeraSortConfigKeys.FINAL_SYNC_ATTRIBUTE.key(), newValue);
  }

  /**
   * Does the user want a final sync at close?
   */
  public static boolean getFinalSync(JobContext job) {
-    return job.getConfiguration().getBoolean(FINAL_SYNC_ATTRIBUTE, false);
+    return job.getConfiguration().getBoolean(
+        TeraSortConfigKeys.FINAL_SYNC_ATTRIBUTE.key(),
+        TeraSortConfigKeys.DEFAULT_FINAL_SYNC_ATTRIBUTE);
  }

  static class TeraRecordWriter extends RecordWriter<Text,Text> {
--- a/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/terasort/TeraScheduler.java
+++ b/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/terasort/TeraScheduler.java
@ -31,7 +31,6 @@ import org.apache.hadoop.mapreduce.server.tasktracker.TTConfig;
 import com.google.common.base.Charsets;

 class TeraScheduler {
-  static String USE = "mapreduce.terasort.use.terascheduler";
  private static final Log LOG = LogFactory.getLog(TeraScheduler.class);
  private Split[] splits;
  private List<Host> hosts = new ArrayList<Host>();
--- a/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/terasort/TeraSort.java
+++ b/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/terasort/TeraSort.java
@ -48,8 +48,6 @@ import org.apache.hadoop.util.ToolRunner;
 */
 public class TeraSort extends Configured implements Tool {
  private static final Log LOG = LogFactory.getLog(TeraSort.class);
-  static String SIMPLE_PARTITIONER = "mapreduce.terasort.simplepartitioner";
-  static String OUTPUT_REPLICATION = "mapreduce.terasort.output.replication";

  /**
   * A partitioner that splits text keys into roughly equal partitions
@ -262,22 +260,40 @@ public class TeraSort extends Configured implements Tool {
  }

  public static boolean getUseSimplePartitioner(JobContext job) {
-    return job.getConfiguration().getBoolean(SIMPLE_PARTITIONER, false);
+    return job.getConfiguration().getBoolean(
+        TeraSortConfigKeys.USE_SIMPLE_PARTITIONER.key(),
+        TeraSortConfigKeys.DEFAULT_USE_SIMPLE_PARTITIONER);
  }

  public static void setUseSimplePartitioner(Job job, boolean value) {
-    job.getConfiguration().setBoolean(SIMPLE_PARTITIONER, value);
+    job.getConfiguration().setBoolean(
+        TeraSortConfigKeys.USE_SIMPLE_PARTITIONER.key(), value);
  }

  public static int getOutputReplication(JobContext job) {
-    return job.getConfiguration().getInt(OUTPUT_REPLICATION, 1);
+    return job.getConfiguration().getInt(
+        TeraSortConfigKeys.OUTPUT_REPLICATION.key(),
+        TeraSortConfigKeys.DEFAULT_OUTPUT_REPLICATION);
  }

  public static void setOutputReplication(Job job, int value) {
-    job.getConfiguration().setInt(OUTPUT_REPLICATION, value);
+    job.getConfiguration().setInt(TeraSortConfigKeys.OUTPUT_REPLICATION.key(),
+        value);
+  }
+
+  /**
+   * Prints the command-line synopsis followed by one line per
+   * {@link TeraSortConfigKeys} constant (its {@code toString()} form) to
+   * standard error.
+   *
+   * Only writes to {@code System.err}, so no checked exception is declared.
+   */
+  private static void usage() {
+    System.err.println("Usage: terasort [-Dproperty=value] <in> <out>");
+    System.err.println("TeraSort configurations are:");
+    for (TeraSortConfigKeys configKey : TeraSortConfigKeys.values()) {
+      System.err.println(configKey.toString());
+    }
   }

  public int run(String[] args) throws Exception {
+    if (args.length != 2) {
+      usage();
+      return 2;
+    }
    LOG.info("starting");
    Job job = Job.getInstance(getConf());
    Path inputDir = new Path(args[0]);
--- a/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/terasort/TeraSortConfigKeys.java
+++ b/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/terasort/TeraSortConfigKeys.java
@ -0,0 +1,77 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.examples.terasort;
+
+import org.apache.hadoop.classification.InterfaceAudience.Private;
+import org.apache.hadoop.classification.InterfaceStability.Unstable;
+
+/**
+ * <p>
+ * TeraSort configurations. Each constant pairs a configuration property name
+ * (see {@link #key()}) with a one-line description; {@link #toString()}
+ * renders both on a single line. The {@code DEFAULT_*} constants hold the
+ * fallback values for the corresponding keys.
+ * </p>
+ */
+@Private
+@Unstable
+public enum TeraSortConfigKeys {
+
+  NUM_ROWS("mapreduce.terasort.num-rows",
+      "Number of rows to generate during teragen."),
+
+  NUM_PARTITIONS("mapreduce.terasort.num.partitions",
+      "Number of partitions used for sampling."),
+
+  // NOTE(review): TeraInputFormat divides this value by the number of sampled
+  // splits, so it appears to be a total rather than a per-partition figure.
+  // The description string is user-facing output and is left unchanged.
+  SAMPLE_SIZE("mapreduce.terasort.partitions.sample",
+      "Sample size for each partition."),
+
+  FINAL_SYNC_ATTRIBUTE("mapreduce.terasort.final.sync",
+      "Perform a disk-persisting hsync at end of every file-write."),
+
+  USE_TERA_SCHEDULER("mapreduce.terasort.use.terascheduler",
+      "Use TeraScheduler for computing input split distribution."),
+
+  USE_SIMPLE_PARTITIONER("mapreduce.terasort.simplepartitioner",
+      "Use SimplePartitioner instead of TotalOrderPartitioner."),
+
+  OUTPUT_REPLICATION("mapreduce.terasort.output.replication",
+      "Replication factor to use for output data files.");
+
+  /** Configuration property name; set once in the constructor. */
+  private final String confName;
+  /** Human-readable explanation of the property; set once in the constructor. */
+  private final String description;
+
+  /**
+   * @param configName the configuration property name
+   * @param description one-line description of the property
+   */
+  TeraSortConfigKeys(String configName, String description) {
+    this.confName = configName;
+    this.description = description;
+  }
+
+  /**
+   * @return the configuration property name for this key
+   */
+  public String key() {
+    return this.confName;
+  }
+
+  /**
+   * @return the property name in angle brackets, five spaces, then the
+   *         description
+   */
+  @Override
+  public String toString() {
+    return "<" + confName + ">     " + description;
+  }
+
+  // Fallback values for the keys above.
+  public static final long DEFAULT_NUM_ROWS = 0L;
+  public static final int DEFAULT_NUM_PARTITIONS = 10;
+  public static final long DEFAULT_SAMPLE_SIZE = 100000L;
+  public static final boolean DEFAULT_FINAL_SYNC_ATTRIBUTE = false;
+  public static final boolean DEFAULT_USE_TERA_SCHEDULER = true;
+  public static final boolean DEFAULT_USE_SIMPLE_PARTITIONER = false;
+  public static final int DEFAULT_OUTPUT_REPLICATION = 1;
+}
--- a/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/test/java/org/apache/hadoop/examples/terasort/TestTeraSort.java
+++ b/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/test/java/org/apache/hadoop/examples/terasort/TestTeraSort.java
@ -104,4 +104,9 @@ public class TestTeraSort extends HadoopTestCase {
      TERA_OUTPUT_PATH);
  }

+  /**
+   * TeraSort.run must return exit code 2 (after printing usage) whenever it
+   * is not given exactly two arguments.
+   */
+  public void testTeraSortWithLessThanTwoArgs() throws Exception {
+    // assertEquals takes (expected, actual); keep that order so a failure
+    // message reports the right values.
+    assertEquals(2, new TeraSort().run(new String[1]));
+    // Boundary cases on either side of the required argument count.
+    assertEquals(2, new TeraSort().run(new String[0]));
+    assertEquals(2, new TeraSort().run(new String[3]));
+  }
+
 }