From 4e80c4cd5ec48c1325f1c8a4e7771cf7137682db Mon Sep 17 00:00:00 2001 From: Harsh J Date: Wed, 18 Mar 2015 15:34:44 +0530 Subject: [PATCH] MAPREDUCE-5807. Print usage for TeraSort job. Contributed by Rohith. (cherry picked from commit 9d72f939759f407796ecb4715c2dc2f0d36d5578) --- hadoop-mapreduce-project/CHANGES.txt | 2 + .../hadoop/examples/terasort/TeraGen.java | 6 +- .../examples/terasort/TeraInputFormat.java | 16 ++-- .../examples/terasort/TeraOutputFormat.java | 8 +- .../examples/terasort/TeraScheduler.java | 1 - .../hadoop/examples/terasort/TeraSort.java | 28 +++++-- .../examples/terasort/TeraSortConfigKeys.java | 77 +++++++++++++++++++ .../examples/terasort/TestTeraSort.java | 5 ++ 8 files changed, 123 insertions(+), 20 deletions(-) create mode 100644 hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/terasort/TeraSortConfigKeys.java diff --git a/hadoop-mapreduce-project/CHANGES.txt b/hadoop-mapreduce-project/CHANGES.txt index abcfe8c875b..5cd974cf6b0 100644 --- a/hadoop-mapreduce-project/CHANGES.txt +++ b/hadoop-mapreduce-project/CHANGES.txt @@ -8,6 +8,8 @@ Release 2.8.0 - UNRELEASED IMPROVEMENTS + MAPREDUCE-5807. Print usage by TeraSort job. (Rohith via harsh) + MAPREDUCE-4653. TestRandomAlgorithm has an unused "import" statement. (Amir Sanjar via harsh) diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/terasort/TeraGen.java b/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/terasort/TeraGen.java index e8b65038e79..d7d751a6f6f 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/terasort/TeraGen.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/terasort/TeraGen.java @@ -70,7 +70,6 @@ public class TeraGen extends Configured implements Tool { public static enum Counters {CHECKSUM} - public static final String NUM_ROWS = "mapreduce.terasort.num-rows"; /** * An input format that assigns ranges of longs to each mapper. */ @@ -189,11 +188,12 @@ public class TeraGen extends Configured implements Tool { } static long getNumberOfRows(JobContext job) { - return job.getConfiguration().getLong(NUM_ROWS, 0); + return job.getConfiguration().getLong(TeraSortConfigKeys.NUM_ROWS.key(), + TeraSortConfigKeys.DEFAULT_NUM_ROWS); } static void setNumberOfRows(Job job, long numRows) { - job.getConfiguration().setLong(NUM_ROWS, numRows); + job.getConfiguration().setLong(TeraSortConfigKeys.NUM_ROWS.key(), numRows); } /** diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/terasort/TeraInputFormat.java b/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/terasort/TeraInputFormat.java index 88b12dd1ff4..20ce8ef2b60 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/terasort/TeraInputFormat.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/terasort/TeraInputFormat.java @@ -50,10 +50,6 @@ import org.apache.hadoop.util.StringUtils; public class TeraInputFormat extends FileInputFormat { static final String PARTITION_FILENAME = "_partition.lst"; - private static final String NUM_PARTITIONS = - "mapreduce.terasort.num.partitions"; - private static final String SAMPLE_SIZE = - "mapreduce.terasort.partitions.sample"; static final int KEY_LENGTH = 10; static final int VALUE_LENGTH = 90; static final int RECORD_LENGTH = KEY_LENGTH + VALUE_LENGTH; @@ -123,11 +119,16 @@ public class TeraInputFormat extends FileInputFormat { final TeraInputFormat inFormat = new TeraInputFormat(); final TextSampler sampler = new TextSampler(); int partitions = job.getNumReduceTasks(); - long sampleSize = conf.getLong(SAMPLE_SIZE, 100000); + long sampleSize = + conf.getLong(TeraSortConfigKeys.SAMPLE_SIZE.key(), + TeraSortConfigKeys.DEFAULT_SAMPLE_SIZE); final List splits = inFormat.getSplits(job); long t2 = System.currentTimeMillis(); System.out.println("Computing input splits took " + (t2 - t1) + "ms"); - int samples = Math.min(conf.getInt(NUM_PARTITIONS, 10), splits.size()); + int samples = + Math.min(conf.getInt(TeraSortConfigKeys.NUM_PARTITIONS.key(), + TeraSortConfigKeys.DEFAULT_NUM_PARTITIONS), + splits.size()); System.out.println("Sampling " + samples + " splits of " + splits.size()); final long recordsPerSample = sampleSize / samples; final int sampleStep = splits.size() / samples; @@ -294,7 +295,8 @@ public class TeraInputFormat extends FileInputFormat { lastResult = super.getSplits(job); t2 = System.currentTimeMillis(); System.out.println("Spent " + (t2 - t1) + "ms computing base-splits."); - if (job.getConfiguration().getBoolean(TeraScheduler.USE, true)) { + if (job.getConfiguration().getBoolean(TeraSortConfigKeys.USE_TERA_SCHEDULER.key(), + TeraSortConfigKeys.DEFAULT_USE_TERA_SCHEDULER)) { TeraScheduler scheduler = new TeraScheduler( lastResult.toArray(new FileSplit[0]), job.getConfiguration()); lastResult = scheduler.getNewFileSplits(); diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/terasort/TeraOutputFormat.java b/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/terasort/TeraOutputFormat.java index 5814c8db689..915acde4f90 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/terasort/TeraOutputFormat.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/terasort/TeraOutputFormat.java @@ -40,21 +40,23 @@ import org.apache.hadoop.mapreduce.security.TokenCache; * An output format that writes the key and value appended together. */ public class TeraOutputFormat extends FileOutputFormat { - static final String FINAL_SYNC_ATTRIBUTE = "mapreduce.terasort.final.sync"; private OutputCommitter committer = null; /** * Set the requirement for a final sync before the stream is closed. */ static void setFinalSync(JobContext job, boolean newValue) { - job.getConfiguration().setBoolean(FINAL_SYNC_ATTRIBUTE, newValue); + job.getConfiguration().setBoolean( + TeraSortConfigKeys.FINAL_SYNC_ATTRIBUTE.key(), newValue); } /** * Does the user want a final sync at close? */ public static boolean getFinalSync(JobContext job) { - return job.getConfiguration().getBoolean(FINAL_SYNC_ATTRIBUTE, false); + return job.getConfiguration().getBoolean( + TeraSortConfigKeys.FINAL_SYNC_ATTRIBUTE.key(), + TeraSortConfigKeys.DEFAULT_FINAL_SYNC_ATTRIBUTE); } static class TeraRecordWriter extends RecordWriter { diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/terasort/TeraScheduler.java b/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/terasort/TeraScheduler.java index 7095dd7d28e..30b50d8cf15 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/terasort/TeraScheduler.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/terasort/TeraScheduler.java @@ -31,7 +31,6 @@ import org.apache.hadoop.mapreduce.server.tasktracker.TTConfig; import com.google.common.base.Charsets; class TeraScheduler { - static String USE = "mapreduce.terasort.use.terascheduler"; private static final Log LOG = LogFactory.getLog(TeraScheduler.class); private Split[] splits; private List hosts = new ArrayList(); diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/terasort/TeraSort.java b/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/terasort/TeraSort.java index 6022f6c5026..fe771fc46d1 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/terasort/TeraSort.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/terasort/TeraSort.java @@ -48,8 +48,6 @@ import org.apache.hadoop.util.ToolRunner; */ public class TeraSort extends Configured implements Tool { private static final Log LOG = LogFactory.getLog(TeraSort.class); - static String SIMPLE_PARTITIONER = "mapreduce.terasort.simplepartitioner"; - static String OUTPUT_REPLICATION = "mapreduce.terasort.output.replication"; /** * A partitioner that splits text keys into roughly equal partitions @@ -262,22 +260,40 @@ public class TeraSort extends Configured implements Tool { } public static boolean getUseSimplePartitioner(JobContext job) { - return job.getConfiguration().getBoolean(SIMPLE_PARTITIONER, false); + return job.getConfiguration().getBoolean( + TeraSortConfigKeys.USE_SIMPLE_PARTITIONER.key(), + TeraSortConfigKeys.DEFAULT_USE_SIMPLE_PARTITIONER); } public static void setUseSimplePartitioner(Job job, boolean value) { - job.getConfiguration().setBoolean(SIMPLE_PARTITIONER, value); + job.getConfiguration().setBoolean( + TeraSortConfigKeys.USE_SIMPLE_PARTITIONER.key(), value); } public static int getOutputReplication(JobContext job) { - return job.getConfiguration().getInt(OUTPUT_REPLICATION, 1); + return job.getConfiguration().getInt( + TeraSortConfigKeys.OUTPUT_REPLICATION.key(), + TeraSortConfigKeys.DEFAULT_OUTPUT_REPLICATION); } public static void setOutputReplication(Job job, int value) { - job.getConfiguration().setInt(OUTPUT_REPLICATION, value); + job.getConfiguration().setInt(TeraSortConfigKeys.OUTPUT_REPLICATION.key(), + value); + } + + private static void usage() throws IOException { + System.err.println("Usage: terasort [-Dproperty=value] "); + System.err.println("TeraSort configurations are:"); + for (TeraSortConfigKeys teraSortConfigKeys : TeraSortConfigKeys.values()) { + System.err.println(teraSortConfigKeys.toString()); + } } public int run(String[] args) throws Exception { + if (args.length != 2) { + usage(); + return 2; + } LOG.info("starting"); Job job = Job.getInstance(getConf()); Path inputDir = new Path(args[0]); diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/terasort/TeraSortConfigKeys.java b/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/terasort/TeraSortConfigKeys.java new file mode 100644 index 00000000000..0822a50f2db --- /dev/null +++ b/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/terasort/TeraSortConfigKeys.java @@ -0,0 +1,77 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.examples.terasort; + +import org.apache.hadoop.classification.InterfaceAudience.Private; +import org.apache.hadoop.classification.InterfaceStability.Unstable; + +/** + *

+ * TeraSort configurations. + *

+ */ +@Private +@Unstable +public enum TeraSortConfigKeys { + + NUM_ROWS("mapreduce.terasort.num-rows", + "Number of rows to generate during teragen."), + + NUM_PARTITIONS("mapreduce.terasort.num.partitions", + "Number of partitions used for sampling."), + + SAMPLE_SIZE("mapreduce.terasort.partitions.sample", + "Sample size for each partition."), + + FINAL_SYNC_ATTRIBUTE("mapreduce.terasort.final.sync", + "Perform a disk-persisting hsync at end of every file-write."), + + USE_TERA_SCHEDULER("mapreduce.terasort.use.terascheduler", + "Use TeraScheduler for computing input split distribution."), + + USE_SIMPLE_PARTITIONER("mapreduce.terasort.simplepartitioner", + "Use SimplePartitioner instead of TotalOrderPartitioner."), + + OUTPUT_REPLICATION("mapreduce.terasort.output.replication", + "Replication factor to use for output data files."); + + private String confName; + private String description; + + TeraSortConfigKeys(String configName, String description) { + this.confName = configName; + this.description = description; + } + + public String key() { + return this.confName; + } + + public String toString() { + return "<" + confName + "> " + description; + } + + public static final long DEFAULT_NUM_ROWS = 0L; + public static final int DEFAULT_NUM_PARTITIONS = 10; + public static final long DEFAULT_SAMPLE_SIZE = 100000L; + public static final boolean DEFAULT_FINAL_SYNC_ATTRIBUTE = false; + public static final boolean DEFAULT_USE_TERA_SCHEDULER = true; + public static final boolean DEFAULT_USE_SIMPLE_PARTITIONER = false; + public static final int DEFAULT_OUTPUT_REPLICATION = 1; +} diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/test/java/org/apache/hadoop/examples/terasort/TestTeraSort.java b/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/test/java/org/apache/hadoop/examples/terasort/TestTeraSort.java index 1956872b7ed..349208999d3 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/test/java/org/apache/hadoop/examples/terasort/TestTeraSort.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/test/java/org/apache/hadoop/examples/terasort/TestTeraSort.java @@ -104,4 +104,9 @@ public class TestTeraSort extends HadoopTestCase { TERA_OUTPUT_PATH); } + public void testTeraSortWithLessThanTwoArgs() throws Exception { + String[] args = new String[1]; + assertEquals(new TeraSort().run(args), 2); + } + }