diff --git a/hadoop-mapreduce-project/CHANGES.txt b/hadoop-mapreduce-project/CHANGES.txt
index 893096d18d9..833043bf1c0 100644
--- a/hadoop-mapreduce-project/CHANGES.txt
+++ b/hadoop-mapreduce-project/CHANGES.txt
@@ -155,6 +155,9 @@ Release 2.0.3-alpha - Unreleased
MAPREDUCE-3678. The Map tasks logs should have the value of input
split it processed. (harsh)
+ MAPREDUCE-4616. Improve javadoc for MultipleOutputs. (Tony Burton via
+ acmurthy)
+
OPTIMIZATIONS
BUG FIXES
diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/LazyOutputFormat.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/LazyOutputFormat.java
index 2619e207358..c6c49fa6f5a 100644
--- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/LazyOutputFormat.java
+++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/LazyOutputFormat.java
@@ -32,7 +32,10 @@ import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.util.ReflectionUtils;
/**
- * A Convenience class that creates output lazily.
+ * A Convenience class that creates output lazily.
+ * Use in conjunction with org.apache.hadoop.mapreduce.lib.output.MultipleOutputs to recreate the
+ * behaviour of org.apache.hadoop.mapred.lib.MultipleTextOutputFormat (etc) of the old Hadoop API.
+ * See {@link MultipleOutputs} documentation for more information.
*/
@InterfaceAudience.Public
@InterfaceStability.Stable
diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/MultipleOutputs.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/MultipleOutputs.java
index 0db94e0475d..7974b78fb89 100644
--- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/MultipleOutputs.java
+++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/MultipleOutputs.java
@@ -20,7 +20,10 @@ package org.apache.hadoop.mapreduce.lib.output;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.*;
+import org.apache.hadoop.mapreduce.Reducer.Context;
+import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
import org.apache.hadoop.util.ReflectionUtils;
@@ -37,6 +40,7 @@ import java.util.*;
* Each additional output, or named output, may be configured with its own
* OutputFormat
, with its own key class and with its own value
* class.
+ *
* Case two: to write data to different files provided by user @@ -107,6 +111,64 @@ import java.util.*; * * } * + * + *
+ * When used in conjunction with org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat, + * MultipleOutputs can mimic the behaviour of MultipleTextOutputFormat and MultipleSequenceFileOutputFormat + * from the old Hadoop API - i.e., output can be written from the Reducer to more than one location. + *
+ * + *
+ * Use MultipleOutputs.write(KEYOUT key, VALUEOUT value, String baseOutputPath)
to write key and
+ * value to a path specified by baseOutputPath
, with no need to specify a named output:
+ *
+ * private MultipleOutputs<Text, Text> out; + * + * public void setup(Context context) { + * out = new MultipleOutputs<Text, Text>(context); + * ... + * } + * + * public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException { + * for (Text t : values) { + * out.write(key, t, generateFileName(<parameter list...>)); + * } + * } + * + * protected void cleanup(Context context) throws IOException, InterruptedException { + * out.close(); + * } + *
+ * Use your own code in generateFileName()
to create a custom path to your results.
+ * '/' characters in baseOutputPath
will be translated into directory levels in your file system.
+ * Also, append your custom-generated path with "part" or similar, otherwise your output will be -00000, -00001 etc.
+ * No call to context.write()
is necessary. See example generateFileName()
code below.
+ *
+ * private String generateFileName(Text k) { + * // expect Text k in format "Surname|Forename" + * String[] kStr = k.toString().split("\\|"); + * + * String sName = kStr[0]; + * String fName = kStr[1]; + * + * // example for k = Smith|John + * // output written to /user/hadoop/path/to/output/Smith/John-r-00000 (etc) + * return sName + "/" + fName; + * } + *+ * + *
+ * Using MultipleOutputs in this way will still create zero-sized default output, e.g. part-00000.
+ * To prevent this use LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);
+ * instead of job.setOutputFormatClass(TextOutputFormat.class);
in your Hadoop job configuration.
+ *