diff --git a/src/java/org/apache/hadoop/hbase/mapred/package-info.java b/src/java/org/apache/hadoop/hbase/mapred/package-info.java
new file mode 100644
index 00000000000..bab24b1f9da
--- /dev/null
+++ b/src/java/org/apache/hadoop/hbase/mapred/package-info.java
@@ -0,0 +1,198 @@
+/**
+Provides HBase MapReduce
+Input/OutputFormats, a table indexing MapReduce job, and utility methods.
+
+MapReduce jobs deployed to a MapReduce cluster do not by default have access
+to the HBase configuration under $HBASE_CONF_DIR nor to HBase classes.
+You could add hbase-site.xml to $HADOOP_HOME/conf and add
+hbase-X.X.X.jar to $HADOOP_HOME/lib and copy these changes
+across your cluster, but the cleanest means of adding hbase configuration and
+classes to the cluster CLASSPATH is by uncommenting
+HADOOP_CLASSPATH in $HADOOP_HOME/conf/hadoop-env.sh
+and adding the path to the hbase jar and the $HBASE_CONF_DIR directory.
+Then copy the amended configuration around the cluster.
+You'll probably need to restart the MapReduce cluster if you want it to notice
+the new configuration (You may not have to).
+
+For example, here is how you would amend hadoop-env.sh adding
+the hbase jar, conf, and the PerformanceEvaluation class from hbase test
+classes to the hadoop CLASSPATH:
+
+# Extra Java CLASSPATH elements. Optional.
+# export HADOOP_CLASSPATH=
+export HADOOP_CLASSPATH=$HBASE_HOME/build/test:$HBASE_HOME/build/hbase-X.X.X.jar:$HBASE_HOME/conf
+
+Expand $HBASE_HOME in the above in accordance with your local environment.
+
+This is how you would run the PerformanceEvaluation MR job to put up 4 clients.
+The PerformanceEvaluation class will be found on the CLASSPATH because you
+added $HBASE_HOME/build/test to HADOOP_CLASSPATH:
+
+$HADOOP_HOME/bin/hadoop org.apache.hadoop.hbase.PerformanceEvaluation sequentialWrite 4
+HBase can be used as a data source, {@link org.apache.hadoop.hbase.mapred.TableInputFormat TableInputFormat},
+and data sink, {@link org.apache.hadoop.hbase.mapred.TableOutputFormat TableOutputFormat}, for MapReduce jobs.
+Writing MapReduce jobs that read or write HBase, you'll probably want to subclass
+{@link org.apache.hadoop.hbase.mapred.TableMap TableMap} and/or
+{@link org.apache.hadoop.hbase.mapred.TableReduce TableReduce}. See the do-nothing
+pass-through classes {@link org.apache.hadoop.hbase.mapred.IdentityTableMap IdentityTableMap} and
+{@link org.apache.hadoop.hbase.mapred.IdentityTableReduce IdentityTableReduce} for basic usage. For a more
+involved example, see {@link org.apache.hadoop.hbase.mapred.BuildTableIndex BuildTableIndex}
+or review the org.apache.hadoop.hbase.mapred.TestTableMapReduce unit test.
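+
+For instance, a bare-bones pass-through reducer in the spirit of
+IdentityTableReduce might look like the following sketch (the class name here
+is made up; the shape mirrors the TableUploader in the sample code further
+down this page):
+
+  public static class PassThroughTableReduce
+  extends TableReduce<Text, MapWritable> {
+    @Override
+    public void reduce(Text key, Iterator values,
+      OutputCollector output, Reporter reporter)
+    throws IOException {
+      // Emit each (row, columns-to-cells map) pair unchanged; the configured
+      // table sink takes care of the actual writes.
+      while (values.hasNext()) {
+        output.collect(key, values.next());
+      }
+    }
+  }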
+
+Running mapreduce jobs that have hbase as source or sink, you'll need to
+specify source/sink table and column names in your configuration.
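+
+As a rough sketch of the sink side (assuming the same imports as the sample
+code at the end of this page; the "mytable" table name, /tmp/input.txt path,
+and MyMap/MyTableUpload classes are made-up placeholders), the table to write
+to is named when the job is configured:
+
+  JobConf job = new JobConf(new Configuration(), MyMap.class);
+  job.setJobName("mytable-upload");
+  job.setInputPath(new Path("/tmp/input.txt"));
+  job.setMapperClass(MyMap.class);
+  job.setMapOutputKeyClass(Text.class);
+  job.setMapOutputValueClass(MapWritable.class);
+  // Same call the SampleUploader below uses: names the sink table and the
+  // TableReduce subclass that writes into it.
+  TableReduce.initJob("mytable", MyTableUpload.class, job);
+  JobClient.runJob(job);
+
+Reading is configured analogously: the source table and the column names to
+scan are handed to TableInputFormat through the job configuration; see
+TableMap for the source-side setup.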
+
+Reading from hbase, the TableInputFormat asks hbase for the list of
+regions and makes a map-per-region. Writing, it's better to have lots of
+reducers so load is spread across the hbase cluster.
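+
+On the write side the reduce parallelism is just the standard Hadoop setting,
+for example (continuing the JobConf sketch above; the count is hypothetical,
+pick one that spreads load across your region servers):
+
+  job.setNumReduceTasks(16);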
+
+Read the class comment below for specification of inputs, prerequisites, etc.
+
+*/
+package org.apache.hadoop.hbase.mapred;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.MapWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+
+/**
+ * Sample uploader.
+ *
+ * This is EXAMPLE code. You will need to change it to work for your context.
+ *
+ * Uses TableReduce to put the data into hbase. Change the InputFormat to suit
+ * your data. Use the map to massage the input so it fits hbase. Currently it's
+ * just a pass-through map. In the reduce, you need to output a row and a
+ * map of columns to cells. Change map and reduce to suit your input.
+ *
+ * <p>The below is wired up to handle an input whose format is a text file
+ * which has a line format as follows:
+ * <pre>
+ * row columnname columndata
+ * </pre>
+ *
+ * <p>The table and columnfamily we're to insert into must preexist.
+ *
+ * <p>To run, edit your hadoop-env.sh and add hbase classes and conf to your
+ * HADOOP_CLASSPATH. For example:
+ * <pre>
+ * export HADOOP_CLASSPATH=/Users/stack/Documents/checkouts/hbase/branches/0.1/build/classes:/Users/stack/Documents/checkouts/hbase/branches/0.1/conf
+ * </pre>
+ * <p>Restart your MR cluster after making the above change (You need to
+ * be running in pseudo-distributed mode at a minimum for the hadoop to see
+ * the above additions to your CLASSPATH).
+ *
+ * <p>Start up your hbase cluster.
+ *
+ * <p>Next do the following to start the MR job:
+ * <pre>
+ * ./bin/hadoop org.apache.hadoop.hbase.mapred.SampleUploader /tmp/input.txt TABLE_NAME
+ * </pre>
+ *
+ * <p>This code was written against hbase 0.1 branch.
+ */
+public class SampleUploader extends MapReduceBase
+implements Mapper, Tool {
+  private static final String NAME = "SampleUploader";
+  private Configuration conf;
+
+  public JobConf createSubmittableJob(String[] args) {
+    JobConf c = new JobConf(getConf(), SampleUploader.class);
+    c.setJobName(NAME);
+    c.setInputPath(new Path(args[0]));
+    c.setMapperClass(this.getClass());
+    c.setMapOutputKeyClass(Text.class);
+    c.setMapOutputValueClass(MapWritable.class);
+    c.setReducerClass(TableUploader.class);
+    TableReduce.initJob(args[1], TableUploader.class, c);
+    return c;
+  }
+
+  public void map(LongWritable k, Text v,
+    OutputCollector output, Reporter r)
+  throws IOException {
+    // Lines are space-delimited; first item is row, next the columnname and
+    // then the third the cell value.
+    String tmp = v.toString();
+    if (tmp.length() == 0) {
+      return;
+    }
+    String [] splits = v.toString().split(" ");
+    MapWritable mw = new MapWritable();
+    mw.put(new Text(splits[1]),
+      new ImmutableBytesWritable(splits[2].getBytes()));
+    String row = splits[0];
+    r.setStatus("Map emitting " + row + " for record " + k.toString());
+    output.collect(new Text(row), mw);
+  }
+
+  public static class TableUploader
+  extends TableReduce<Text, MapWritable> {
+    @Override
+    public void reduce(Text k, Iterator v,
+      OutputCollector output, Reporter r)
+    throws IOException {
+      while (v.hasNext()) {
+        r.setStatus("Reducer committing " + k);
+        output.collect(k, v.next());
+      }
+    }
+  }
+
+  static int printUsage() {
+    System.out.println(NAME + " <input> <table_name>");
+    return -1;
+  }
+
+  public int run(@SuppressWarnings("unused") String[] args) throws Exception {
+    // Make sure there are exactly 2 parameters left.
+    if (args.length != 2) {
+      System.out.println("ERROR: Wrong number of parameters: " +
+        args.length + " instead of 2.");
+      return printUsage();
+    }
+    JobClient.runJob(createSubmittableJob(args));
+    return 0;
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  public void setConf(final Configuration c) {
+    this.conf = c;
+  }
+
+  public static void main(String[] args) throws Exception {
+    int errCode = ToolRunner.run(new Configuration(), new SampleUploader(),
+      args);
+    System.exit(errCode);
+  }
+}