diff --git a/CHANGES.txt b/CHANGES.txt
index abaafd62a34..f9410d0ce8d 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -8,6 +8,7 @@ Release 0.21.0 - Unreleased
   HBASE-1737 Regions unbalanced when adding new node (recommit)
   HBASE-1792 [Regression] Cannot save timestamp in the future
   HBASE-1793 [Regression] HTable.get/getRow with a ts is broken
+  HBASE-1698 Review documentation for o.a.h.h.mapreduce

 IMPROVEMENTS
   HBASE-1760 Cleanup TODOs in HTable

diff --git a/src/java/org/apache/hadoop/hbase/mapreduce/package-info.java b/src/java/org/apache/hadoop/hbase/mapreduce/package-info.java
index 2158af480e7..528ddfbfeb8 100644
--- a/src/java/org/apache/hadoop/hbase/mapreduce/package-info.java
+++ b/src/java/org/apache/hadoop/hbase/mapreduce/package-info.java
@@ -33,41 +33,34 @@ Input/OutputFormats, a table indexing MapReduce job, and utility
MapReduce jobs deployed to a MapReduce cluster do not by default have access
to the HBase configuration under $HBASE_CONF_DIR nor to HBase classes.
You could add hbase-site.xml to $HADOOP_HOME/conf and add
-hbase-X.X.X.jar to the $HADOOP_HOME/lib and copy these
-changes across your cluster but the cleanest means of adding hbase configuration
+hbase jars to the $HADOOP_HOME/lib and copy these
+changes across your cluster but a cleaner means of adding hbase configuration
and classes to the cluster CLASSPATH is by uncommenting
HADOOP_CLASSPATH in $HADOOP_HOME/conf/hadoop-env.sh
-and adding the path to the hbase jar and $HBASE_CONF_DIR directory.
-Then copy the amended configuration around the cluster.
-You'll probably need to restart the MapReduce cluster if you want it to notice
-the new configuration.
-
-For example, here is how you would amend hadoop-env.sh adding the
-built hbase jar, hbase conf, and the PerformanceEvaluation class from
-the built hbase test jar to the hadoop CLASSPATH:
+adding hbase dependencies here. For example, here is how you would amend
+hadoop-env.sh adding the
+built hbase jar, zookeeper (needed by hbase client), hbase conf, and the
+PerformanceEvaluation class from the built hbase test jar to the
+hadoop CLASSPATH:

# Extra Java CLASSPATH elements. Optional.
# export HADOOP_CLASSPATH=
-export HADOOP_CLASSPATH=$HBASE_HOME/build/test:$HBASE_HOME/build/hbase-X.X.X.jar:$HBASE_HOME/build/hbase-X.X.X-test.jar:$HBASE_HOME/conf
+export HADOOP_CLASSPATH=$HBASE_HOME/build/hbase-X.X.X.jar:$HBASE_HOME/build/hbase-X.X.X-test.jar:$HBASE_HOME/conf:${HBASE_HOME}/lib/zookeeper-X.X.X.jar

Expand $HBASE_HOME in the above appropriately to suit your
local environment.

-After copying the above change around your cluster, this is how you would run
-the PerformanceEvaluation MR job to put up 4 clients (Presumes a ready mapreduce
-cluster):
+After copying the above change around your cluster (and restarting), this is
+how you would run the PerformanceEvaluation MR job to put up 4 clients (Presumes
+a ready mapreduce cluster):

$HADOOP_HOME/bin/hadoop org.apache.hadoop.hbase.PerformanceEvaluation sequentialWrite 4

-The PerformanceEvaluation class wil be found on the CLASSPATH because you
-added $HBASE_HOME/build/test to HADOOP_CLASSPATH
-
Another possibility, if for example you do not have access to hadoop-env.sh or
-are unable to restart the hadoop cluster, is bundling the hbase jar into a mapreduce
+are unable to restart the hadoop cluster, is bundling the hbase jars into a mapreduce
job jar adding it and its dependencies under the job jar lib/
-directory and the hbase conf into a job jar conf/ directory.
+directory and the hbase conf into the job jar's top-level directory.
org.apache.hadoop.hbase.mapreduce.TestTableMapReduce unit test.
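To make the job-jar alternative above concrete, here is one possible layout for such a jar. This is illustrative only and not part of this patch; the jar name, job class, and exact dependency versions are hypothetical and depend on your build:

myjob.jar
  com/example/MyDriver.class      (your job classes)
  hbase-site.xml                  (hbase conf at the top level of the jar)
  lib/hbase-X.X.X.jar             (hbase and its dependencies under lib/)
  lib/zookeeper-X.X.X.jar

Hadoop unpacks the job jar on each task node and puts the unpacked directory and the jars under lib/ on the task CLASSPATH, which is how the hbase classes and the bundled hbase-site.xml get picked up without touching hadoop-env.sh.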
@@ -106,162 +99,22 @@ to have lots of reducers so load is spread across the hbase cluster.
currently existing regions. The
{@link org.apache.hadoop.hbase.mapreduce.HRegionPartitioner} is suitable
when your table is large and your upload is not such that it will greatly
-alter the number of existing regions when done; other use the default
+alter the number of existing regions when done; otherwise use the default
partitioner.
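For illustration only (not part of this patch), here is a minimal sketch of a bulk-upload job wired as described above: a mapper parses tab-separated text into Puts, the IdentityTableReducer writes them to the table, and HRegionPartitioner is set when the upload will not greatly change the region count. The class names, the 16-reducer count, and the three-field line layout are invented for the example; it assumes the 0.21-era org.apache.hadoop.hbase.mapreduce API (TableMapReduceUtil, IdentityTableReducer), so check the exact signatures against your build.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HRegionPartitioner;
import org.apache.hadoop.hbase.mapreduce.IdentityTableReducer;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

/** Sketch of an upload job: args[0] = input path, args[1] = target table. */
public class TsvUpload {

  /** Parses "row <TAB> family:qualifier <TAB> value" lines (layout is hypothetical) into Puts. */
  static class TsvMapper extends Mapper<LongWritable, Text, ImmutableBytesWritable, Put> {
    @Override
    protected void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
      String[] fields = value.toString().split("\t");
      if (fields.length != 3) return;                 // skip malformed lines
      byte[] row = Bytes.toBytes(fields[0]);
      String[] column = fields[1].split(":", 2);
      Put put = new Put(row);
      put.add(Bytes.toBytes(column[0]),
          Bytes.toBytes(column.length > 1 ? column[1] : ""),
          Bytes.toBytes(fields[2]));
      context.write(new ImmutableBytesWritable(row), put);
    }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new HBaseConfiguration();    // reads hbase-site.xml from the CLASSPATH
    Job job = new Job(conf, "tsv-upload");
    job.setJarByClass(TsvUpload.class);
    job.setInputFormatClass(TextInputFormat.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    job.setMapperClass(TsvMapper.class);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(Put.class);
    // Wire the reduce side to write into the table named in args[1].
    TableMapReduceUtil.initTableReducerJob(args[1], IdentityTableReducer.class, job);
    job.setNumReduceTasks(16);                        // spread load across region servers
    // Only when the upload will not greatly change the region count; otherwise
    // leave the default partitioner in place, as the text above advises.
    job.setPartitionerClass(HRegionPartitioner.class);
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}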
-See {@link org.apache.hadoop.hbase.mapreduce.RowCounter}. You should be able to run
+See {@link org.apache.hadoop.hbase.mapreduce.RowCounter}. This job uses
+{@link org.apache.hadoop.hbase.mapreduce.TableInputFormat TableInputFormat} and
+does a count of all rows in the specified table.
+You should be able to run
it by doing: % ./bin/hadoop jar hbase-X.X.X.jar. This will invoke
the hbase MapReduce Driver class. Select 'rowcounter' from the choice of jobs
-offered. You may need to add the hbase conf directory to $HADOOP_HOME/conf/hadoop-env.sh#HADOOP_CLASSPATH
+offered. This will emit rowcounter 'usage'. Specify tablename, column to count
+and output directory. You may need to add the hbase conf directory to $HADOOP_HOME/conf/hadoop-env.sh#HADOOP_CLASSPATH
so the rowcounter gets pointed at the right hbase cluster (or, build a new jar
with an appropriate hbase-site.xml built into your job jar).
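As an illustration, an invocation might look like the line below. The argument order is only a guess from the description above, so run the driver with no arguments first and follow the usage text it prints for your build:

$HADOOP_HOME/bin/hadoop jar $HBASE_HOME/build/hbase-X.X.X.jar rowcounter <outputdir> <tablename> <columnfamily:qualifier>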
See org.apache.hadoop.hbase.PerformanceEvaluation from hbase src/test. It runs
a mapreduce job to run concurrent clients reading and writing hbase.
-
-A students/classes example based on a contribution by Naama Kraus with logs of
-documentation can be found over in src/examples/mapred.
-Its the org.apache.hadoop.hbase.mapreduce.SampleUploader class.
-Just copy it under src/java/org/apache/hadoop/hbase/mapred to compile and try it
-(until we start generating an hbase examples jar). The class reads a data file
-from HDFS and per line, does an upload to HBase using TableReduce.
-Read the class comment for specification of inputs, prerequisites, etc.
-
-Here's a sample program from Allen Day that takes an HDFS text file path and
-an HBase table name as inputs, and loads the contents of the text file to the
-table all up in the map phase.
-
-package com.spicylogic.hbase;
-import java.io.IOException;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hbase.HBaseConfiguration;
-import org.apache.hadoop.hbase.client.HTable;
-import org.apache.hadoop.hbase.io.BatchUpdate;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapred.FileInputFormat;
-import org.apache.hadoop.mapred.JobClient;
-import org.apache.hadoop.mapred.JobConf;
-import org.apache.hadoop.mapred.MapReduceBase;
-import org.apache.hadoop.mapred.Mapper;
-import org.apache.hadoop.mapred.OutputCollector;
-import org.apache.hadoop.mapred.Reporter;
-import org.apache.hadoop.mapred.lib.NullOutputFormat;
-import org.apache.hadoop.util.Tool;
-import org.apache.hadoop.util.ToolRunner;
-
-/**
- * Class that adds the parsed line from the input to hbase
- * in the map function. Map has no emissions and job
- * has no reduce.
- */
-public class BulkImport implements Tool {
-  private static final String NAME = "BulkImport";
-  private Configuration conf;
-
-  public static class InnerMap extends MapReduceBase implements Mapper<LongWritable, Text, Text, Text> {
-    private HTable table;
-    private HBaseConfiguration HBconf;
-
-    public void map(LongWritable key, Text value,
-        OutputCollector<Text, Text> output, Reporter reporter)
-    throws IOException {
-      if ( table == null )
-        throw new IOException("table is null");
-
-      // Split input line on tab character
-      String [] splits = value.toString().split("\t");
-      if ( splits.length != 4 )
-        return;
-
-      String rowID = splits[0];
-      int timestamp = Integer.parseInt( splits[1] );
-      String colID = splits[2];
-      String cellValue = splits[3];
-
-      reporter.setStatus("Map emitting cell for row='" + rowID +
-        "', column='" + colID + "', time='" + timestamp + "'");
-
-      BatchUpdate bu = new BatchUpdate( rowID );
-      if ( timestamp > 0 )
-        bu.setTimestamp( timestamp );
-
-      bu.put(colID, cellValue.getBytes());
-      table.commit( bu );
-    }
-
-    public void configure(JobConf job) {
-      HBconf = new HBaseConfiguration(job);
-      try {
-        table = new HTable( HBconf, job.get("input.table") );
-      } catch (IOException e) {
-        // TODO Auto-generated catch block
-        e.printStackTrace();
-      }
-    }
-  }
-
-  public JobConf createSubmittableJob(String[] args) {
-    JobConf c = new JobConf(getConf(), BulkImport.class);
-    c.setJobName(NAME);
-    FileInputFormat.setInputPaths(c, new Path(args[0]));
-
-    c.set("input.table", args[1]);
-    c.setMapperClass(InnerMap.class);
-    c.setNumReduceTasks(0);
-    c.setOutputFormat(NullOutputFormat.class);
-    return c;
-  }
-
-  static int printUsage() {
-    System.err.println("Usage: " + NAME + " <input> <table_name>");
-    System.err.println("\twhere <input> is a tab-delimited text file with 4 columns.");
-    System.err.println("\t\tcolumn 1 = row ID");
-    System.err.println("\t\tcolumn 2 = timestamp (use a negative value for current time)");
-    System.err.println("\t\tcolumn 3 = column ID");
-    System.err.println("\t\tcolumn 4 = cell value");
-    return -1;
-  }
-
-  public int run(@SuppressWarnings("unused") String[] args) throws Exception {
-    // Make sure there are exactly 3 parameters left.
-    if (args.length != 2) {
-      return printUsage();
-    }
-    JobClient.runJob(createSubmittableJob(args));
-    return 0;
-  }
-
-  public Configuration getConf() {
-    return this.conf;
-  }
-
-  public void setConf(final Configuration c) {
-    this.conf = c;
-  }
-
-  public static void main(String[] args) throws Exception {
-    int errCode = ToolRunner.run(new Configuration(), new BulkImport(), args);
-    System.exit(errCode);
-  }
-}
-
*/
package org.apache.hadoop.hbase.mapreduce;