From 4c0ff368a21d7f334349dfbf9ab91c9d10a57b6e Mon Sep 17 00:00:00 2001 From: Michael Stack Date: Tue, 15 Mar 2011 22:23:12 +0000 Subject: [PATCH] Use xinclude for chapters git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1081966 13f79535-47bb-0310-9956-ffa450edef68 --- src/docbkx/book.xml | 1315 +------------------------------- src/docbkx/configuration.xml | 291 +++++++ src/docbkx/getting_started.xml | 853 +++++++++++++++++++++ src/docbkx/performance.xml | 39 + src/docbkx/preface.xml | 27 + src/docbkx/shell.xml | 89 +++ src/docbkx/upgrading.xml | 55 ++ 7 files changed, 1362 insertions(+), 1307 deletions(-) create mode 100644 src/docbkx/configuration.xml create mode 100644 src/docbkx/getting_started.xml create mode 100644 src/docbkx/performance.xml create mode 100644 src/docbkx/preface.xml create mode 100644 src/docbkx/shell.xml create mode 100644 src/docbkx/upgrading.xml diff --git a/src/docbkx/book.xml b/src/docbkx/book.xml index e4b9c6a9491..2ec5c9e61a7 100644 --- a/src/docbkx/book.xml +++ b/src/docbkx/book.xml @@ -62,1285 +62,14 @@ - - Preface + + + + + + - This book aims to be the official guide for the HBase version it ships with. - This document describes HBase version . - Herein you will find either the definitive documentation on an HBase topic - as of its standing when the referenced HBase version shipped, or - this book will point to the location in javadoc, - JIRA - or wiki - where the pertinent information can be found. - This book is a work in progress. It is lacking in many areas but we - hope to fill in the holes with time. Feel free to add to this book - by attaching a patch to an issue in the HBase JIRA. - - - - Getting Started -
- Introduction - - Quick Start will get you up and running - on a single-node instance of HBase using the local filesystem. - The Not-so-quick Start Guide - describes setup of HBase in distributed mode running on top of HDFS. - -
- -
- Quick Start - - This guide describes setup of a standalone HBase - instance that uses the local filesystem. It leads you - through creating a table, inserting rows via the - HBase Shell, and then cleaning up and shutting - down your standalone HBase instance. - The below exercise should take no more than - ten minutes (not including download time). - - -
- Download and unpack the latest stable release.

Choose a download site from this list of Apache Download Mirrors. Click on the suggested top link. This will take you to a mirror of HBase Releases. Click on the folder named stable and then download the file that ends in .tar.gz to your local filesystem; e.g. hbase-.tar.gz.

Decompress and untar your download and then change into the unpacked directory.

$ tar xfz hbase-.tar.gz
$ cd hbase-

At this point, you are ready to start HBase. But before starting it, you might want to edit conf/hbase-site.xml and set the directory you want HBase to write to, hbase.rootdir.

<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
  <property>
    <name>hbase.rootdir</name>
    <value>file:///DIRECTORY/hbase</value>
  </property>
</configuration>

Replace DIRECTORY in the above with a path to a directory where you want HBase to store its data. By default, hbase.rootdir is set to /tmp/hbase-${user.name}, which means you'll lose all your data whenever your server reboots (most operating systems clear /tmp on restart).
-
-Start HBase

Now start HBase:

$ ./bin/start-hbase.sh
starting Master, logging to logs/hbase-user-master-example.org.out

You should now have a running standalone HBase instance. In standalone mode, HBase runs all daemons in the one JVM; i.e. both the HBase and ZooKeeper daemons. HBase logs can be found in the logs subdirectory. Check them out especially if HBase had trouble starting.

Is <application>java</application> installed?

All of the above presumes a 1.6 version of Oracle java is installed on your machine and available on your path; i.e. when you type java, you see output that describes the options the java program takes (HBase requires java 6). If this is not the case, HBase will not start. Install java, edit conf/hbase-env.sh, uncommenting the JAVA_HOME line and pointing it at your java install. Then, retry the steps above.
- - -
- Shell Exercises

Connect to your running HBase via the HBase Shell.

$ ./bin/hbase shell
HBase Shell; enter 'help<RETURN>' for list of supported commands.
Type "exit<RETURN>" to leave the HBase Shell
Version: 0.89.20100924, r1001068, Fri Sep 24 13:55:42 PDT 2010

hbase(main):001:0>

Type help and then <RETURN> to see a listing of shell commands and options. Browse at least the paragraphs at the end of the help emission for the gist of how variables and command arguments are entered into the HBase shell; in particular note how table names, rows, and columns, etc., must be quoted.

Create a table named test with a single column family named cf. Verify its creation by listing all tables and then insert some values.

hbase(main):003:0> create 'test', 'cf'
0 row(s) in 1.2200 seconds
hbase(main):003:0> list
test
1 row(s) in 0.0550 seconds
hbase(main):004:0> put 'test', 'row1', 'cf:a', 'value1'
0 row(s) in 0.0560 seconds
hbase(main):005:0> put 'test', 'row2', 'cf:b', 'value2'
0 row(s) in 0.0370 seconds
hbase(main):006:0> put 'test', 'row3', 'cf:c', 'value3'
0 row(s) in 0.0450 seconds

Above we inserted 3 values, one at a time. The first insert is at row1, column cf:a, with a value of value1. Columns in HBase are made up of a column family prefix -- cf in this example -- followed by a colon and then a column qualifier suffix (a in this case).

Verify the data insert.

Run a scan of the table by doing the following

hbase(main):007:0> scan 'test'
ROW COLUMN+CELL
row1 column=cf:a, timestamp=1288380727188, value=value1
row2 column=cf:b, timestamp=1288380738440, value=value2
row3 column=cf:c, timestamp=1288380747365, value=value3
3 row(s) in 0.0590 seconds

Get a single row as follows

hbase(main):008:0> get 'test', 'row1'
COLUMN CELL
cf:a timestamp=1288380727188, value=value1
1 row(s) in 0.0400 seconds

Now, disable and drop your table. This will clean up all the work done above.

hbase(main):012:0> disable 'test'
0 row(s) in 1.0930 seconds
hbase(main):013:0> drop 'test'
0 row(s) in 0.0770 seconds

Exit the shell by typing exit.

hbase(main):014:0> exit
- -
- Stopping HBase

Stop your HBase instance by running the stop script.

$ ./bin/stop-hbase.sh
stopping hbase...............
- -
Where to go next

The standalone setup described above is good for testing and experimentation only. Move on to the next section, the Not-so-quick Start Guide, where we'll go into depth on the different HBase run modes, requirements, and the critical configurations needed to set up a distributed HBase deploy.
-
- -
- Not-so-quick Start Guide - -
Requirements - HBase has the following requirements. Please read the - section below carefully and ensure that all requirements have been - satisfied. Failure to do so will cause you (and us) grief debugging - strange errors and/or data loss. - - -
java

Just like Hadoop, HBase requires java 6 from Oracle. Usually you'll want to use the latest version available, except for the problematic u18 (u22 is the latest version as of this writing).
- -
<link xlink:href="http://hadoop.apache.org">hadoop</link><indexterm><primary>Hadoop</primary></indexterm>

This version of HBase will only run on Hadoop 0.20.x. It will not run on hadoop 0.21.x (nor 0.22.x) as of this writing. HBase will lose data unless it is running on an HDFS that has a durable sync. Currently only the branch-0.20-append branch has this attribute (see CHANGES.txt in branch-0.20-append for the list of patches involved in adding append on the Hadoop 0.20 branch). No official releases have been made from this branch up to now, so you will have to build your own Hadoop from the tip of this branch. Scroll down in the Hadoop How To Release to the section Build Requirements for instructions on how to build Hadoop.

Or, rather than build your own, you could use Cloudera's CDH3. CDH has the 0.20-append patches needed to add a durable sync (CDH3 is still in beta; either CDH3b2 or CDH3b3 will suffice).

Because HBase depends on Hadoop, it bundles an instance of the Hadoop jar under its lib directory. The bundled Hadoop was made from the Apache branch-0.20-append branch at the time of this HBase's release. It is critical that the version of Hadoop out on your cluster matches the version bundled with HBase. Replace the hadoop jar found in the HBase lib directory with the hadoop jar you are running out on your cluster to avoid version mismatch issues. Make sure you replace the jar all over your cluster. For example, versions of CDH do not have HDFS-724 whereas Hadoop's branch-0.20-append branch does have HDFS-724. This patch changes the RPC version because the protocol changed. Version mismatch issues have various manifestations, but often everything just looks hung up.

Can I just replace the jar in the Hadoop 0.20.2 tarball with the <emphasis>sync</emphasis>-supporting Hadoop jar found in HBase?

You could do this. It works, going by a recent posting on the mailing list.

Hadoop Security

HBase will run on any Hadoop 0.20.x that incorporates Hadoop security features -- e.g. Y! 0.20S or CDH3B3 -- as long as you do as suggested above and replace the Hadoop jar that ships with HBase with the secure version.
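A minimal sketch of the jar swap described above, run on every node of the cluster (the paths and the jar name are assumptions; your own Hadoop build will produce a differently named jar):

$ rm ${HBASE_HOME}/lib/hadoop-*.jar                        # drop the jar bundled with HBase
$ cp ${HADOOP_HOME}/hadoop-core-*.jar ${HBASE_HOME}/lib/   # copy in the cluster's own jar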
-
ssh -ssh must be installed and sshd must -be running to use Hadoop's scripts to manage remote Hadoop and HBase daemons. - You must be able to ssh to all nodes, including your local node, using passwordless login (Google "ssh passwordless login"). - -
-
DNS

HBase uses the local hostname to self-report its IP address. Both forward and reverse DNS resolving should work. If your machine has multiple interfaces, HBase will use the interface that the primary hostname resolves to. If this is insufficient, you can set hbase.regionserver.dns.interface to indicate the primary interface. This only works if your cluster configuration is consistent and every host has the same network interface configuration. Another alternative is setting hbase.regionserver.dns.nameserver to choose a different nameserver than the system-wide default.
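For example, to pin HBase to a particular interface, a property such as the following could go in hbase-site.xml (the interface name eth0 is an assumption; use whatever interface your hosts actually carry):

<property>
  <name>hbase.regionserver.dns.interface</name>
  <value>eth0</value>
</property>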
-
NTP

The clocks on cluster members should be in basic alignment. Some skew is tolerable, but wild skew could generate odd behaviors. Run NTP on your cluster, or an equivalent.

If you are having problems querying data, or seeing "weird" cluster operations, check the system time!
- - -
- <varname>ulimit</varname><indexterm><primary>ulimit</primary></indexterm>

HBase is a database; it uses a lot of files at the same time. The default ulimit -n of 1024 on *nix systems is insufficient. Any significant amount of loading will lead you to FAQ: Why do I see "java.io.IOException...(Too many open files)" in my logs?. You may also notice errors such as

2010-04-06 03:04:37,542 INFO org.apache.hadoop.hdfs.DFSClient: Exception in createBlockOutputStream java.io.EOFException
2010-04-06 03:04:37,542 INFO org.apache.hadoop.hdfs.DFSClient: Abandoning block blk_-6935524980745310745_1391901

Do yourself a favor and change the upper bound on the number of file descriptors. Set it to north of 10k. See the above referenced FAQ for how. To be clear, upping the file descriptors for the user who is running the HBase process is an operating system configuration, not an HBase configuration. Also, a common mistake is that administrators will up the file descriptors for a particular user but, for whatever reason, HBase will be running as someone else. HBase prints the ulimit it is seeing as the first line in its logs. Ensure it is correct.

A useful read on setting configuration for your Hadoop cluster is Aaron Kimball's Configuration Parameters: What can you just ignore?
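A quick sanity check, before and after restarting HBase (a sketch only; the exact wording of the startup log line varies by version):

$ ulimit -n                                  # the limit for the user that will run HBase
$ grep -i ulimit logs/hbase-*.log | head -1  # the ulimit HBase reported at startup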
- <varname>ulimit</varname> on Ubuntu - - If you are on Ubuntu you will need to make the following changes: - - In the file /etc/security/limits.conf add a line like: - hadoop - nofile 32768 - Replace hadoop - with whatever user is running Hadoop and HBase. If you have - separate users, you will need 2 entries, one for each user. - - - In the file /etc/pam.d/common-session add as the last line in the file: - session required pam_limits.so - Otherwise the changes in /etc/security/limits.conf won't be applied. - - - Don't forget to log out and back in again for the changes to take effect! - -
-
- -
- <varname>dfs.datanode.max.xcievers</varname><indexterm><primary>xcievers</primary></indexterm>

A Hadoop HDFS datanode has an upper bound on the number of files that it will serve at any one time. The upper bound parameter is called xcievers (yes, this is misspelled). Again, before doing any loading, make sure you have configured Hadoop's conf/hdfs-site.xml, setting the xceivers value to at least the following:

<property>
  <name>dfs.datanode.max.xcievers</name>
  <value>4096</value>
</property>

Be sure to restart your HDFS after making the above configuration.

Not having this configuration in place makes for strange-looking failures. Eventually you'll see a complaint in the datanode logs about the xcievers limit being exceeded, but on the run-up to this, one manifestation is complaints about missing blocks. For example:

10/12/08 20:10:31 INFO hdfs.DFSClient: Could not obtain block blk_XXXXXXXXXXXXXXXXXXXXXX_YYYYYYYY from any node: java.io.IOException: No live nodes contain current block. Will get new block locations from namenode and retry...
- -
-Windows

HBase has had little testing running on Windows. Running a production install of HBase on top of Windows is not recommended.

If you are running HBase on Windows, you must install Cygwin to have a *nix-like environment for the shell scripts. The full details are explained in the Windows Installation guide.
- -
- -
HBase run modes: Standalone and Distributed - HBase has two run modes: standalone - and distributed. - Out of the box, HBase runs in standalone mode. To set up a - distributed deploy, you will need to configure HBase by editing - files in the HBase conf directory. - -Whatever your mode, you will need to edit conf/hbase-env.sh -to tell HBase which java to use. In this file -you set HBase environment variables such as the heapsize and other options -for the JVM, the preferred location for log files, etc. -Set JAVA_HOME to point at the root of your -java install. - -
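For example, the relevant lines in conf/hbase-env.sh might look like the following (the path is an assumption; point it at wherever your java actually lives):

# The java implementation to use.
export JAVA_HOME=/usr/lib/jvm/java-6-sun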
Standalone HBase - This is the default mode. Standalone mode is - what is described in the quickstart - section. In standalone mode, HBase does not use HDFS -- it uses the local - filesystem instead -- and it runs all HBase daemons and a local zookeeper - all up in the same JVM. Zookeeper binds to a well known port so clients may - talk to HBase. - -
-
Distributed

Distributed mode can be subdivided into distributed but all daemons run on a single node -- a.k.a. pseudo-distributed -- and fully-distributed, where the daemons are spread across all nodes in the cluster (the pseudo-distributed vs. fully-distributed nomenclature comes from Hadoop).

Distributed modes require an instance of the Hadoop Distributed File System (HDFS). See the Hadoop requirements and instructions for how to set up an HDFS. Before proceeding, ensure you have an appropriate, working HDFS.

Below we describe the different distributed setups. Starting, verification and exploration of your install, whether a pseudo-distributed or fully-distributed configuration, is described in a section that follows, Running and Confirming your Installation. The same verification script applies to both deploy types.
Pseudo-distributed

A pseudo-distributed mode is simply a distributed mode run on a single host. Use this configuration for testing and prototyping on HBase. Do not use this configuration for production nor for evaluating HBase performance.

Once you have confirmed your HDFS setup, edit conf/hbase-site.xml. This is the file into which you add local customizations and overrides for Default HBase Configurations and HDFS Client Configurations. Point HBase at the running Hadoop HDFS instance by setting the hbase.rootdir property. This property points HBase at the Hadoop filesystem instance to use. For example, adding the properties below to your hbase-site.xml says that HBase should use the /hbase directory in the HDFS whose namenode is at port 9000 on your local machine, and that it should run with one replica only (recommended for pseudo-distributed mode):

<configuration>
  ...
  <property>
    <name>hbase.rootdir</name>
    <value>hdfs://localhost:9000/hbase</value>
    <description>The directory shared by region servers.
    </description>
  </property>
  <property>
    <name>dfs.replication</name>
    <value>1</value>
    <description>The replication count for HLog & HFile storage. Should not be greater than HDFS datanode count.
    </description>
  </property>
  ...
</configuration>

Let HBase create the hbase.rootdir directory. If you don't, you'll get a warning saying HBase needs a migration run because the directory is missing files expected by HBase (it'll create them if you let it).

Above we bind to localhost. This means that a remote client cannot connect. Amend accordingly, if you want to connect from a remote location.

Now skip to Running and Confirming your Installation for how to start and verify your pseudo-distributed install.

See Pseudo-distributed mode extras for notes on how to start extra Masters and regionservers when running pseudo-distributed.
- -
Fully-distributed

For running a fully-distributed operation on more than one host, make the following configurations. In hbase-site.xml, add the property hbase.cluster.distributed and set it to true, and point the HBase hbase.rootdir at the appropriate HDFS NameNode and location in HDFS where you would like HBase to write data. For example, if your namenode were running at namenode.example.org on port 9000 and you wanted to home your HBase in HDFS at /hbase, make the following configuration.

<configuration>
  ...
  <property>
    <name>hbase.rootdir</name>
    <value>hdfs://namenode.example.org:9000/hbase</value>
    <description>The directory shared by region servers.
    </description>
  </property>
  <property>
    <name>hbase.cluster.distributed</name>
    <value>true</value>
    <description>The mode the cluster will be in. Possible values are
      false: standalone and pseudo-distributed setups with managed Zookeeper
      true: fully-distributed with unmanaged Zookeeper Quorum (see hbase-env.sh)
    </description>
  </property>
  ...
</configuration>
<filename>regionservers</filename>

In addition, fully-distributed mode requires that you modify conf/regionservers. The regionservers file lists all hosts that you would have running HRegionServers, one host per line (this file in HBase is like the Hadoop slaves file). All servers listed in this file will be started and stopped when the HBase cluster is started or stopped.
- -
ZooKeeper<indexterm><primary>ZooKeeper</primary></indexterm>

A distributed HBase depends on a running ZooKeeper cluster. All participating nodes and clients need to be able to access the running ZooKeeper ensemble. HBase by default manages a ZooKeeper "cluster" for you. It will start and stop the ZooKeeper ensemble as part of the HBase start/stop process. You can also manage the ZooKeeper ensemble independent of HBase and just point HBase at the cluster it should use. To toggle HBase management of ZooKeeper, use the HBASE_MANAGES_ZK variable in conf/hbase-env.sh. This variable, which defaults to true, tells HBase whether to start/stop the ZooKeeper ensemble servers as part of HBase start/stop.

When HBase manages the ZooKeeper ensemble, you can specify ZooKeeper configuration using its native zoo.cfg file, or, the easier option, just specify ZooKeeper options directly in conf/hbase-site.xml. A ZooKeeper configuration option can be set as a property in the HBase hbase-site.xml XML configuration file by prefacing the ZooKeeper option name with hbase.zookeeper.property. For example, the clientPort setting in ZooKeeper can be changed by setting the hbase.zookeeper.property.clientPort property. For all default values used by HBase, including ZooKeeper configuration, see the section Default HBase Configurations; look for the hbase.zookeeper.property prefix. For the full list of ZooKeeper configurations, see ZooKeeper's zoo.cfg. HBase does not ship with a zoo.cfg, so you will need to browse the conf directory in an appropriate ZooKeeper download.

You must at least list the ensemble servers in hbase-site.xml using the hbase.zookeeper.quorum property. This property defaults to a single ensemble member at localhost, which is not suitable for a fully distributed HBase. (It binds to the local machine only and remote clients will not be able to connect.)

How many ZooKeepers should I run?

You can run a ZooKeeper ensemble that comprises 1 node only, but in production it is recommended that you run a ZooKeeper ensemble of 3, 5 or 7 machines; the more members an ensemble has, the more tolerant the ensemble is of host failures. Also, run an odd number of machines: since a quorum is a majority of the ensemble, an even member count buys you nothing (a four-member ensemble tolerates no more failed members than a three-member one). Give each ZooKeeper server around 1GB of RAM, and if possible, its own dedicated disk (a dedicated disk is the best thing you can do to ensure a performant ZooKeeper ensemble). For very heavily loaded clusters, run ZooKeeper servers on separate machines from RegionServers (DataNodes and TaskTrackers).

For example, to have HBase manage a ZooKeeper quorum on nodes rs{1,2,3,4,5}.example.com, bound to port 2222 (the default is 2181), ensure HBASE_MANAGES_ZK is commented out or set to true in conf/hbase-env.sh and then edit conf/hbase-site.xml and set hbase.zookeeper.property.clientPort and hbase.zookeeper.quorum. You should also set hbase.zookeeper.property.dataDir to other than the default, as the default has ZooKeeper persist data under /tmp, which is often cleared on system restart. In the example below we have ZooKeeper persist to /usr/local/zookeeper.

<configuration>
  ...
  <property>
    <name>hbase.zookeeper.property.clientPort</name>
    <value>2222</value>
    <description>Property from ZooKeeper's config zoo.cfg.
      The port at which the clients will connect.
    </description>
  </property>
  <property>
    <name>hbase.zookeeper.quorum</name>
    <value>rs1.example.com,rs2.example.com,rs3.example.com,rs4.example.com,rs5.example.com</value>
    <description>Comma separated list of servers in the ZooKeeper Quorum.
      For example, "host1.mydomain.com,host2.mydomain.com,host3.mydomain.com".
      By default this is set to localhost for local and pseudo-distributed modes
      of operation. For a fully-distributed setup, this should be set to a full
      list of ZooKeeper quorum servers. If HBASE_MANAGES_ZK is set in hbase-env.sh
      this is the list of servers which we will start/stop ZooKeeper on.
    </description>
  </property>
  <property>
    <name>hbase.zookeeper.property.dataDir</name>
    <value>/usr/local/zookeeper</value>
    <description>Property from ZooKeeper's config zoo.cfg.
      The directory where the snapshot is stored.
    </description>
  </property>
  ...
</configuration>
Using existing ZooKeeper ensemble

To point HBase at an existing ZooKeeper cluster, one that is not managed by HBase, set HBASE_MANAGES_ZK in conf/hbase-env.sh to false

...
# Tell HBase whether it should manage its own instance of Zookeeper or not.
export HBASE_MANAGES_ZK=false

Next set ensemble locations and client port, if non-standard, in hbase-site.xml, or add a suitably configured zoo.cfg to HBase's CLASSPATH. HBase will prefer the configuration found in zoo.cfg over any settings in hbase-site.xml.

When HBase manages ZooKeeper, it will start/stop the ZooKeeper servers as a part of the regular start/stop scripts. If you would like to run ZooKeeper yourself, independent of HBase start/stop, you would do the following

${HBASE_HOME}/bin/hbase-daemons.sh {start,stop} zookeeper

Note that you can use HBase in this manner to spin up a ZooKeeper cluster, unrelated to HBase. Just make sure to set HBASE_MANAGES_ZK to false if you want it to stay up across HBase restarts so that when HBase shuts down, it doesn't take ZooKeeper down with it.

For more information about running a distinct ZooKeeper cluster, see the ZooKeeper Getting Started Guide.
-
- -
-HDFS Client Configuration

Of note, if you have made HDFS client configuration changes on your Hadoop cluster -- i.e. configuration you want HDFS clients to use as opposed to server-side configurations -- HBase will not see this configuration unless you do one of the following:

- Add a pointer to your HADOOP_CONF_DIR to the HBASE_CLASSPATH environment variable in hbase-env.sh.
- Add a copy of hdfs-site.xml (or hadoop-site.xml) or, better, symlinks, under ${HBASE_HOME}/conf, or
- if only a small set of HDFS client configurations, add them to hbase-site.xml.

An example of such an HDFS client configuration is dfs.replication. If, for example, you want to run with a replication factor of 5, HBase will create files with the default of 3 unless you do the above to make the configuration available to HBase.
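Continuing the example, the property below -- made visible to HBase via any of the three routes above -- would have HBase create files with five replicas:

<property>
  <name>dfs.replication</name>
  <value>5</value>
</property>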
-
-
- -
Running and Confirming Your Installation

Make sure HDFS is running first. Start and stop the Hadoop HDFS daemons by running bin/start-dfs.sh over in the HADOOP_HOME directory. You can ensure it started properly by testing the put and get of files into the Hadoop filesystem. HBase does not normally use the mapreduce daemons. These do not need to be started.

If you are managing your own ZooKeeper, start it and confirm it's running; otherwise, HBase will start up ZooKeeper for you as part of its start process.

Start HBase with the following command:

bin/start-hbase.sh

Run the above from the HBASE_HOME directory.

You should now have a running HBase instance. HBase logs can be found in the logs subdirectory. Check them out especially if HBase had trouble starting.

HBase also puts up a UI listing vital attributes. By default it's deployed on the Master host at port 60010 (HBase RegionServers listen on port 60020 by default and put up an informational http server at 60030). If the Master were running on a host named master.example.org on the default port, to see the Master's homepage you'd point your browser at http://master.example.org:60010.

Once HBase has started, see the Shell Exercises section for how to create tables, add data, scan your insertions, and finally disable and drop your tables.

To stop HBase after exiting the HBase shell, enter

$ ./bin/stop-hbase.sh
stopping hbase...............

Shutdown can take a moment to complete. It can take longer if your cluster is comprised of many machines. If you are running a distributed operation, be sure to wait until HBase has shut down completely before stopping the Hadoop daemons.
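A quick way to exercise the HDFS put and get mentioned at the top of this section (the file and path names here are arbitrary choices, not required ones):

$ ${HADOOP_HOME}/bin/hadoop fs -put ${HADOOP_HOME}/conf/core-site.xml /smoke-test
$ ${HADOOP_HOME}/bin/hadoop fs -cat /smoke-test
$ ${HADOOP_HOME}/bin/hadoop fs -rm /smoke-test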
-
- - - - - - -
Example Configurations -
Basic Distributed HBase Install - Here is an example basic configuration for a distributed ten node cluster. - The nodes are named example0, example1, etc., through -node example9 in this example. The HBase Master and the HDFS namenode -are running on the node example0. RegionServers run on nodes -example1-example9. -A 3-node ZooKeeper ensemble runs on example1, -example2, and example3 on the -default ports. ZooKeeper data is persisted to the directory -/export/zookeeper. -Below we show what the main configuration files --- hbase-site.xml, regionservers, and -hbase-env.sh -- found in the HBase -conf directory might look like. - -
<filename>hbase-site.xml</filename>

<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
  <property>
    <name>hbase.zookeeper.quorum</name>
    <value>example1,example2,example3</value>
    <description>Comma separated list of servers in the ZooKeeper Quorum.
    </description>
  </property>
  <property>
    <name>hbase.zookeeper.property.dataDir</name>
    <value>/export/zookeeper</value>
    <description>Property from ZooKeeper's config zoo.cfg.
      The directory where the snapshot is stored.
    </description>
  </property>
  <property>
    <name>hbase.rootdir</name>
    <value>hdfs://example0:9000/hbase</value>
    <description>The directory shared by region servers.
    </description>
  </property>
  <property>
    <name>hbase.cluster.distributed</name>
    <value>true</value>
    <description>The mode the cluster will be in. Possible values are
      false: standalone and pseudo-distributed setups with managed Zookeeper
      true: fully-distributed with unmanaged Zookeeper Quorum (see hbase-env.sh)
    </description>
  </property>
</configuration>
- -
<filename>regionservers</filename>

In this file you list the nodes that will run regionservers. In our case we run regionservers on all but the head node example0, which is carrying the HBase Master and the HDFS namenode.

example1
example2
example3
example4
example5
example6
example7
example8
example9
- -
<filename>hbase-env.sh</filename> - Below we use a diff to show the differences from - default in the hbase-env.sh file. Here we are setting -the HBase heap to be 4G instead of the default 1G. - - - - - - Use rsync to copy the content of - the conf directory to - all nodes of the cluster. - -
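The diff itself was elided above; a reconstruction of what it likely showed, assuming the stock hbase-env.sh in which the heap setting ships commented out at its 1000 MB default:

--- hbase-env.sh.orig
+++ hbase-env.sh
-# export HBASE_HEAPSIZE=1000
+export HBASE_HEAPSIZE=4096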
- -
- -
-
- -
- - - Upgrading - - Review the requirements - section above, in particular the section on Hadoop version. - -
- Upgrading to HBase 0.90.x from 0.20.x or 0.89.x

This version of 0.90.x HBase can be started on data written by HBase 0.20.x or HBase 0.89.x. There is no need for a migration step. HBase 0.89.x and 0.90.x do write out the names of region directories differently -- they name them with an md5 hash of the region name rather than a jenkins hash -- which means that once started, there is no going back to HBase 0.20.x.

Be sure to remove the hbase-default.xml from your conf directory on upgrade. A 0.20.x version of this file will have sub-optimal configurations for 0.90.x HBase. The hbase-default.xml file is now bundled into the HBase jar and read from there. If you would like to review the content of this file, see it in the src tree at src/main/resources/hbase-default.xml or see Default HBase Configurations.

Finally, if upgrading from 0.20.x, check your .META. schema in the shell. In the past we would recommend that users run with a 16kb MEMSTORE_FLUSHSIZE. Run hbase> scan '-ROOT-' in the shell. This will output the current .META. schema. Check the MEMSTORE_FLUSHSIZE size. Is it 16kb (16384)? If so, you will need to change this (the 'normal'/default value is 64MB (67108864)). Run the script bin/set_meta_memstore_size.rb. This will make the necessary edit to your .META. schema. Failure to run this change will make for a slow cluster (see HBASE-3499 Users upgrading to 0.90.0 need to have their .META. table updated with the right MEMSTORE_SIZE).
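A sketch of the check and fix just described; the jruby invocation follows the Scripting section's convention for running bundled .rb files and is our assumption, not a documented command line:

hbase> scan '-ROOT-'        # inspect MEMSTORE_FLUSHSIZE in the .META. schema this prints
$ ./bin/hbase org.jruby.Main bin/set_meta_memstore_size.rb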
-
- Configuration

HBase uses the same configuration system as Hadoop. To configure a deploy, edit a file of environment variables in conf/hbase-env.sh -- this configuration is used mostly by the launcher shell scripts getting the cluster off the ground -- and then add configuration to an XML file to do things like override HBase defaults, tell HBase what Filesystem to use, and the location of the ZooKeeper ensemble.

Be careful editing XML. Make sure you close all elements. Run your file through xmllint or similar to ensure well-formedness of your document after an edit session.

When running in distributed mode, after you make an edit to an HBase configuration, make sure you copy the content of the conf directory to all nodes of the cluster. HBase will not do this for you. Use rsync.
- <filename>hbase-site.xml</filename> and <filename>hbase-default.xml</filename>

Just as in Hadoop where you add site-specific HDFS configuration to the hdfs-site.xml file, for HBase, site-specific customizations go into the file conf/hbase-site.xml. For the list of configurable properties, see Default HBase Configurations below, or view the raw hbase-default.xml source file in the HBase source code at src/main/resources.

Not all configuration options make it out to hbase-default.xml. Configuration options that are rarely, if ever, changed may exist only in code; the only way to turn up such configurations is by reading the source code itself.

Changes here will require a cluster restart for HBase to notice the change.
- -
- <filename>hbase-env.sh</filename>

Set HBase environment variables in this file. Examples include options to pass the JVM on start of an HBase daemon, such as heap size and garbage collector configs. You also set, via this file, configuration for log directories, niceness, ssh options, where to locate process pid files, etc. Open the file at conf/hbase-env.sh and peruse its content. Each option is fairly well documented. Add your own environment variables here if you want them read by HBase daemon startup.

Changes here will require a cluster restart for HBase to notice the change.
- -
- <filename>log4j.properties</filename>

Edit this file to change the rate at which HBase log files are rolled and to change the level at which HBase logs messages.

Changes here will require a cluster restart for HBase to notice the change, though log levels can be changed for particular daemons via the HBase UI.
- -
- The Important Configurations - Below we list the important Configurations. We've divided this section into - required configuration and worth-a-look recommended configs. - - - -
Required Configurations

See the Requirements section. It lists at least two required configurations needed to run HBase bearing load: the file descriptors ulimit and dfs.datanode.max.xcievers.
- -
Recommended Configurations
<varname>zookeeper.session.timeout</varname>

The default timeout is three minutes (specified in milliseconds). This means that if a server crashes, it will be three minutes before the Master notices the crash and starts recovery. You might like to tune the timeout down to a minute, or even less, so the Master notices failures sooner. Before changing this value, be sure you have your JVM garbage collection configuration under control; otherwise, a long garbage collection that lasts beyond the ZooKeeper session timeout will take out your RegionServer (you might be fine with this -- you probably want recovery to start on the server if a RegionServer has been in GC for a long period of time).

To change this configuration, edit hbase-site.xml, copy the changed file around the cluster and restart.

We set this value high to save our having to field noob questions up on the mailing lists asking why a RegionServer went down during a massive import. The usual cause is that their JVM is untuned and they are running into long GC pauses. Our thinking is that while users are getting familiar with HBase, we'd save them having to know all of its intricacies. Later, when they've built some confidence, they can play with configuration such as this.
-
<varname>hbase.regionserver.handler.count</varname> - - This setting defines the number of threads that are kept open to answer - incoming requests to user tables. The default of 10 is rather low in order to - prevent users from killing their region servers when using large write buffers - with a high number of concurrent clients. The rule of thumb is to keep this - number low when the payload per request approaches the MB (big puts, scans using - a large cache) and high when the payload is small (gets, small puts, ICVs, deletes). - - - It is safe to set that number to the - maximum number of incoming clients if their payload is small, the typical example - being a cluster that serves a website since puts aren't typically buffered - and most of the operations are gets. - - - The reason why it is dangerous to keep this setting high is that the aggregate - size of all the puts that are currently happening in a region server may impose - too much pressure on its memory, or even trigger an OutOfMemoryError. A region server - running on low memory will trigger its JVM's garbage collector to run more frequently - up to a point where GC pauses become noticeable (the reason being that all the memory - used to keep all the requests' payloads cannot be trashed, no matter how hard the - garbage collector tries). After some time, the overall cluster - throughput is affected since every request that hits that region server will take longer, - which exacerbates the problem even more. - -
-
- Configuration for large memory machines

HBase ships with a reasonable, conservative configuration that will work on nearly all machine types that people might want to test with. If you have larger machines -- say, an 8G or larger heap for HBase -- you might find the following configuration options helpful. TODO.
- -
- LZO compression<indexterm><primary>LZO</primary></indexterm>

You should consider enabling LZO compression. It's near-frictionless and in most all cases boosts performance.

Unfortunately, HBase cannot ship with LZO because of licensing issues; HBase is Apache-licensed, LZO is GPL. Therefore LZO install is to be done post-HBase install. See the Using LZO Compression wiki page for how to make LZO work with HBase.

A common problem users run into when using LZO is that while initial setup of the cluster runs smoothly, a month goes by and some sysadmin goes to add a machine to the cluster, only they'll have forgotten to do the LZO fixup on the new machine. In versions since HBase 0.90.0, we should fail in a way that makes it plain what the problem is, but maybe not. Remember you read this paragraph; see hbase.regionserver.codecs for a feature to help protect against failed LZO install.

See also the Compression Appendix at the tail of this book.
-
- Bigger Regions

Consider going to larger regions to cut down on the total number of regions on your cluster. Generally, fewer Regions to manage makes for a smoother-running cluster (you can always manually split the big Regions later should one prove hot and you want to spread the request load over the cluster). By default, regions are 256MB in size. You could run with 1G. Some run with even larger regions; 4G or even larger. Adjust hbase.hregion.max.filesize in your hbase-site.xml.
-
- Managed Splitting

Rather than let HBase auto-split your Regions, manage the splitting manually (what follows is taken from the javadoc at the head of the org.apache.hadoop.hbase.util.RegionSplitter tool added to HBase post-0.90.0 release). With growing amounts of data, splits will continually be needed. Since you always know exactly what regions you have, long-term debugging and profiling is much easier with manual splits. It is hard to trace the logs to understand region level problems if it keeps splitting and getting renamed. Data offlining bugs + unknown number of split regions == oh crap! If an HLog or StoreFile was mistakenly unprocessed by HBase due to a weird bug and you notice it a day or so later, you can be assured that the regions specified in these files are the same as the current regions, and you have fewer headaches trying to restore/replay your data. You can finely tune your compaction algorithm. With roughly uniform data growth, it's easy to cause split / compaction storms as the regions all roughly hit the same data size at the same time. With manual splits, you can let staggered, time-based major compactions spread out your network IO load.

How do I turn off automatic splitting? Automatic splitting is determined by the configuration value hbase.hregion.max.filesize. It is not recommended that you set this to Long.MAX_VALUE in case you forget about manual splits. A suggested setting is 100GB, which would result in > 1hr major compactions if reached.

What's the optimal number of pre-split regions to create? Mileage will vary depending upon your application. You could start low with 10 pre-split regions / server and watch as data grows over time. It's better to err on the side of too few regions and do a rolling split later. A more complicated answer is that this depends upon the largest storefile in your region. With a growing data size, this will get larger over time. You want the largest region to be just big enough that the Store compact selection algorithm only compacts it due to a timed major. If you don't, your cluster can be prone to compaction storms as the algorithm decides to run major compactions on a large series of regions all at once. Note that compaction storms are due to the uniform data growth, not the manual split decision.

If you pre-split your regions too thin, you can increase the major compaction interval by configuring HConstants.MAJOR_COMPACTION_PERIOD. If your data size grows too large, use the (post-0.90.0 HBase) org.apache.hadoop.hbase.util.RegionSplitter script to perform a network-IO-safe rolling split of all regions.
- -
- -
-
Client configuration and dependencies connecting to an HBase cluster

Since the HBase Master may move around, clients bootstrap by looking in ZooKeeper. Thus clients require the ZooKeeper quorum information in an hbase-site.xml that is on their CLASSPATH. If you are configuring an IDE to run an HBase client, you should include the conf/ directory on your classpath.

Minimally, a client of HBase needs the hbase, hadoop, log4j, commons-logging, and zookeeper jars in its CLASSPATH when connecting to a cluster.

An example basic hbase-site.xml for client-only use might look as follows:

<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
  <property>
    <name>hbase.zookeeper.quorum</name>
    <value>example1,example2,example3</value>
    <description>Comma separated list of servers in the ZooKeeper Quorum.
    </description>
  </property>
</configuration>
- -
- - - The HBase Shell - - - The HBase Shell is (J)Ruby's - IRB with some HBase particular verbs added. Anything you can do in - IRB, you should be able to do in the HBase Shell. - To run the HBase shell, - do as follows: - $ ./bin/hbase shell - - Type help and then <RETURN> - to see a listing of shell - commands and options. Browse at least the paragraphs at the end of - the help emission for the gist of how variables and command - arguments are entered into the - HBase shell; in particular note how table names, rows, and - columns, etc., must be quoted. - See Shell Exercises - for example basic shell operation. - -
Scripting - For examples scripting HBase, look in the - HBase bin directory. Look at the files - that end in *.rb. To run one of these - files, do as follows: - $ ./bin/hbase org.jruby.Main PATH_TO_SCRIPT - -
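As a sketch of what such a script can look like -- this particular script is our invention, not one of the bundled bin/*.rb files -- the following lists the cluster's tables via the client API:

# list_tables.rb -- a hypothetical example. Run it with:
#   $ ./bin/hbase org.jruby.Main list_tables.rb
include Java
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.HBaseAdmin

conf = HBaseConfiguration.create   # reads hbase-site.xml from the CLASSPATH
admin = HBaseAdmin.new(conf)
admin.listTables.each { |t| puts t.getNameAsString }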
- -
Shell Tricks -
<filename>irbrc</filename>

Create an .irbrc file for yourself in your home directory. Add customizations. A useful one is command history, so commands are saved across Shell invocations:

$ more .irbrc
require 'irb/ext/save-history'
IRB.conf[:SAVE_HISTORY] = 100
IRB.conf[:HISTORY_FILE] = "#{ENV['HOME']}/.irb-save-history"

See the ruby documentation of .irbrc to learn about other possible configurations.
-
LOG data to timestamp

To convert the date '08/08/16 20:56:29' from an HBase log into a timestamp, do:

hbase(main):021:0> import java.text.SimpleDateFormat
hbase(main):022:0> import java.text.ParsePosition
hbase(main):023:0> SimpleDateFormat.new("yy/MM/dd HH:mm:ss").parse("08/08/16 20:56:29", ParsePosition.new(0)).getTime() => 1218920189000

To go the other direction:

hbase(main):021:0> import java.util.Date
hbase(main):022:0> Date.new(1218920189000).toString() => "Sat Aug 16 20:56:29 UTC 2008"

To output in a format that is exactly like that of the HBase log format will take a little messing with SimpleDateFormat.
-
Debug -
Shell debug switch - You can set a debug switch in the shell to see more output - -- e.g. more of the stack trace on exception -- - when you run a command: - hbase> debug <RETURN> - -
-
DEBUG log level - To enable DEBUG level logging in the shell, - launch it with the -d option. - $ ./bin/hbase shell -d - -
-
-
-
HBase and MapReduce @@ -1898,36 +627,8 @@ Tables in HBase are initially created with one region by default. For bulk impo - - Performance Tuning - Start with the wiki Performance Tuning page. - It has a general discussion of the main factors involved; RAM, compression, JVM settings, etc. - Afterward, come back here for more pointers. - -
- Java -
- The Garbage Collector and HBase
- Long GC pauses

In his presentation, Avoiding Full GCs with MemStore-Local Allocation Buffers, Todd Lipcon describes two cases of stop-the-world garbage collections common in HBase, especially during loading: CMS failure modes and old generation heap fragmentation. To address the first, start the CMS earlier than default by adding -XX:CMSInitiatingOccupancyFraction and setting it down from defaults. Start at 60 or 70 percent (the lower you bring down the threshold, the more GCing is done, and the more CPU used). To address the second, fragmentation, issue, Todd added an experimental facility that must be explicitly enabled in HBase 0.90.x (it's on by default in 0.92.x HBase). Set hbase.hregion.memstore.mslab.enabled to true in your Configuration. See the cited slides for background and detail.
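A sketch of the two remedies together (flag values are illustrative starting points, not tuned recommendations). In conf/hbase-env.sh:

export HBASE_OPTS="$HBASE_OPTS -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70"

And in hbase-site.xml, for HBase 0.90.x:

<property>
  <name>hbase.hregion.memstore.mslab.enabled</name>
  <value>true</value>
</property>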
-
-
-
+ + Bloom Filters diff --git a/src/docbkx/configuration.xml b/src/docbkx/configuration.xml new file mode 100644 index 00000000000..f032712b76f --- /dev/null +++ b/src/docbkx/configuration.xml @@ -0,0 +1,291 @@

+ Configuration

HBase uses the same configuration system as Hadoop. To configure a deploy, edit a file of environment variables in conf/hbase-env.sh -- this configuration is used mostly by the launcher shell scripts getting the cluster off the ground -- and then add configuration to an XML file to do things like override HBase defaults, tell HBase what Filesystem to use, and the location of the ZooKeeper ensemble.

Be careful editing XML. Make sure you close all elements. Run your file through xmllint or similar to ensure well-formedness of your document after an edit session.

When running in distributed mode, after you make an edit to an HBase configuration, make sure you copy the content of the conf directory to all nodes of the cluster. HBase will not do this for you. Use rsync.
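As a concrete check of well-formedness after an edit session (xmllint's --noout flag parses the file and prints nothing unless there is an error):

$ xmllint --noout conf/hbase-site.xml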
+ <filename>hbase-site.xml</filename> and <filename>hbase-default.xml</filename>

Just as in Hadoop where you add site-specific HDFS configuration to the hdfs-site.xml file, for HBase, site-specific customizations go into the file conf/hbase-site.xml. For the list of configurable properties, see Default HBase Configurations below, or view the raw hbase-default.xml source file in the HBase source code at src/main/resources.

Not all configuration options make it out to hbase-default.xml. Configuration options that are rarely, if ever, changed may exist only in code; the only way to turn up such configurations is by reading the source code itself.

Changes here will require a cluster restart for HBase to notice the change.
+ +
+ <filename>hbase-env.sh</filename>

Set HBase environment variables in this file. Examples include options to pass the JVM on start of an HBase daemon, such as heap size and garbage collector configs. You also set, via this file, configuration for log directories, niceness, ssh options, where to locate process pid files, etc. Open the file at conf/hbase-env.sh and peruse its content. Each option is fairly well documented. Add your own environment variables here if you want them read by HBase daemon startup.

Changes here will require a cluster restart for HBase to notice the change.
+ +
+ <filename>log4j.properties</filename>

Edit this file to change the rate at which HBase log files are rolled and to change the level at which HBase logs messages.

Changes here will require a cluster restart for HBase to notice the change, though log levels can be changed for particular daemons via the HBase UI.
+ +
+ The Important Configurations + Below we list the important Configurations. We've divided this section into + required configuration and worth-a-look recommended configs. + + + +
Required Configurations

See the Requirements section. It lists at least two required configurations needed to run HBase bearing load: the file descriptors ulimit and dfs.datanode.max.xcievers.
+ +
Recommended Configurations
<varname>zookeeper.session.timeout</varname>

The default timeout is three minutes (specified in milliseconds). This means that if a server crashes, it will be three minutes before the Master notices the crash and starts recovery. You might like to tune the timeout down to a minute, or even less, so the Master notices failures sooner. Before changing this value, be sure you have your JVM garbage collection configuration under control; otherwise, a long garbage collection that lasts beyond the ZooKeeper session timeout will take out your RegionServer (you might be fine with this -- you probably want recovery to start on the server if a RegionServer has been in GC for a long period of time).

To change this configuration, edit hbase-site.xml, copy the changed file around the cluster and restart.

We set this value high to save our having to field noob questions up on the mailing lists asking why a RegionServer went down during a massive import. The usual cause is that their JVM is untuned and they are running into long GC pauses. Our thinking is that while users are getting familiar with HBase, we'd save them having to know all of its intricacies. Later, when they've built some confidence, they can play with configuration such as this.
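For example, to bring the timeout down to the one minute discussed above, add the following to hbase-site.xml:

<property>
  <name>zookeeper.session.timeout</name>
  <value>60000</value>
</property>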
+
<varname>hbase.regionserver.handler.count</varname> + + This setting defines the number of threads that are kept open to answer + incoming requests to user tables. The default of 10 is rather low in order to + prevent users from killing their region servers when using large write buffers + with a high number of concurrent clients. The rule of thumb is to keep this + number low when the payload per request approaches the MB (big puts, scans using + a large cache) and high when the payload is small (gets, small puts, ICVs, deletes). + + + It is safe to set that number to the + maximum number of incoming clients if their payload is small, the typical example + being a cluster that serves a website since puts aren't typically buffered + and most of the operations are gets. + + + The reason why it is dangerous to keep this setting high is that the aggregate + size of all the puts that are currently happening in a region server may impose + too much pressure on its memory, or even trigger an OutOfMemoryError. A region server + running on low memory will trigger its JVM's garbage collector to run more frequently + up to a point where GC pauses become noticeable (the reason being that all the memory + used to keep all the requests' payloads cannot be trashed, no matter how hard the + garbage collector tries). After some time, the overall cluster + throughput is affected since every request that hits that region server will take longer, + which exacerbates the problem even more. + +
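As a sketch, a cluster serving mostly small gets and puts -- the website case described above -- might raise the count in hbase-site.xml like so (the value 100 is illustrative, not a recommendation):

<property>
  <name>hbase.regionserver.handler.count</name>
  <value>100</value>
</property>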
+
+ Configuration for large memory machines

HBase ships with a reasonable, conservative configuration that will work on nearly all machine types that people might want to test with. If you have larger machines -- say, an 8G or larger heap for HBase -- you might find the following configuration options helpful. TODO.
+ +
+ LZO compression<indexterm><primary>LZO</primary></indexterm>

You should consider enabling LZO compression. It's near-frictionless and in most all cases boosts performance.

Unfortunately, HBase cannot ship with LZO because of licensing issues; HBase is Apache-licensed, LZO is GPL. Therefore LZO install is to be done post-HBase install. See the Using LZO Compression wiki page for how to make LZO work with HBase.

A common problem users run into when using LZO is that while initial setup of the cluster runs smoothly, a month goes by and some sysadmin goes to add a machine to the cluster, only they'll have forgotten to do the LZO fixup on the new machine. In versions since HBase 0.90.0, we should fail in a way that makes it plain what the problem is, but maybe not. Remember you read this paragraph; see hbase.regionserver.codecs for a feature to help protect against failed LZO install.

See also the Compression Appendix at the tail of this book.
+
+ Bigger Regions

Consider going to larger regions to cut down on the total number of regions on your cluster. Generally, fewer Regions to manage makes for a smoother-running cluster (you can always manually split the big Regions later should one prove hot and you want to spread the request load over the cluster). By default, regions are 256MB in size. You could run with 1G. Some run with even larger regions; 4G or even larger. Adjust hbase.hregion.max.filesize in your hbase-site.xml.
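For example, to run with the 1G regions mentioned above (1G expressed in bytes):

<property>
  <name>hbase.hregion.max.filesize</name>
  <value>1073741824</value>
</property>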
+
+ Managed Splitting

Rather than let HBase auto-split your Regions, manage the splitting manually (what follows is taken from the javadoc at the head of the org.apache.hadoop.hbase.util.RegionSplitter tool added to HBase post-0.90.0 release). With growing amounts of data, splits will continually be needed. Since you always know exactly what regions you have, long-term debugging and profiling is much easier with manual splits. It is hard to trace the logs to understand region level problems if it keeps splitting and getting renamed. Data offlining bugs + unknown number of split regions == oh crap! If an HLog or StoreFile was mistakenly unprocessed by HBase due to a weird bug and you notice it a day or so later, you can be assured that the regions specified in these files are the same as the current regions, and you have fewer headaches trying to restore/replay your data. You can finely tune your compaction algorithm. With roughly uniform data growth, it's easy to cause split / compaction storms as the regions all roughly hit the same data size at the same time. With manual splits, you can let staggered, time-based major compactions spread out your network IO load.

How do I turn off automatic splitting? Automatic splitting is determined by the configuration value hbase.hregion.max.filesize. It is not recommended that you set this to Long.MAX_VALUE in case you forget about manual splits. A suggested setting is 100GB, which would result in > 1hr major compactions if reached. A sketch of this setting appears after this discussion.

What's the optimal number of pre-split regions to create? Mileage will vary depending upon your application. You could start low with 10 pre-split regions / server and watch as data grows over time. It's better to err on the side of too few regions and do a rolling split later. A more complicated answer is that this depends upon the largest storefile in your region. With a growing data size, this will get larger over time. You want the largest region to be just big enough that the Store compact selection algorithm only compacts it due to a timed major. If you don't, your cluster can be prone to compaction storms as the algorithm decides to run major compactions on a large series of regions all at once. Note that compaction storms are due to the uniform data growth, not the manual split decision.

If you pre-split your regions too thin, you can increase the major compaction interval by configuring HConstants.MAJOR_COMPACTION_PERIOD. If your data size grows too large, use the (post-0.90.0 HBase) org.apache.hadoop.hbase.util.RegionSplitter script to perform a network-IO-safe rolling split of all regions.
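The near-disable setting suggested above -- 100GB, expressed in bytes -- would go in hbase-site.xml:

<property>
  <name>hbase.hregion.max.filesize</name>
  <value>107374182400</value>
</property>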
+ +
+ +
+
Client configuration and dependencies connecting to an HBase cluster

Since the HBase Master may move around, clients bootstrap by looking in ZooKeeper. Thus clients require the ZooKeeper quorum information in an hbase-site.xml that is on their CLASSPATH. If you are configuring an IDE to run an HBase client, you should include the conf/ directory on your classpath.

Minimally, a client of HBase needs the hbase, hadoop, log4j, commons-logging, and zookeeper jars in its CLASSPATH when connecting to a cluster.

An example basic hbase-site.xml for client-only use might look as follows:

<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
  <property>
    <name>hbase.zookeeper.quorum</name>
    <value>example1,example2,example3</value>
    <description>Comma separated list of servers in the ZooKeeper Quorum.
    </description>
  </property>
</configuration>
+ +
diff --git a/src/docbkx/getting_started.xml b/src/docbkx/getting_started.xml new file mode 100644 index 00000000000..a5e0335ad9c --- /dev/null +++ b/src/docbkx/getting_started.xml @@ -0,0 +1,853 @@ + + + Getting Started +
+ Introduction + + Quick Start will get you up and running + on a single-node instance of HBase using the local filesystem. + The Not-so-quick Start Guide + describes setup of HBase in distributed mode running on top of HDFS. + +
+ +
+ Quick Start + + This guide describes setup of a standalone HBase + instance that uses the local filesystem. It leads you + through creating a table, inserting rows via the + HBase Shell, and then cleaning up and shutting + down your standalone HBase instance. + The below exercise should take no more than + ten minutes (not including download time). + + +
+
 Download and unpack the latest stable release.
 
 Choose a download site from this list of Apache
 Download Mirrors. Click on the suggested top link. This will take you to a
 mirror of HBase Releases. Click on
 the folder named stable and then download the
 file that ends in .tar.gz to your local filesystem;
 e.g. hbase-.tar.gz.
 
 Decompress and untar your download and then change into the
 unpacked directory.
 
 $ tar xfz hbase-.tar.gz
$ cd hbase-
 
 
 
 At this point, you are ready to start HBase. But before starting it,
 you might want to edit conf/hbase-site.xml
 and set the directory you want HBase to write to,
 hbase.rootdir.
 
 
 
 
 
 hbase.rootdir
 file:///DIRECTORY/hbase
 
 
]]>
 
Replace DIRECTORY in the above with a path to a directory where you want
HBase to store its data. By default, hbase.rootdir is
set to /tmp/hbase-${user.name}
which means you'll lose all your data whenever your server reboots
(most operating systems clear /tmp on restart).
 
+
+
Start HBase
 
 Now start HBase:$ ./bin/start-hbase.sh
starting Master, logging to logs/hbase-user-master-example.org.out
 
 You should
 now have a running standalone HBase instance. In standalone mode, HBase runs
 all daemons in the one JVM; i.e. both the HBase and ZooKeeper daemons.
 HBase logs can be found in the logs subdirectory. Check them
 out especially if HBase had trouble starting.
 
 
 Is <application>java</application> installed?
 All of the above presumes a 1.6 version of Oracle
 java is installed on your
 machine and available on your path; i.e. when you type
 java, you see output that describes the options
 the java program takes (HBase requires java 6). If this is
 not the case, HBase will not start.
 Install java, edit conf/hbase-env.sh, uncommenting the
 JAVA_HOME line and pointing it to your java install. Then,
 retry the steps above.
 
+ + +
+
 Shell Exercises
 Connect to your running HBase via the
 HBase Shell.
 
 $ ./bin/hbase shell
HBase Shell; enter 'help<RETURN>' for list of supported commands.
Type "exit<RETURN>" to leave the HBase Shell
Version: 0.89.20100924, r1001068, Fri Sep 24 13:55:42 PDT 2010

hbase(main):001:0>
 
 Type help and then <RETURN>
 to see a listing of shell
 commands and options. Browse at least the paragraphs at the end of
 the help emission for the gist of how variables and command
 arguments are entered into the
 HBase shell; in particular note how table names, rows, and
 columns, etc., must be quoted.
 
 Create a table named test with a single
 column family named cf.
 Verify its creation by listing all tables and then insert some
 values.
 hbase(main):002:0> create 'test', 'cf'
0 row(s) in 1.2200 seconds
hbase(main):003:0> list
test
1 row(s) in 0.0550 seconds
hbase(main):004:0> put 'test', 'row1', 'cf:a', 'value1'
0 row(s) in 0.0560 seconds
hbase(main):005:0> put 'test', 'row2', 'cf:b', 'value2'
0 row(s) in 0.0370 seconds
hbase(main):006:0> put 'test', 'row3', 'cf:c', 'value3'
0 row(s) in 0.0450 seconds
 
 Above we inserted 3 values, one at a time. The first insert is at
 row1, column cf:a with a value of
 value1.
 Columns in HBase are made up of a
 column family prefix
 -- cf in this example -- followed by
 a colon and then a column qualifier suffix (a in this case).
 
 
 Verify the data insert.
 
 Run a scan of the table by doing the following
 
 hbase(main):007:0> scan 'test'
ROW COLUMN+CELL
row1 column=cf:a, timestamp=1288380727188, value=value1
row2 column=cf:b, timestamp=1288380738440, value=value2
row3 column=cf:c, timestamp=1288380747365, value=value3
3 row(s) in 0.0590 seconds
 
 Get a single row as follows
 
 hbase(main):008:0> get 'test', 'row1'
COLUMN CELL
cf:a timestamp=1288380727188, value=value1
1 row(s) in 0.0400 seconds
 
 Now, disable and drop your table. This will clean up all
 we did above.
 
 hbase(main):012:0> disable 'test'
0 row(s) in 1.0930 seconds
hbase(main):013:0> drop 'test'
0 row(s) in 0.0770 seconds
 
 Exit the shell by typing exit.
 
 hbase(main):014:0> exit
+ +
+ Stopping HBase + Stop your hbase instance by running the stop script. + + $ ./bin/stop-hbase.sh +stopping hbase............... +
+ +
Where to go next
 
 The above-described standalone setup is good for testing and experiments only.
 Move on to the next section, the Not-so-quick Start Guide,
 where we'll go into depth on the different HBase run modes, requirements, and the critical
 configurations needed to set up a distributed HBase deploy.
 
+
+ +
+ Not-so-quick Start Guide + +
Requirements + HBase has the following requirements. Please read the + section below carefully and ensure that all requirements have been + satisfied. Failure to do so will cause you (and us) grief debugging + strange errors and/or data loss. + + +
java + + Just like Hadoop, HBase requires java 6 from Oracle. +Usually you'll want to use the latest version available except the problematic u18 (u22 is the latest version as of this writing). +
+ +
<link xlink:href="http://hadoop.apache.org">hadoop</link><indexterm><primary>Hadoop</primary></indexterm>
This version of HBase will only run on Hadoop 0.20.x.
 It will not run on Hadoop 0.21.x (nor 0.22.x) as of this writing.
 HBase will lose data unless it is running on an HDFS that has a
 durable sync. Currently only the
 branch-0.20-append
 branch has this attribute
 
 
 See CHANGES.txt
 in branch-0.20-append to see the list of patches involved in adding append on the Hadoop 0.20 branch.
 
 .
 No official releases have been made from this branch up to now
 so you will have to build your own Hadoop from the tip of this branch.
 Scroll down in the Hadoop How To Release to the section
 Build Requirements for instructions on how to build Hadoop.
 
 
 
 Or rather than build your own, you could use
 Cloudera's CDH3.
 CDH has the 0.20-append patches needed to add a durable sync (CDH3 is still in beta.
 Either CDH3b2 or CDH3b3 will suffice).
 
 
 Because HBase depends on Hadoop, it bundles an instance of
 the Hadoop jar under its lib directory.
 The bundled Hadoop was made from the Apache branch-0.20-append branch
 at the time of this HBase's release.
 It is critical that the version of Hadoop running
 out on your cluster matches the version bundled with HBase. Replace the hadoop
 jar found in the HBase lib directory with the
 hadoop jar you are running out on your cluster to avoid version mismatch issues;
 a sketch of the swap follows this section.
 Make sure you replace the jar all over your cluster.
 For example, versions of CDH do not have HDFS-724 whereas
 Hadoop's branch-0.20-append branch does have HDFS-724. This
 patch changes the RPC version because the protocol was changed.
 Version mismatch issues have various manifestations, but often everything
 simply looks like it is hung up.
 
 
 Can I just replace the jar in Hadoop 0.20.2 tarball with the <emphasis>sync</emphasis>-supporting Hadoop jar found in HBase?
 
 You could do this. It works going by a recent posting up on the
 mailing list.
 
 
 Hadoop Security
 HBase will run on any Hadoop 0.20.x that incorporates Hadoop security features -- e.g. Y! 0.20S or CDH3B3 -- as long
 as you do as suggested above and replace the Hadoop jar that ships with HBase with the secure version.
 
 
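 A hedged sketch of the jar swap; the wildcarded jar names are illustrative
 only and will differ with your Hadoop and HBase versions, so check the actual
 file names under each lib directory first:
 
 # Remove the Hadoop jar bundled with HBase (name is illustrative)
 $ rm $HBASE_HOME/lib/hadoop-*.jar
 # Copy in the jar your cluster is actually running
 $ cp $HADOOP_HOME/hadoop-*core*.jar $HBASE_HOME/lib/
 # Repeat on every node in the cluster, then restart HBase
 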
+
ssh +ssh must be installed and sshd must +be running to use Hadoop's scripts to manage remote Hadoop and HBase daemons. + You must be able to ssh to all nodes, including your local node, using passwordless login (Google "ssh passwordless login"). + +
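 One common way to set this up, sketched here assuming OpenSSH and an RSA key
 (the user and host names are made up):
 
 # Generate a passphrase-less key once, on the node you start the cluster from
 $ ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa
 # Append the public key to authorized_keys on every node, including this one
 $ ssh-copy-id user@node1.example.org
 # Verify: this should log you in without a password prompt
 $ ssh user@node1.example.org
 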
+
DNS
 HBase uses the local hostname to self-report its IP address. Both forward and reverse DNS resolving should work.
 If your machine has multiple interfaces, HBase will use the interface that the primary hostname resolves to.
 If this is insufficient, you can set hbase.regionserver.dns.interface to indicate the primary interface.
 This only works if your cluster
 configuration is consistent and every host has the same network interface configuration.
 Another alternative is setting hbase.regionserver.dns.nameserver to choose a different nameserver than the
 system-wide default.
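 For example, a minimal hbase-site.xml fragment pinning HBase to a particular
 interface and nameserver; the interface name and address below are
 assumptions, not defaults:
 
 <property>
 <name>hbase.regionserver.dns.interface</name>
 <value>eth0</value> <!-- hypothetical interface name -->
 </property>
 <property>
 <name>hbase.regionserver.dns.nameserver</name>
 <value>10.0.0.2</value> <!-- hypothetical nameserver address -->
 </property>
 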
+
NTP
 
 The clocks on cluster members should be in basic alignment. Some skew is tolerable, but
 wild skew can generate odd behaviors. Run NTP
 on your cluster, or an equivalent.
 
 If you are having problems querying data, or seeing "weird" cluster operations, check the system time!
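 A quick way to check, assuming ntpd and its standard ntpq tool are installed
 on each node (the host list is made up):
 
 # Show this node's NTP peers and offsets; a large offset means clock skew
 $ ntpq -p
 # Eyeball wall clocks across the cluster
 $ for h in node1 node2 node3; do ssh $h date; done
 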
+ + +
+
 <varname>ulimit</varname><indexterm><primary>ulimit</primary></indexterm>
 HBase is a database; it uses a lot of files at the same time.
 The default ulimit -n of 1024 on *nix systems is insufficient.
 Any significant amount of loading will lead you to
 FAQ: Why do I see "java.io.IOException...(Too many open files)" in my logs?.
 You may also notice errors such as
 
 2010-04-06 03:04:37,542 INFO org.apache.hadoop.hdfs.DFSClient: Exception increateBlockOutputStream java.io.EOFException
 2010-04-06 03:04:37,542 INFO org.apache.hadoop.hdfs.DFSClient: Abandoning block blk_-6935524980745310745_1391901
 
 Do yourself a favor and change the upper bound on the number of file descriptors.
 Set it to north of 10k. See the above referenced FAQ for how.
 To be clear, upping the file descriptors for the user who is
 running the HBase process is an operating system configuration, not an
 HBase configuration. Also, a common mistake is that administrators
 will up the file descriptors for a particular user but for whatever reason,
 HBase will be running as someone else. HBase prints the ulimit it is seeing
 as the first line of its logs. Ensure that it is correct.
 
 A useful read on setting configuration on your Hadoop cluster is Aaron Kimball's
 Configuration Parameters: What can you just ignore?
 
 
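 To check the current bound, and to raise it for the current session where
 your limits configuration allows (the 32768 figure is a common choice, not a
 requirement):
 
 # Show the per-process file descriptor limit
 $ ulimit -n
 1024
 # Raise it for this shell session; persist it via limits.conf as described below
 $ ulimit -n 32768
 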
+ <varname>ulimit</varname> on Ubuntu + + If you are on Ubuntu you will need to make the following changes: + + In the file /etc/security/limits.conf add a line like: + hadoop - nofile 32768 + Replace hadoop + with whatever user is running Hadoop and HBase. If you have + separate users, you will need 2 entries, one for each user. + + + In the file /etc/pam.d/common-session add as the last line in the file: + session required pam_limits.so + Otherwise the changes in /etc/security/limits.conf won't be applied. + + + Don't forget to log out and back in again for the changes to take effect! + +
+
+ +
+
 <varname>dfs.datanode.max.xcievers</varname><indexterm><primary>xcievers</primary></indexterm>
 
 A Hadoop HDFS datanode has an upper bound on the number of files
 that it will serve at any one time.
 The upper bound parameter is called
 xcievers (yes, this is misspelled). Again, before
 doing any loading, make sure you have configured
 Hadoop's conf/hdfs-site.xml,
 setting the xcievers value to at least the following:
 
 <property>
 <name>dfs.datanode.max.xcievers</name>
 <value>4096</value>
 </property>
 
 
 Be sure to restart your HDFS after making the above
 configuration.
 Not having this configuration in place makes for strange-looking
 failures. Eventually you'll see a complaint in the datanode logs
 about the xcievers limit being exceeded, but on the run up to this
 one manifestation is complaints about missing blocks. For example:
 10/12/08 20:10:31 INFO hdfs.DFSClient: Could not obtain block blk_XXXXXXXXXXXXXXXXXXXXXX_YYYYYYYY from any node: java.io.IOException: No live nodes contain current block. Will get new block locations from namenode and retry...
 
+ +
+
Windows
 
HBase has been little tested running on Windows.
Running a production install of HBase on top of
Windows is not recommended.
 
 
If you are running HBase on Windows, you must install
Cygwin
to have a *nix-like environment for the shell scripts. The full details
are explained in the Windows Installation
guide.
 
+ +
+ +
HBase run modes: Standalone and Distributed
 HBase has two run modes: standalone
 and distributed.
 Out of the box, HBase runs in standalone mode. To set up a
 distributed deploy, you will need to configure HBase by editing
 files in the HBase conf directory.
 
Whatever your mode, you will need to edit conf/hbase-env.sh
to tell HBase which java to use. In this file
you set HBase environment variables such as the heapsize and other options
for the JVM, the preferred location for log files, etc.
Set JAVA_HOME to point at the root of your
java install, as in the sketch below.
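 For instance, a minimal conf/hbase-env.sh edit might look like the
 following; the JAVA_HOME path is an assumption, so point it at your own
 install:
 
 # The java implementation to use (path is illustrative only)
 export JAVA_HOME=/usr/lib/jvm/java-6-sun
 # Optional: the maximum heap to give HBase, in MB (1000 is the shipped default)
 export HBASE_HEAPSIZE=1000
 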
Standalone HBase + This is the default mode. Standalone mode is + what is described in the quickstart + section. In standalone mode, HBase does not use HDFS -- it uses the local + filesystem instead -- and it runs all HBase daemons and a local zookeeper + all up in the same JVM. Zookeeper binds to a well known port so clients may + talk to HBase. + +
+
Distributed
 Distributed mode can be subdivided into distributed but all daemons run on a
 single node -- a.k.a. pseudo-distributed -- and
 fully-distributed where the daemons
 are spread across all nodes in the cluster
 The pseudo-distributed vs fully-distributed nomenclature comes from Hadoop..
 
 Distributed modes require an instance of the
 Hadoop Distributed File System (HDFS). See the
 Hadoop
 requirements and instructions for how to set up an HDFS.
 Before proceeding, ensure you have an appropriate, working HDFS.
 
 Below we describe the different distributed setups.
 Starting, verification and exploration of your install, whether a
 pseudo-distributed or fully-distributed
 configuration, is described in a section that follows,
 Running and Confirming your Installation.
 The same verification script applies to both deploy types.
 
Pseudo-distributed
A pseudo-distributed mode is simply a distributed mode run on a single host.
Use this configuration for testing and prototyping on HBase. Do not use this configuration
for production nor for evaluating HBase performance.
 
Once you have confirmed your HDFS setup,
edit conf/hbase-site.xml. This is the file
into which you add local customizations and overrides for
Default HBase Configurations
and HDFS Client Configurations.
Point HBase at the running Hadoop HDFS instance by setting the
hbase.rootdir property.
This property points HBase at the Hadoop filesystem instance to use.
For example, adding the properties below to your
hbase-site.xml says that HBase
should use the /hbase
directory in the HDFS whose namenode is at port 9000 on your local machine, and that
it should run with one replica only (recommended for pseudo-distributed mode):
 
<configuration>
  ...
  <property>
    <name>hbase.rootdir</name>
    <value>hdfs://localhost:9000/hbase</value>
    <description>The directory shared by region servers.
    </description>
  </property>
  <property>
    <name>dfs.replication</name>
    <value>1</value>
    <description>The replication count for HLog & HFile storage. Should not be greater than HDFS datanode count.
    </description>
  </property>
  ...
</configuration>
 
 
Let HBase create the hbase.rootdir
directory. If you don't, you'll get a warning saying HBase
needs a migration run because the directory is missing files
expected by HBase (it'll create them if you let it).
 
 
Above we bind to localhost.
This means that a remote client cannot
connect. Amend accordingly, if you want to
connect from a remote location.
 
 
Now skip to Running and Confirming your Installation
for how to start and verify your pseudo-distributed install.
 
 
 See Pseudo-distributed mode extras
for notes on how to start extra Masters and regionservers when running
 pseudo-distributed.
 
 
+ +
Fully-distributed
 
For running a fully-distributed operation on more than one host, make
the following configurations. In hbase-site.xml,
add the property hbase.cluster.distributed
and set it to true and point the HBase
hbase.rootdir at the appropriate
HDFS NameNode and location in HDFS where you would like
HBase to write data. For example, if your namenode were running
at namenode.example.org on port 9000 and you wanted to home
your HBase in HDFS at /hbase,
make the following configuration.
 
<configuration>
  ...
  <property>
    <name>hbase.rootdir</name>
    <value>hdfs://namenode.example.org:9000/hbase</value>
    <description>The directory shared by region servers.
    </description>
  </property>
  <property>
    <name>hbase.cluster.distributed</name>
    <value>true</value>
    <description>The mode the cluster will be in. Possible values are
      false: standalone and pseudo-distributed setups with managed Zookeeper
      true: fully-distributed with unmanaged Zookeeper Quorum (see hbase-env.sh)
    </description>
  </property>
  ...
</configuration>
 
<filename>regionservers</filename>
In addition, a fully-distributed mode requires that you
modify conf/regionservers.
The regionservers file lists all hosts
that you would have running HRegionServers, one host per line
(this file in HBase is like the Hadoop slaves file). All servers
listed in this file will be started and stopped when the HBase cluster
start or stop scripts are run.
+ +
ZooKeeper<indexterm><primary>ZooKeeper</primary></indexterm>
A distributed HBase depends on a running ZooKeeper cluster.
All participating nodes and clients
need to be able to access the running ZooKeeper ensemble.
HBase by default manages a ZooKeeper "cluster" for you.
It will start and stop the ZooKeeper ensemble as part of
the HBase start/stop process. You can also manage
the ZooKeeper ensemble independent of HBase and
just point HBase at the cluster it should use.
To toggle HBase management of ZooKeeper,
use the HBASE_MANAGES_ZK variable in
conf/hbase-env.sh.
This variable, which defaults to true, tells HBase whether to
start/stop the ZooKeeper ensemble servers as part of HBase start/stop.
 
When HBase manages the ZooKeeper ensemble, you can specify ZooKeeper configuration
using its native zoo.cfg file, or, the easier option
is to just specify ZooKeeper options directly in conf/hbase-site.xml.
A ZooKeeper configuration option can be set as a property in the HBase
hbase-site.xml
XML configuration file by prefacing the ZooKeeper option name with
hbase.zookeeper.property.
For example, the clientPort setting in ZooKeeper can be changed by
setting the hbase.zookeeper.property.clientPort property.
 
For all default values used by HBase, including ZooKeeper configuration,
see the section
Default HBase Configurations.
Look for the hbase.zookeeper.property prefix.
 
For the full list of ZooKeeper configurations,
see ZooKeeper's zoo.cfg.
HBase does not ship with a zoo.cfg so you will need to
browse the conf directory in an appropriate ZooKeeper download.
 
 
 
 
 
You must at least list the ensemble servers in hbase-site.xml
using the hbase.zookeeper.quorum property.
This property defaults to a single ensemble member at
localhost which is not suitable for a
fully distributed HBase. (It binds to the local machine only and remote clients
will not be able to connect).
 
How many ZooKeepers should I run?
 
You can run a ZooKeeper ensemble that comprises 1 node only but
in production it is recommended that you run a ZooKeeper ensemble of
3, 5 or 7 machines; the more members an ensemble has, the more
tolerant the ensemble is of host failures. Also, run an odd number of machines:
an ensemble with an even number of peers tolerates no more host failures than
the next smaller odd-sized ensemble, so the extra member buys you nothing. Give each
ZooKeeper server around 1GB of RAM, and if possible, its own dedicated disk
(A dedicated disk is the best thing you can do to ensure a performant ZooKeeper
ensemble). For very heavily loaded clusters, run ZooKeeper servers on separate machines from
RegionServers (DataNodes and TaskTrackers).
 
 
 
 
For example, to have HBase manage a ZooKeeper quorum on nodes
rs{1,2,3,4,5}.example.com, bound to port 2222 (the default is 2181),
ensure HBASE_MANAGES_ZK is commented out or set to
true in conf/hbase-env.sh and
then edit conf/hbase-site.xml and set
hbase.zookeeper.property.clientPort
and
hbase.zookeeper.quorum. You should also
set
hbase.zookeeper.property.dataDir
to other than the default as the default has ZooKeeper persist data under
/tmp which is often cleared on system restart.
In the example below we have ZooKeeper persist to /usr/local/zookeeper.
 
 <configuration>
 ...
 <property>
 <name>hbase.zookeeper.property.clientPort</name>
 <value>2222</value>
 <description>Property from ZooKeeper's config zoo.cfg.
 The port at which the clients will connect.
+ </description> + </property> + <property> + <name>hbase.zookeeper.quorum</name> + <value>rs1.example.com,rs2.example.com,rs3.example.com,rs4.example.com,rs5.example.com</value> + <description>Comma separated list of servers in the ZooKeeper Quorum. + For example, "host1.mydomain.com,host2.mydomain.com,host3.mydomain.com". + By default this is set to localhost for local and pseudo-distributed modes + of operation. For a fully-distributed setup, this should be set to a full + list of ZooKeeper quorum servers. If HBASE_MANAGES_ZK is set in hbase-env.sh + this is the list of servers which we will start/stop ZooKeeper on. + </description> + </property> + <property> + <name>hbase.zookeeper.property.dataDir</name> + <value>/usr/local/zookeeper</value> + <description>Property from ZooKeeper's config zoo.cfg. + The directory where the snapshot is stored. + </description> + </property> + ... + </configuration> + + +
Using existing ZooKeeper ensemble
To point HBase at an existing ZooKeeper cluster,
one that is not managed by HBase,
set HBASE_MANAGES_ZK in
conf/hbase-env.sh to false
 
 ...
 # Tell HBase whether it should manage its own instance of ZooKeeper or not.
 export HBASE_MANAGES_ZK=false
 
Next set ensemble locations and client port, if non-standard,
in hbase-site.xml,
or add a suitably configured zoo.cfg to HBase's CLASSPATH.
HBase will prefer the configuration found in zoo.cfg
over any settings in hbase-site.xml.
 
 
When HBase manages ZooKeeper, it will start/stop the ZooKeeper servers as a part
of the regular start/stop scripts. If you would like to run ZooKeeper yourself,
independent of HBase start/stop, you would do the following
 
${HBASE_HOME}/bin/hbase-daemons.sh {start,stop} zookeeper
 
 
Note that you can use HBase in this manner to spin up a ZooKeeper cluster,
unrelated to HBase. Just make sure to set HBASE_MANAGES_ZK to
false if you want it to stay up across HBase restarts
so that when HBase shuts down, it doesn't take ZooKeeper down with it.
 
For more information about running a distinct ZooKeeper cluster, see
the ZooKeeper Getting Started Guide.
 
+
+ +
+
HDFS Client Configuration
Of note, if you have made HDFS client configuration changes on your Hadoop cluster
-- i.e. configuration you want HDFS clients to use as opposed to server-side configurations --
HBase will not see this configuration unless you do one of the following:
 
 Add a pointer to your HADOOP_CONF_DIR
 to the HBASE_CLASSPATH environment variable
 in hbase-env.sh.
 Add a copy of hdfs-site.xml
 (or hadoop-site.xml) or, better, symlinks,
 under
 ${HBASE_HOME}/conf, or
 if only a small set of HDFS client
 configurations, add them to hbase-site.xml.
 
 
An example of such an HDFS client configuration is dfs.replication. If for example
you want to run with a replication factor of 5, HBase will create files with the default of 3 unless
you do the above to make the configuration available to HBase.
A sketch of the symlink option follows this section.
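 A sketch of the symlink option, assuming the usual HADOOP_HOME and
 HBASE_HOME layout (the paths are illustrative):
 
 # Symlink the cluster's HDFS client config into HBase's conf directory;
 # a symlink, unlike a copy, keeps HBase current with later Hadoop config edits
 $ ln -s $HADOOP_HOME/conf/hdfs-site.xml $HBASE_HOME/conf/hdfs-site.xml
 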
+
+
+ +
Running and Confirming Your Installation
Make sure HDFS is running first.
Start and stop the Hadoop HDFS daemons by running bin/start-dfs.sh
over in the HADOOP_HOME directory.
You can ensure it started properly by testing the put and
get of files into the Hadoop filesystem (see the example commands
following this section).
HBase does not normally use the mapreduce daemons. These do not need to be started.
 
If you are managing your own ZooKeeper, start it
and confirm it's running; otherwise, HBase will start up ZooKeeper for you as part
of its start process.
 
Start HBase with the following command:
bin/start-hbase.sh
Run the above from the HBASE_HOME directory.
 
You should now have a running HBase instance.
HBase logs can be found in the logs subdirectory. Check them
out especially if HBase had trouble starting.
 
HBase also puts up a UI listing vital attributes. By default it's deployed on the Master host
at port 60010 (HBase RegionServers listen on port 60020 by default and put up an informational
http server at 60030). If the Master were running on a host named master.example.org
on the default port, to see the Master's homepage you'd point your browser at
http://master.example.org:60010.
 
Once HBase has started, see the
Shell Exercises section for how to
create tables, add data, scan your insertions, and finally disable and
drop your tables.
 
 
To stop HBase after exiting the HBase shell enter
$ ./bin/stop-hbase.sh
stopping hbase...............
Shutdown can take a moment to complete. It can take longer if your cluster
comprises many machines. If you are running a distributed operation,
be sure to wait until HBase has shut down completely
before stopping the Hadoop daemons.
 
 
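 Returning to the HDFS check mentioned at the top of this section, a quick
 put/get smoke test might look as follows (the file name is arbitrary):
 
 # Write a small file into HDFS, read it back, then clean up
 $ ${HADOOP_HOME}/bin/hadoop fs -put conf/hbase-site.xml /tmp/smoke-test.xml
 $ ${HADOOP_HOME}/bin/hadoop fs -cat /tmp/smoke-test.xml
 $ ${HADOOP_HOME}/bin/hadoop fs -rm /tmp/smoke-test.xml
 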
+
+ + + + + + +
Example Configurations +
Basic Distributed HBase Install + Here is an example basic configuration for a distributed ten node cluster. + The nodes are named example0, example1, etc., through +node example9 in this example. The HBase Master and the HDFS namenode +are running on the node example0. RegionServers run on nodes +example1-example9. +A 3-node ZooKeeper ensemble runs on example1, +example2, and example3 on the +default ports. ZooKeeper data is persisted to the directory +/export/zookeeper. +Below we show what the main configuration files +-- hbase-site.xml, regionservers, and +hbase-env.sh -- found in the HBase +conf directory might look like. + +
<filename>hbase-site.xml</filename>
 
 
 
 
 
 hbase.zookeeper.quorum
 example1,example2,example3
 Comma separated list of servers in the ZooKeeper Quorum.
 
 
 
 hbase.zookeeper.property.dataDir
 /export/zookeeper
 Property from ZooKeeper's config zoo.cfg.
 The directory where the snapshot is stored.
 
 
 
 hbase.rootdir
 hdfs://example0:9000/hbase
 The directory shared by region servers.
 
 
 
 hbase.cluster.distributed
 true
 The mode the cluster will be in. Possible values are
 false: standalone and pseudo-distributed setups with managed Zookeeper
 true: fully-distributed with unmanaged Zookeeper Quorum (see hbase-env.sh)
 
 
 
]]>
 
+ +
<filename>regionservers</filename>
 In this file you list the nodes that will run regionservers. In
 our case we run regionservers on all but the head node
 example0, which is
 carrying the HBase Master and the HDFS namenode.
 
 example1
 example2
 example3
 example4
 example5
 example6
 example7
 example8
 example9
 
+ +
<filename>hbase-env.sh</filename> + Below we use a diff to show the differences from + default in the hbase-env.sh file. Here we are setting +the HBase heap to be 4G instead of the default 1G. + + + + + + Use rsync to copy the content of + the conf directory to + all nodes of the cluster. + +
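 A sketch of the hbase-env.sh diff described above, assuming the stock file
 ships with HBASE_HEAPSIZE commented out at its 1000MB default:
 
 --- hbase-env.sh.orig
 +++ hbase-env.sh
 @@
 -# export HBASE_HEAPSIZE=1000
 +export HBASE_HEAPSIZE=4096
 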
+ +
+ +
+
+ +
diff --git a/src/docbkx/performance.xml b/src/docbkx/performance.xml
new file mode 100644
index 00000000000..f62ce315a6c
--- /dev/null
+++ b/src/docbkx/performance.xml
@@ -0,0 +1,39 @@
+
+
+
+ Performance Tuning
+ Start with the wiki Performance Tuning page.
+ It has a general discussion of the main factors involved: RAM, compression, JVM settings, etc.
+ Afterward, come back here for more pointers.
+
+ Java +
+
 The Garbage Collector and HBase
+
 Long GC pauses
 
 In his presentation,
 Avoiding Full GCs with MemStore-Local Allocation Buffers,
 Todd Lipcon describes two cases of stop-the-world garbage collections common in HBase, especially during loading:
 CMS failure modes and old generation heap fragmentation brought on by the CMS collector. To address the first,
 start the CMS earlier than default by adding -XX:CMSInitiatingOccupancyFraction
 and setting it down from defaults. Start at 60 or 70 percent (the lower you bring down
 the threshold, the more GCing is done, and the more CPU used). To address the second
 issue, fragmentation, Todd added an experimental facility that must be
 explicitly enabled in HBase 0.90.x (it defaults to on in 0.92.x HBase): set
 hbase.hregion.memstore.mslab.enabled to true in your
 Configuration. See the cited slides for background and
 detail.
 
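 For instance, a hedged conf/hbase-env.sh fragment along these lines; the 70
 percent figure is just the starting point suggested above, so tune it for
 your own heap:
 
 # Start CMS collections earlier than default to dodge concurrent-mode failures
 export HBASE_OPTS="$HBASE_OPTS -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70"
 
 And, in HBase 0.90.x, the MSLAB facility is switched on in hbase-site.xml:
 
 <property>
 <name>hbase.hregion.memstore.mslab.enabled</name>
 <value>true</value>
 </property>
 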
+
+
+
diff --git a/src/docbkx/preface.xml b/src/docbkx/preface.xml
new file mode 100644
index 00000000000..16b2390fc03
--- /dev/null
+++ b/src/docbkx/preface.xml
@@ -0,0 +1,27 @@
+
+
+ Preface
+
+ This book aims to be the official guide for the HBase version it ships with.
+ This document describes HBase version .
+ Herein you will find either the definitive documentation on an HBase topic
+ as of its standing when the referenced HBase version shipped, or
+ this book will point to the location in javadoc,
+ JIRA
+ or wiki
+ where the pertinent information can be found.
+
+ This book is a work in progress. It is lacking in many areas but we
+ hope to fill in the holes with time. Feel free to add to this book
+ by attaching a patch to an issue up in the HBase JIRA.
+
diff --git a/src/docbkx/shell.xml b/src/docbkx/shell.xml
new file mode 100644
index 00000000000..e25846b2020
--- /dev/null
+++ b/src/docbkx/shell.xml
@@ -0,0 +1,89 @@
+
+
+ The HBase Shell
+
+
+ The HBase Shell is (J)Ruby's
+ IRB with some HBase-particular verbs added. Anything you can do in
+ IRB, you should be able to do in the HBase Shell.
+ To run the HBase shell,
+ do as follows:
+ $ ./bin/hbase shell
+
+ Type help and then <RETURN>
+ to see a listing of shell
+ commands and options. Browse at least the paragraphs at the end of
+ the help emission for the gist of how variables and command
+ arguments are entered into the
+ HBase shell; in particular note how table names, rows, and
+ columns, etc., must be quoted.
+ See Shell Exercises
+ for examples of basic shell operation.
+
Scripting
 For examples of scripting HBase, look in the
 HBase bin directory. Look at the files
 that end in *.rb. To run one of these
 files, do as follows:
 $ ./bin/hbase org.jruby.Main PATH_TO_SCRIPT
 
+ +
Shell Tricks +
<filename>irbrc</filename>
 Create an .irbrc file for yourself in your
 home directory. Add customizations. A useful one is
 command history, so commands are saved across Shell invocations:
 
 $ more .irbrc
 require 'irb/ext/save-history'
 IRB.conf[:SAVE_HISTORY] = 100
 IRB.conf[:HISTORY_FILE] = "#{ENV['HOME']}/.irb-save-history"
 See the ruby documentation of
 .irbrc to learn about other possible
 configurations.
 
+
LOG data to timestamp
 
 To convert the date '08/08/16 20:56:29' from an HBase log into a timestamp, do:
 
 hbase(main):021:0> import java.text.SimpleDateFormat
 hbase(main):022:0> import java.text.ParsePosition
 hbase(main):023:0> SimpleDateFormat.new("yy/MM/dd HH:mm:ss").parse("08/08/16 20:56:29", ParsePosition.new(0)).getTime() => 1218920189000
 
 
 To go the other direction:
 
 hbase(main):021:0> import java.util.Date
 hbase(main):022:0> Date.new(1218920189000).toString() => "Sat Aug 16 20:56:29 UTC 2008"
 
 
 Outputting in a format that exactly matches the HBase log format will take a little messing with
 SimpleDateFormat.
 
+
Debug +
Shell debug switch + You can set a debug switch in the shell to see more output + -- e.g. more of the stack trace on exception -- + when you run a command: + hbase> debug <RETURN> + +
+
DEBUG log level + To enable DEBUG level logging in the shell, + launch it with the -d option. + $ ./bin/hbase shell -d + +
+
+
+
diff --git a/src/docbkx/upgrading.xml b/src/docbkx/upgrading.xml new file mode 100644 index 00000000000..2c8ca6b7d22 --- /dev/null +++ b/src/docbkx/upgrading.xml @@ -0,0 +1,55 @@ + + + Upgrading + + Review the requirements + section above, in particular the section on Hadoop version. + +
+
 Upgrading to HBase 0.90.x from 0.20.x or 0.89.x
 This version of 0.90.x HBase can be started on data written by
 HBase 0.20.x or HBase 0.89.x. There is no need for a migration step.
 HBase 0.89.x and 0.90.x do write out region directory names
 differently -- they name them with an md5 hash of the region name rather
 than a jenkins hash -- which means that once started, there is no
 going back to HBase 0.20.x.
 
 
 Be sure to remove the hbase-default.xml from
 your conf
 directory on upgrade. A 0.20.x version of this file will have
 sub-optimal configurations for 0.90.x HBase. The
 hbase-default.xml file is now bundled into the
 HBase jar and read from there. If you would like to review
 the content of this file, see it in the src tree at
 src/main/resources/hbase-default.xml or
 see Default HBase Configurations.
 
 
 Finally, if upgrading from 0.20.x, check your
 .META. schema in the shell. In the past we would
 recommend that users run with a 16kb
 MEMSTORE_FLUSHSIZE.
 Run hbase> scan '-ROOT-' in the shell. This will output
 the current .META. schema. Check the
 MEMSTORE_FLUSHSIZE size. Is it 16kb (16384)? If so, you will
 need to change this (the 'normal'/default value is 64MB (67108864)).
 Run the script bin/set_meta_memstore_size.rb.
 This will make the necessary edit to your .META. schema.
 Failure to run this change will make for a slow cluster.
 
 See HBASE-3499 Users upgrading to 0.90.0 need to have their .META. table updated with the right MEMSTORE_SIZE
 
 
 .
 
+