From 26a45480474d69586583ef72dfeff9f183bba522 Mon Sep 17 00:00:00 2001 From: Doug Meil Date: Fri, 26 Aug 2011 20:45:51 +0000 Subject: [PATCH] HBASE-4262 adding ops_mgt.xml git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1162245 13f79535-47bb-0310-9956-ffa450edef68 --- src/docbkx/book.xml | 213 +----------------------------------- src/docbkx/ops_mgt.xml | 237 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 239 insertions(+), 211 deletions(-) create mode 100644 src/docbkx/ops_mgt.xml diff --git a/src/docbkx/book.xml b/src/docbkx/book.xml index 433ed17c50d..7b5f6d0b3fa 100644 --- a/src/docbkx/book.xml +++ b/src/docbkx/book.xml @@ -1403,216 +1403,7 @@ HTable table2 = new HTable(conf2, "myTable"); - - - Tools - - Here we list HBase tools for administration, analysis, fixup, and - debugging. -
- HBase <application>hbck</application> - An fsck for your HBase install - To run hbck against your HBase cluster run - $ ./bin/hbase hbck - At the end of the commands output it prints OK - or INCONSISTENCY. If your cluster reports - inconsistencies, pass -details to see more detail emitted. - If inconsistencies, run hbck a few times because the - inconsistency may be transient (e.g. cluster is starting up or a region is - splitting). - Passing -fix may correct the inconsistency (This latter - is an experimental feature). - -
-
HFile Tool - See . -
-
- WAL Tools - -
- <classname>HLog</classname> tool - - The main method on HLog offers manual - split and dump facilities. Pass it WALs or the product of a split, the - content of the recovered.edits. directory. - - You can get a textual dump of a WAL file content by doing the - following: $ ./bin/hbase org.apache.hadoop.hbase.regionserver.wal.HLog --dump hdfs://example.org:8020/hbase/.logs/example.org,60020,1283516293161/10.10.21.10%3A60020.1283973724012 The - return code will be non-zero if issues with the file so you can test - wholesomeness of file by redirecting STDOUT to - /dev/null and testing the program return. - - Similarily you can force a split of a log file directory by - doing: $ ./bin/hbase org.apache.hadoop.hbase.regionserver.wal.HLog --split hdfs://example.org:8020/hbase/.logs/example.org,60020,1283516293161/ -
-
-
Compression Tool - See . -
-
Node Decommission - You can stop an individual RegionServer by running the following - script in the HBase directory on the particular node: - $ ./bin/hbase-daemon.sh stop regionserver - The RegionServer will first close all regions and then shut itself down. - On shutdown, the RegionServer's ephemeral node in ZooKeeper will expire. - The master will notice the RegionServer gone and will treat it as - a 'crashed' server; it will reassign the nodes the RegionServer was carrying. - Disable the Load Balancer before Decommissioning a node - If the load balancer runs while a node is shutting down, then - there could be contention between the Load Balancer and the - Master's recovery of the just decommissioned RegionServer. - Avoid any problems by disabling the balancer first. - See below. - - - - - A downside to the above stop of a RegionServer is that regions could be offline for - a good period of time. Regions are closed in order. If many regions on the server, the - first region to close may not be back online until all regions close and after the master - notices the RegionServer's znode gone. In HBase 0.90.2, we added facility for having - a node gradually shed its load and then shutdown itself down. HBase 0.90.2 added the - graceful_stop.sh script. Here is its usage: - $ ./bin/graceful_stop.sh -Usage: graceful_stop.sh [--config &conf-dir>] [--restart] [--reload] [--thrift] [--rest] &hostname> - thrift If we should stop/start thrift before/after the hbase stop/start - rest If we should stop/start rest before/after the hbase stop/start - restart If we should restart after graceful stop - reload Move offloaded regions back on to the stopped server - debug Move offloaded regions back on to the stopped server - hostname Hostname of server we are to stop - - - To decommission a loaded RegionServer, run the following: - $ ./bin/graceful_stop.sh HOSTNAME - where HOSTNAME is the host carrying the RegionServer - you would decommission. - On <varname>HOSTNAME</varname> - The HOSTNAME passed to graceful_stop.sh - must match the hostname that hbase is using to identify RegionServers. - Check the list of RegionServers in the master UI for how HBase is - referring to servers. Its usually hostname but can also be FQDN. - Whatever HBase is using, this is what you should pass the - graceful_stop.sh decommission - script. If you pass IPs, the script is not yet smart enough to make - a hostname (or FQDN) of it and so it will fail when it checks if server is - currently running; the graceful unloading of regions will not run. - - The graceful_stop.sh script will move the regions off the - decommissioned RegionServer one at a time to minimize region churn. - It will verify the region deployed in the new location before it - will moves the next region and so on until the decommissioned server - is carrying zero regions. At this point, the graceful_stop.sh - tells the RegionServer stop. The master will at this point notice the - RegionServer gone but all regions will have already been redeployed - and because the RegionServer went down cleanly, there will be no - WAL logs to split. - Load Balancer - - It is assumed that the Region Load Balancer is disabled while the - graceful_stop script runs (otherwise the balancer - and the decommission script will end up fighting over region deployments). - Use the shell to disable the balancer: - hbase(main):001:0> balance_switch false -true -0 row(s) in 0.3590 seconds -This turns the balancer OFF. 
To reenable, do: - hbase(main):001:0> balance_switch true -false -0 row(s) in 0.3590 seconds - - - -
- Rolling Restart - - You can also ask this script to restart a RegionServer after the shutdown - AND move its old regions back into place. The latter you might do to - retain data locality. A primitive rolling restart might be effected by - running something like the following: - $ for i in `cat conf/regionservers|sort`; do ./bin/graceful_stop.sh --restart --reload --debug $i; done &> /tmp/log.txt & - - Tail the output of /tmp/log.txt to follow the scripts - progress. The above does RegionServers only. Be sure to disable the - load balancer before doing the above. You'd need to do the master - update separately. Do it before you run the above script. - Here is a pseudo-script for how you might craft a rolling restart script: - - Untar your release, make sure of its configuration and - then rsync it across the cluster. If this is 0.90.2, patch it - with HBASE-3744 and HBASE-3756. - - - - Run hbck to ensure the cluster consistent - $ ./bin/hbase hbck - Effect repairs if inconsistent. - - - - Restart the Master: $ ./bin/hbase-daemon.sh stop master; ./bin/hbase-daemon.sh start master - - - - - Disable the region balancer:$ echo "balance_switch false" | ./bin/hbase shell - - - - Run the graceful_stop.sh script per RegionServer. For example: - $ for i in `cat conf/regionservers|sort`; do ./bin/graceful_stop.sh --restart --reload --debug $i; done &> /tmp/log.txt & - - If you are running thrift or rest servers on the RegionServer, pass --thrift or --rest options (See usage - for graceful_stop.sh script). - - - - Restart the Master again. This will clear out dead servers list and reenable the balancer. - - - - Run hbck to ensure the cluster is consistent. - - - - -
- -
-
- CopyTable - - CopyTable is a utility that can copy part or of all of a table, either to the same cluster or another cluster. The usage is as follows: -$ bin/hbase org.apache.hadoop.hbase.mapreduce.CopyTable [--rs.class=CLASS] [--rs.impl=IMPL] [--starttime=X] [--endtime=Y] [--new.name=NEW] [--peer.adr=ADR] tablename - - - - Options: - - rs.class hbase.regionserver.class of the peer cluster. Specify if different from current cluster. - rs.impl hbase.regionserver.impl of the peer cluster. - starttime Beginning of the time range. Without endtime means starttime to forever. - endtime End of the time range. Without endtime means starttime to forever. - new.name New table's name. - peer.adr Address of the peer cluster given in the format hbase.zookeeper.quorum:hbase.zookeeper.client.port:zookeeper.znode.parent - families Comma-separated list of ColumnFamilies to copy. - - Args: - - tablename Name of table to copy. - - - Example of copying 'TestTable' to a cluster that uses replication for a 1 hour window: -$ bin/hbase org.apache.hadoop.hbase.mapreduce.CopyTable ---rs.class=org.apache.hadoop.hbase.ipc.ReplicationRegionInterface ---rs.impl=org.apache.hadoop.hbase.regionserver.replication.ReplicationRegionServer ---starttime=1265875194289 --endtime=1265878794289 ---peer.adr=server1,server2,server3:2181:/hbase TestTable - -
- -
+ @@ -1852,7 +1643,7 @@ When I build, why do I always get Unable to find resource 'VM_global_libra - See HBase Backup Options over on the Sematext Blog. + See diff --git a/src/docbkx/ops_mgt.xml b/src/docbkx/ops_mgt.xml new file mode 100644 index 00000000000..35819fb1672 --- /dev/null +++ b/src/docbkx/ops_mgt.xml @@ -0,0 +1,237 @@ + + + HBase Operational Management + This chapter will cover operational tools and practices required of a running HBase cluster. + The subject of operations is related to the topics of , , + and but is a distinct topic in itself. + +
+ HBase Tools and Utilities + + Here we list HBase tools for administration, analysis, fixup, and + debugging. +
HBase <application>hbck</application>
An fsck for your HBase install
To run hbck against your HBase cluster, run
$ ./bin/hbase hbck
At the end of the command's output it prints OK or INCONSISTENCY. If your cluster reports inconsistencies, pass -details to see more detail emitted. If there are inconsistencies, run hbck a few times because the inconsistency may be transient (e.g. the cluster is starting up or a region is splitting). Passing -fix may correct the inconsistency (note that -fix is an experimental feature).
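The check-and-retry advice above can be scripted. The following is a minimal sketch, not part of HBase: it keys off the OK/INCONSISTENCY summary described above, and the retry count and sleep interval are arbitrary placeholders.
#!/usr/bin/env bash
# Hypothetical helper: run hbck a few times, since a transient condition
# (cluster startup, a region split) can show up as an inconsistency.
for attempt in 1 2 3; do
  out=$(./bin/hbase hbck 2>&1)
  if ! echo "${out}" | grep -q "INCONSISTENC"; then
    echo "attempt ${attempt}: cluster reports OK"
    exit 0
  fi
  echo "attempt ${attempt}: inconsistencies reported, retrying in 60s"
  sleep 60
done
# Still inconsistent after the retries above; emit the detailed report.
./bin/hbase hbck -details
exit 1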
+
HFile Tool
See .

WAL Tools

<classname>HLog</classname> tool
The main method on HLog offers manual split and dump facilities. Pass it WALs or the product of a split, the content of the recovered.edits. directory.
You can get a textual dump of a WAL file's content by doing the following:
$ ./bin/hbase org.apache.hadoop.hbase.regionserver.wal.HLog --dump hdfs://example.org:8020/hbase/.logs/example.org,60020,1283516293161/10.10.21.10%3A60020.1283973724012
The return code will be non-zero if there are issues with the file, so you can test the wholesomeness of a file by redirecting STDOUT to /dev/null and testing the program's return code.
Similarly, you can force a split of a log file directory by doing:
$ ./bin/hbase org.apache.hadoop.hbase.regionserver.wal.HLog --split hdfs://example.org:8020/hbase/.logs/example.org,60020,1283516293161/
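The wholesomeness check described above can be wrapped in a small script. This is a sketch only; the WAL path is the example path from above and would be replaced with the file you want to verify.
#!/usr/bin/env bash
# Hypothetical check: dump a WAL to /dev/null and rely on the HLog tool's
# return code (non-zero indicates a problem reading the file).
WAL="hdfs://example.org:8020/hbase/.logs/example.org,60020,1283516293161/10.10.21.10%3A60020.1283973724012"
if ./bin/hbase org.apache.hadoop.hbase.regionserver.wal.HLog --dump "${WAL}" > /dev/null; then
  echo "WAL looks wholesome: ${WAL}"
else
  echo "WAL dump returned non-zero: ${WAL}" >&2
  exit 1
fi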
+
+
Compression Tool
See .

CopyTable
CopyTable is a utility that can copy part or all of a table, either to the same cluster or to another cluster. The usage is as follows:
$ bin/hbase org.apache.hadoop.hbase.mapreduce.CopyTable [--rs.class=CLASS] [--rs.impl=IMPL] [--starttime=X] [--endtime=Y] [--new.name=NEW] [--peer.adr=ADR] tablename

Options:
 rs.class     hbase.regionserver.class of the peer cluster. Specify if different from the current cluster.
 rs.impl      hbase.regionserver.impl of the peer cluster.
 starttime    Beginning of the time range. Without endtime means from starttime to forever.
 endtime      End of the time range.
 new.name     New table's name.
 peer.adr     Address of the peer cluster given in the format hbase.zookeeper.quorum:hbase.zookeeper.client.port:zookeeper.znode.parent
 families     Comma-separated list of ColumnFamilies to copy.

Args:
 tablename    Name of the table to copy.

Example of copying 'TestTable' to a cluster that uses replication for a 1-hour window:
$ bin/hbase org.apache.hadoop.hbase.mapreduce.CopyTable --rs.class=org.apache.hadoop.hbase.ipc.ReplicationRegionInterface --rs.impl=org.apache.hadoop.hbase.regionserver.replication.ReplicationRegionServer --starttime=1265875194289 --endtime=1265878794289 --peer.adr=server1,server2,server3:2181:/hbase TestTable
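As a second, hedged illustration using only the options listed above, the following sketch copies just the cf1 and cf2 column families of 'TestTable' into a table named 'TestTableCopy' on the same cluster. The table and family names are made-up placeholders, and the destination table generally needs to exist with matching column families before the copy runs.
$ bin/hbase org.apache.hadoop.hbase.mapreduce.CopyTable --new.name=TestTableCopy --families=cf1,cf2 TestTable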
+
+ +
Node Management +
Node Decommission
You can stop an individual RegionServer by running the following script in the HBase directory on the particular node:
$ ./bin/hbase-daemon.sh stop regionserver
The RegionServer will first close all regions and then shut itself down. On shutdown, the RegionServer's ephemeral node in ZooKeeper will expire. The master will notice the RegionServer is gone and will treat it as a 'crashed' server; it will reassign the regions the RegionServer was carrying.
Disable the Load Balancer before Decommissioning a node
If the load balancer runs while a node is shutting down, there could be contention between the Load Balancer and the Master's recovery of the just-decommissioned RegionServer. Avoid any problems by disabling the balancer first. See below.

A downside to the above stop of a RegionServer is that regions could be offline for a good period of time. Regions are closed in order. If there are many regions on the server, the first region to close may not be back online until all regions close and the master notices the RegionServer's znode is gone. In HBase 0.90.2, we added a facility for having a node gradually shed its load and then shut itself down: the graceful_stop.sh script. Here is its usage:
$ ./bin/graceful_stop.sh
Usage: graceful_stop.sh [--config <conf-dir>] [--restart] [--reload] [--thrift] [--rest] <hostname>
 thrift      If we should stop/start thrift before/after the hbase stop/start
 rest        If we should stop/start rest before/after the hbase stop/start
 restart     If we should restart after graceful stop
 reload      Move offloaded regions back on to the stopped server
 debug       Move offloaded regions back on to the stopped server
 hostname    Hostname of server we are to stop

To decommission a loaded RegionServer, run the following:
$ ./bin/graceful_stop.sh HOSTNAME
where HOSTNAME is the host carrying the RegionServer you would like to decommission.
On <varname>HOSTNAME</varname>
The HOSTNAME passed to graceful_stop.sh must match the hostname that HBase is using to identify RegionServers. Check the list of RegionServers in the master UI for how HBase is referring to servers. It's usually a hostname but can also be an FQDN. Whatever HBase is using, this is what you should pass to the graceful_stop.sh decommission script. If you pass IPs, the script is not yet smart enough to turn them into a hostname (or FQDN), so it will fail when it checks whether the server is currently running; the graceful unloading of regions will not run.

The graceful_stop.sh script will move the regions off the decommissioned RegionServer one at a time to minimize region churn. It will verify a region is deployed in its new location before it moves the next region, and so on, until the decommissioned server is carrying zero regions. At this point, graceful_stop.sh tells the RegionServer to stop. The master will notice the RegionServer is gone, but all regions will have already been redeployed, and because the RegionServer went down cleanly, there will be no WAL logs to split.
Load Balancer
It is assumed that the Region Load Balancer is disabled while the graceful_stop script runs (otherwise the balancer and the decommission script will end up fighting over region deployments). Use the shell to disable the balancer:
hbase(main):001:0> balance_switch false
true
0 row(s) in 0.3590 seconds
This turns the balancer OFF.
To reenable, do:
hbase(main):001:0> balance_switch true
false
0 row(s) in 0.3590 seconds
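Putting the pieces above together, decommissioning a single node typically looks like the following sketch. rs1.example.com is a placeholder hostname; substitute the name shown in the master UI, and remember to re-enable the balancer when you are done.
# Disable the region balancer so it does not fight the decommission.
$ echo "balance_switch false" | ./bin/hbase shell

# Gracefully unload and stop the RegionServer on the target host.
$ ./bin/graceful_stop.sh rs1.example.com

# Once maintenance is complete, re-enable the balancer.
$ echo "balance_switch true" | ./bin/hbase shell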
+
Rolling Restart
You can also ask this script to restart a RegionServer after the shutdown AND move its old regions back into place. The latter you might do to retain data locality. A primitive rolling restart might be effected by running something like the following:
$ for i in `cat conf/regionservers|sort`; do ./bin/graceful_stop.sh --restart --reload --debug $i; done &> /tmp/log.txt &
Tail the output of /tmp/log.txt to follow the script's progress. The above does RegionServers only. Be sure to disable the load balancer before doing the above. You'd need to do the master update separately, and do it before you run the above script.
Here is a pseudo-script for how you might craft a rolling restart; a runnable sketch of these steps follows the list below.
1. Untar your release, make sure of its configuration, and then rsync it across the cluster. If this is 0.90.2, patch it with HBASE-3744 and HBASE-3756.
2. Run hbck to ensure the cluster is consistent:
$ ./bin/hbase hbck
Effect repairs if inconsistent.
3. Restart the Master:
$ ./bin/hbase-daemon.sh stop master; ./bin/hbase-daemon.sh start master
4. Disable the region balancer:
$ echo "balance_switch false" | ./bin/hbase shell
5. Run the graceful_stop.sh script per RegionServer. For example:
$ for i in `cat conf/regionservers|sort`; do ./bin/graceful_stop.sh --restart --reload --debug $i; done &> /tmp/log.txt &
If you are running thrift or rest servers on the RegionServer, pass the --thrift or --rest options (see the usage for the graceful_stop.sh script).
6. Restart the Master again. This will clear out the dead servers list and re-enable the balancer.
7. Run hbck to ensure the cluster is consistent.
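Here is the promised sketch of the steps above as a single script. It is illustrative only: /path/to/hbase is a placeholder, the new release is assumed to have already been unpacked, configured, and rsynced (step 1), and you would add --thrift or --rest to the graceful_stop.sh invocation if those servers run on your RegionServers.
#!/usr/bin/env bash
# Hypothetical rolling-restart sketch following steps 2-7 above.
set -e
cd /path/to/hbase   # placeholder: your HBase install directory on this host

# Step 2: report cluster consistency before starting; repair by hand if needed.
./bin/hbase hbck

# Step 3: restart the Master on the new code.
./bin/hbase-daemon.sh stop master; ./bin/hbase-daemon.sh start master

# Step 4: disable the region balancer.
echo "balance_switch false" | ./bin/hbase shell

# Step 5: gracefully restart each RegionServer, moving its regions back afterwards.
for i in $(cat conf/regionservers | sort); do
  ./bin/graceful_stop.sh --restart --reload --debug "$i"
done &> /tmp/log.txt

# Step 6: restart the Master again to clear the dead servers list and
# re-enable the balancer.
./bin/hbase-daemon.sh stop master; ./bin/hbase-daemon.sh start master

# Step 7: verify the cluster is consistent.
./bin/hbase hbck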
+
+ +
+ HBase Monitoring + TODO + +
+
+ HBase Backup + See HBase Backup Options over on the Sematext Blog. + +
+ +