HBASE-3071 Graceful decommissioning of a regionserver
git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1088879 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
b9ee682e70
commit
06a21879a4
|
@ -134,6 +134,7 @@ Release 0.91.0 - Unreleased
|
|||
HBASE-3559 Move report of split to master OFF the heartbeat channel
|
||||
HBASE-3573 Move shutdown messaging OFF heartbeat; prereq for fix of
|
||||
hbase-1502
|
||||
HBASE-3071 Graceful decommissioning of a regionserver
|
||||
|
||||
|
||||
NEW FEATURES
|
||||
|
|
|
@ -0,0 +1,83 @@
|
|||
#!/usr/bin/env bash
|
||||
#
|
||||
#/**
|
||||
# * Copyright 2011 The Apache Software Foundation
|
||||
# *
|
||||
# * Licensed to the Apache Software Foundation (ASF) under one
|
||||
# * or more contributor license agreements. See the NOTICE file
|
||||
# * distributed with this work for additional information
|
||||
# * regarding copyright ownership. The ASF licenses this file
|
||||
# * to you under the Apache License, Version 2.0 (the
|
||||
# * "License"); you may not use this file except in compliance
|
||||
# * with the License. You may obtain a copy of the License at
|
||||
# *
|
||||
# * http://www.apache.org/licenses/LICENSE-2.0
|
||||
# *
|
||||
# * Unless required by applicable law or agreed to in writing, software
|
||||
# * distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# * See the License for the specific language governing permissions and
|
||||
# * limitations under the License.
|
||||
# */
|
||||
|
||||
# Move regions off a server then stop it. Optionally restart and reload.
|
||||
# Turn off the balancer before running this script.
|
||||
# Print command-line help on stdout and terminate the script with status 1.
# Called when no hostname is given or an unknown option is seen.
function usage {
  echo "Usage: graceful_stop.sh [--config <conf-dir>] [--restart] [--reload] [--debug] <hostname>"
  echo " restart     If we should restart after graceful stop"
  echo " reload      Move offloaded regions back on to the stopped server"
  # Previously this line repeated the 'reload' description.
  echo " debug       Display extra debug logging"
  echo " hostname    Hostname of server we are to stop"
  exit 1
}
|
||||
|
||||
# Need at least a hostname on the command line.
if [ $# -lt 1 ]; then
  usage
fi

bin=$(dirname "$0")
bin=$(cd "$bin" >/dev/null; pwd)
# This will set HBASE_HOME, etc.
. "$bin"/hbase-config.sh

# Get arguments
restart=
reload=
debug=
while [ $# -gt 0 ]; do
  case "$1" in
    --restart) restart=true; shift ;;
    --reload) reload=true; shift ;;
    --debug) debug="--debug"; shift ;;
    --) shift; break ;;
    -*) usage ;;
    *) break ;;  # first non-option argument terminates the while loop
  esac
done

# "$@" contains the rest. Must be at least the hostname left.
if [ $# -lt 1 ]; then
  usage
fi
hostname=$1
# File the region mover records moved regions in (read back on reload).
filename="/tmp/$hostname"

# hbase-daemons.sh wants its hosts in a file. Create a private, unpredictable
# one and register its cleanup immediately so it is removed on every exit
# path. Only this run's file is removed, never another run's.
hosts=$(mktemp "/tmp/$(basename "$0").XXXXXX") || exit 1
trap 'rm -f -- "$hosts"' EXIT
echo "$hostname" > "$hosts"

# Arguments common to both region mover invocations; --debug only if asked.
mover_args=("--filename=$filename")
if [ "$debug" != "" ]; then
  mover_args+=("$debug")
fi

# Run the region mover script.
echo "Unloading $hostname region(s)"
HBASE_NOEXEC=true "$bin"/hbase org.jruby.Main "$bin"/region_mover.rb "${mover_args[@]}" unload "$hostname"
echo "Unloaded $hostname region(s)"

# Stop the server.
"$bin"/hbase-daemons.sh --hosts "${hosts}" stop regionserver
if [ "$restart" != "" ]; then
  "$bin"/hbase-daemons.sh --hosts "${hosts}" start regionserver
  # Reloading only makes sense once the server has been restarted.
  if [ "$reload" != "" ]; then
    echo "Reloading $hostname region(s)"
    HBASE_NOEXEC=true "$bin"/hbase org.jruby.Main "$bin"/region_mover.rb "${mover_args[@]}" load "$hostname"
    echo "Reloaded $hostname region(s)"
  fi
fi
|
|
@ -276,5 +276,9 @@ if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then
|
|||
HBASE_OPTS="$HBASE_OPTS -Djava.library.path=$JAVA_LIBRARY_PATH"
|
||||
fi
|
||||
|
||||
# run it
|
||||
# Exec unless HBASE_NOEXEC is set.
|
||||
if [ -z "${HBASE_NOEXEC}" ]; then
  # Default: replace this shell with the JVM.
  exec "$JAVA" $JAVA_HEAP_MAX $HBASE_OPTS -classpath "$CLASSPATH" $CLASS "$@"
else
  # HBASE_NOEXEC set: run the JVM as a child so a calling script keeps control.
  "$JAVA" $JAVA_HEAP_MAX $HBASE_OPTS -classpath "$CLASSPATH" $CLASS "$@"
fi
|
||||
|
|
|
@ -38,7 +38,7 @@ bin=`cd "$bin">/dev/null; pwd`
|
|||
. $bin/hbase-config.sh
|
||||
|
||||
remote_cmd="cd ${HBASE_HOME}; $bin/hbase-daemon.sh --config ${HBASE_CONF_DIR} $@"
|
||||
args="--config ${HBASE_CONF_DIR} $remote_cmd"
|
||||
args="--hosts ${HBASE_REGIONSERVERS} --config ${HBASE_CONF_DIR} $remote_cmd"
|
||||
|
||||
command=$2
|
||||
case $command in
|
||||
|
|
|
@ -0,0 +1,434 @@
|
|||
# Copyright 2011 The Apache Software Foundation
|
||||
#
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
# Moves regions. Will confirm region access in current location and will
|
||||
# not move a new region until successful confirm of region loading in new
|
||||
# location. Presumes balancer is disabled when we run (not harmful if its
|
||||
# on but this script and balancer will end up fighting each other).
|
||||
# Does not work for case of multiple regionservers all running on the
|
||||
# one node.
|
||||
require 'optparse'
|
||||
include Java
|
||||
import org.apache.hadoop.hbase.HConstants
|
||||
import org.apache.hadoop.hbase.HBaseConfiguration
|
||||
import org.apache.hadoop.hbase.client.HBaseAdmin
|
||||
import org.apache.hadoop.hbase.client.Get
|
||||
import org.apache.hadoop.hbase.client.Scan
|
||||
import org.apache.hadoop.hbase.client.HTable
|
||||
import org.apache.hadoop.hbase.client.HConnectionManager
|
||||
import org.apache.hadoop.hbase.filter.FirstKeyOnlyFilter;
|
||||
import org.apache.hadoop.hbase.HServerAddress
|
||||
import org.apache.hadoop.hbase.util.Bytes
|
||||
import org.apache.hadoop.hbase.util.Writables
|
||||
import org.apache.hadoop.conf.Configuration
|
||||
import org.apache.commons.logging.Log
|
||||
import org.apache.commons.logging.LogFactory
|
||||
|
||||
# Name of this script
|
||||
NAME = "region_mover"
|
||||
|
||||
# Get root table reference.
# The HTable is created on first call and kept in the ruby global $ROOT.
def getRootTable(config)
  $ROOT ||= HTable.new(config, HConstants::ROOT_TABLE_NAME)
end
|
||||
|
||||
# Get meta table reference.
# The HTable is created on first call and kept in the ruby global $META.
def getMetaTable(config)
  $META ||= HTable.new(config, HConstants::META_TABLE_NAME)
end
|
||||
|
||||
# Get table instance for the table named by the byte array 'name'.
# Instances are cached in the ruby global hash $TABLES, keyed by the
# table name converted to a String.
def getTable(config, name)
  $TABLES ||= {}
  tableKey = Bytes.toString(name)
  $TABLES[tableKey] ||= HTable.new(config, name)
end
|
||||
|
||||
|
||||
# Returns true if the catalog still shows region 'r' on server 'original'.
# Returns false when no server is recorded for the region, and true when
# a server is recorded but no 'original' was passed to compare against.
def isSameServer(admin, r, original)
  current = getServerNameForRegion(admin, r)
  if not current
    false
  elsif not original
    true
  else
    current == original
  end
end
|
||||
|
||||
# Abortable implementation that just prints the abort reason and
# exception to stdout.
class RubyAbortable
  include org.apache.hadoop.hbase.Abortable

  def abort(why, e)
    message = "ABORTED! why=" + why + ", e=" + e.to_s
    puts message
  end
end
|
||||
|
||||
# Get the servername recorded for region 'r'; this is
# hostname + port + startcode, comma-delimited.
# For the root region the location is read through a RootRegionTracker and
# the returned name ends with a bare ',' (no startcode).
# Can return nil when the catalog row carries no server value.
def getServerNameForRegion(admin, r)
  if r.isRootRegion()
    # Hack: ask zookeeper where root is.
    watcher = admin.getConnection().getZooKeeperWatcher()
    tracker = org.apache.hadoop.hbase.zookeeper.RootRegionTracker.new(watcher, RubyAbortable.new())
    tracker.start()
    sleep 0.1 until tracker.isLocationAvailable()
    # Make a fake servername by appending ','
    rootServer = tracker.getRootRegionLocation().toString() + ","
    tracker.stop()
    return rootServer
  end
  # A meta region is looked up in the root table, any other region in meta.
  catalog = if r.isMetaRegion()
    getRootTable(admin.getConfiguration())
  else
    getMetaTable(admin.getConfiguration())
  end
  get = Get.new(r.getRegionName())
  get.addColumn(HConstants::CATALOG_FAMILY, HConstants::SERVER_QUALIFIER)
  get.addColumn(HConstants::CATALOG_FAMILY, HConstants::STARTCODE_QUALIFIER)
  row = catalog.get(get)
  hostport = row.getValue(HConstants::CATALOG_FAMILY, HConstants::SERVER_QUALIFIER)
  startcode = row.getValue(HConstants::CATALOG_FAMILY, HConstants::STARTCODE_QUALIFIER)
  return nil unless hostport
  # Stored server value is host:port; turn the first ':' into ','.
  java.lang.String.new(Bytes.toString(hostport)).replaceFirst(":", ",") + "," + Bytes.toLong(startcode).to_s
end
|
||||
|
||||
# Tries to scan a row from the passed region.
# Raises whatever the client raises if the scan fails; returns true when
# the scan completed without an exception.
#
# The scanned row itself is not returned: it is nil for an empty region,
# and callers that test the return value would then wrongly treat an
# empty but healthy region as unscannable.
def isSuccessfulScan(admin, r)
  scan = Scan.new(r.getStartKey())
  scan.setBatch(1)
  scan.setCaching(1)
  scan.setFilter(FirstKeyOnlyFilter.new())
  table = getTable(admin.getConfiguration(), r.getTableDesc().getName())
  scanner = table.getScanner(scan)
  begin
    # We might scan into next region, this might be an empty table.
    # But if no exception, presume scanning is working.
    scanner.next()
  ensure
    scanner.close()
    # NOTE(review): this closes an instance that getTable keeps cached;
    # confirm a closed HTable is safe to reuse.
    table.close()
  end
  return true
end
|
||||
|
||||
# Move region 'r' to 'newServer' and wait until the catalog no longer
# shows it on 'original'.
# Retries up to "hbase.move.retries.max" times (default 5), waiting up to
# "hbase.move.wait.max" seconds (default 60) after each attempt.
# Returns early, without verification, if the move call raises
# UndeclaredThrowableException (region probably split or already moved).
# Raises RuntimeError if the region is still on 'original' after all
# attempts. Finishes by scanning the region in its new location.
def move(admin, r, newServer, original)
  # Do it in a loop so can retry if fail. Have seen issue where we tried
  # move region but failed and retry put it back on old location;
  # retry in this case.
  retries = admin.getConfiguration.getInt("hbase.move.retries.max", 5)
  maxWaitInSeconds = admin.getConfiguration.getInt("hbase.move.wait.max", 60)
  count = 0
  same = true
  while count < retries and same
    if count > 0
      $LOG.info("Retry " + count.to_s + " of maximum " + retries.to_s)
    end
    count = count + 1
    begin
      admin.move(Bytes.toBytes(r.getEncodedName()), Bytes.toBytes(newServer))
    rescue java.lang.reflect.UndeclaredThrowableException => e
      # Convert the exception explicitly; String#+ needs a String operand.
      $LOG.info("Exception moving " + r.getEncodedName() +
        "; split/moved? Continuing: " + e.to_s)
      return
    end
    # Wait till its up on new server before moving on
    maxWait = Time.now + maxWaitInSeconds
    while Time.now < maxWait
      same = isSameServer(admin, r, original)
      break unless same
      sleep 0.1
    end
  end
  raise RuntimeError, "Region stuck on #{original}, newserver=#{newServer}" if same
  # Assert can Scan from new location.
  isSuccessfulScan(admin, r)
end
|
||||
|
||||
# Return hostname:port built from the first two comma-separated fields
# of a servername.
def getHostnamePortFromServerName(serverName)
  fields = serverName.split(',')
  host = fields[0]
  port = fields[1]
  host + ":" + port
end
|
||||
|
||||
# Return the hostname portion of a servername (all up to first ',')
def getHostnameFromServerName(serverName)
  serverName.split(',').first
end
|
||||
|
||||
# Return a ruby array with the servername of every server listed in the
# cluster status; a servername is hostname+port+startcode, comma-delimited.
def getServers(admin)
  admin.getClusterStatus().getServerInfo().map { |info| info.getServerName() }
end
|
||||
|
||||
# Remove every servername whose hostname portion matches 'hostname' from
# the passed array of servers (the array is modified in place).
# Returns the removed servername (the last one, if several matched).
# Raises RuntimeError if no server on that hostname is in the list.
def stripServer(servers, hostname)
  # Find the matches first, then remove them: deleting from an array while
  # iterating over it skips the element that follows each deletion.
  matches = servers.select { |server| getHostnameFromServerName(server) == hostname }
  # Check server to exclude is actually present
  raise RuntimeError, "Server %s not online" % hostname if matches.empty?
  servers.reject! { |server| matches.include?(server) }
  return matches.last
end
|
||||
|
||||
# Return the first servername in 'servers' whose hostname portion equals
# the passed hostname. Raises ArgumentError if there is none.
def getServerName(servers, hostname)
  found = servers.find { |candidate| getHostnameFromServerName(candidate) == hostname }
  raise ArgumentError, "Server %s not online" % hostname unless found
  found
end
|
||||
|
||||
# Create the logger for this script. Unless options[:debug] is set, the
# log4j level of "org.apache.hadoop.hbase.client" is set to INFO so the
# client does not spew so much.
def configureLogging(options)
  scriptLog = LogFactory.getLog(NAME)
  if not options[:debug]
    clientLogger = org.apache.log4j.Logger.getLogger("org.apache.hadoop.hbase.client")
    clientLogger.setLevel(org.apache.log4j.Level::INFO)
  end
  scriptLog
end
|
||||
|
||||
# Get a configuration instance tuned for this script.
def getConfiguration()
  conf = HBaseConfiguration.create()
  overrides = [
    # No prefetching on .META.
    ["hbase.client.prefetch.limit", 1],
    # Retry at short intervals many times
    ["hbase.client.pause", 500],
    ["hbase.client.retries.number", 100]
  ]
  overrides.each do |key, value|
    conf.setInt(key, value)
  end
  conf
end
|
||||
|
||||
# Return the regions currently online on 'servername', asked of the
# regionserver itself at the servername's hostname:port.
def getRegions(config, servername)
  conn = HConnectionManager::getConnection(config)
  address = HServerAddress.new(getHostnamePortFromServerName(servername))
  conn.getHRegionConnection(address).getOnlineRegions()
end
|
||||
|
||||
# Delete the named file if it exists; do nothing otherwise.
def deleteFile(filename)
  target = java.io.File.new(filename)
  return unless target.exists()
  target.delete()
end
|
||||
|
||||
# Write HRegionInfo to file
# Need to serialize in case non-printable characters.
# Format is count of regions followed by each serialized region.
# The stream is closed even if a write raises.
def writeFile(filename, regions)
  fos = java.io.FileOutputStream.new(filename)
  dos = java.io.DataOutputStream.new(fos)
  begin
    # Write out a count of regions
    dos.writeInt(regions.size())
    # Write the serialized regions themselves.
    for r in regions
      bytes = Writables.getBytes(r)
      Bytes.writeByteArray(dos, bytes)
    end
  ensure
    dos.close()
  end
end
|
||||
|
||||
# Read back a file written by writeFile: a count followed by that many
# serialized regions.
# Returns a java.util.ArrayList of HRegionInfos; the list is empty when
# the file does not exist. The stream is closed even if a read raises.
def readFile(filename)
  f = java.io.File.new(filename)
  return java.util.ArrayList.new() unless f.exists()
  fis = java.io.FileInputStream.new(f)
  dis = java.io.DataInputStream.new(fis)
  begin
    # Read count of regions
    count = dis.readInt()
    regions = java.util.ArrayList.new(count)
    index = 0
    while index < count
      regions.add(Writables.getHRegionInfo(Bytes.readByteArray(dis)))
      index = index + 1
    end
  ensure
    dis.close()
  end
  return regions
end
|
||||
|
||||
# Move regions off the passed hostname.
# Each region is scanned in place, moved to a randomly chosen other
# server, and verified there before the next one is moved. Keeps cycling
# until the server reports no online regions, then writes the list of
# moved regions to the file named by getFilename(options, hostname).
# Raises RuntimeError if the host is not online or if there is no other
# server to move regions to.
def unloadRegions(options, hostname)
  # Get configuration
  config = getConfiguration()
  # Clean up any old files.
  filename = getFilename(options, hostname)
  deleteFile(filename)
  # Get an admin instance
  admin = HBaseAdmin.new(config)
  servers = getServers(admin)
  # Remove the server we are unloading from from list of servers.
  # Side-effect is the servername that matches this hostname
  servername = stripServer(servers, hostname)
  # Without this check an empty list yields a nil target below and the
  # failure surfaces as an unrelated TypeError.
  raise RuntimeError, "No other server online to move regions of %s to" % hostname if servers.empty?
  movedRegions = java.util.ArrayList.new()
  while true
    rs = getRegions(config, servername)
    break if rs.length == 0
    count = 0
    $LOG.info("Moving " + rs.length.to_s + " region(s) from " + servername +
      " during this cycle");
    for r in rs
      # Get a random server to move the region to.
      server = servers[rand(servers.length)]
      # Count before logging so progress reads "1 of N" .. "N of N", the
      # same as when loading regions.
      count = count + 1
      $LOG.info("Moving region " + r.getEncodedName() + " (" + count.to_s +
        " of " + rs.length.to_s + ") to server=" + server);
      # Assert we can scan region in its current location
      isSuccessfulScan(admin, r)
      # Now move it.
      move(admin, r, server, servername)
      movedRegions.add(r)
    end
  end
  if movedRegions.size() > 0
    # Write out file of regions moved
    writeFile(filename, movedRegions)
    $LOG.info("Wrote list of moved regions to " + filename)
  end
end
|
||||
|
||||
# Move regions to the passed hostname.
# Reads the region list from the file named by getFilename(options,
# hostname); returns immediately if that list is empty. Waits up to
# "hbase.serverstart.wait.max" seconds (default 180) for the host to show
# up in the cluster status, then moves each listed region that can still
# be scanned and is not already on the target server.
# Raises RuntimeError if the host does not come up in time.
def loadRegions(options, hostname)
  # Get configuration
  config = getConfiguration()
  # Get an admin instance
  admin = HBaseAdmin.new(config)
  filename = getFilename(options, hostname)
  regions = readFile(filename)
  return if regions.isEmpty()
  servername = nil
  # Wait till server is up
  maxWaitInSeconds = admin.getConfiguration.getInt("hbase.serverstart.wait.max", 180)
  maxWait = Time.now + maxWaitInSeconds
  while Time.now < maxWait
    servers = getServers(admin)
    begin
      servername = getServerName(servers, hostname)
    rescue ArgumentError => e
      $LOG.info("hostname=" + hostname.to_s + " is not up yet, waiting");
    end
    break if servername
    sleep 0.5
  end
  # Fail with a clear message instead of tripping over a nil servername.
  unless servername
    raise RuntimeError, "Server %s not online after waiting %d seconds" % [hostname, maxWaitInSeconds]
  end
  $LOG.info("Moving " + regions.size().to_s + " regions to " + servername)
  count = 0
  for r in regions
    exists = false
    begin
      isSuccessfulScan(admin, r)
      # Reaching here means the scan raised nothing. Do not rely on the
      # scan's return value: an empty region returns no row, which would
      # wrongly mark the region as missing and leave it where it is.
      exists = true
    rescue org.apache.hadoop.hbase.NotServingRegionException => e
      $LOG.info("Failed scan of " + e.message)
    end
    count = count + 1
    next unless exists
    currentServer = getServerNameForRegion(admin, r)
    if currentServer and currentServer == servername
      $LOG.info("Region " + r.getRegionNameAsString() + " (" + count.to_s +
        " of " + regions.size().to_s + ") already on target server=" + servername)
      next
    end
    $LOG.info("Moving region " + r.getEncodedName() + " (" + count.to_s +
      " of " + regions.size().to_s + ") to server=" + servername);
    move(admin, r, servername, currentServer)
  end
end
|
||||
|
||||
# Return the file that holds the list of moved regions: options[:file]
# when set, otherwise "/tmp/" followed by the target server's name.
def getFilename(options, targetServer)
  options[:file] || ("/tmp/" + targetServer)
end
|
||||
|
||||
|
||||
# Do command-line parsing
options = {}
optparse = OptionParser.new do |opts|
  opts.banner = "Usage: #{NAME}.rb [options] load|unload <hostname>"
  opts.separator 'Load or unload regions by moving one at a time'
  options[:file] = nil
  opts.on('-f', '--filename=FILE', 'File to save regions list into unloading, or read from loading; default /tmp/<hostname>') do |file|
    options[:file] = file
  end
  opts.on('-h', '--help', 'Display usage information') do
    puts opts
    exit
  end
  options[:debug] = false
  opts.on('-d', '--debug', 'Display extra debug logging') do
    options[:debug] = true
  end
end
# parse! strips the recognized options out of ARGV, leaving the
# command (load|unload) and the hostname.
optparse.parse!

# Check ARGVs
if ARGV.length < 2
  puts optparse
  exit 1
end
hostname = ARGV[1]
if not hostname
  # Was 'opts optparse'; 'opts' only exists inside the parser block above.
  puts optparse
  exit 2
end
# Create a logger and save it to ruby global
$LOG = configureLogging(options)
case ARGV[0]
  when 'load'
    loadRegions(options, hostname)
  when 'unload'
    unloadRegions(options, hostname)
  else
    puts optparse
    exit 3
end
|
|
@ -1076,6 +1076,56 @@ public static byte[][] getHexSplits(String startKey, String endKey, int numRegio
|
|||
<section xml:id="compression.tool"><title>Compression Tool</title>
|
||||
<para>See <link linkend="compression.tool" >Compression Tool</link>.</para>
|
||||
</section>
|
||||
<section xml:id="decommission"><title>Node Decommission</title>
|
||||
<para>You can have a node gradually shed its load and then shutdown using the
|
||||
<command>graceful_stop.sh</command> script. Here is its usage:
|
||||
<computeroutput>$ ./bin/graceful_stop.sh
|
||||
Usage: graceful_stop.sh [--config &lt;conf-dir&gt;] [--restart] [--reload] &lt;hostname&gt;
|
||||
restart If we should restart after graceful stop
|
||||
reload Move offloaded regions back on to the stopped server
|
||||
debug Display extra debug logging
|
||||
hostname Hostname of server we are to stop</computeroutput>
|
||||
</para>
|
||||
<para>
|
||||
To decommission a loaded regionserver, run the following:
|
||||
<programlisting>$ ./bin/graceful_stop.sh HOSTNAME</programlisting>
|
||||
where <varname>HOSTNAME</varname> is the host carrying the RegionServer
|
||||
you would decommission. The script will move the regions off the
|
||||
decommissioned regionserver one at a time to minimize region churn.
|
||||
It will verify the region deployed in the new location before it
|
||||
moves the next region and so on until the decommissioned server
|
||||
is carrying zero regions. At this point, the <command>graceful_stop</command>
|
||||
tells the RegionServer to stop. The master will at this point notice the
|
||||
RegionServer gone but all regions will have already been redeployed
|
||||
and because the RegionServer went down cleanly, there will be no
|
||||
WAL logs to split.
|
||||
<note><title>Load Balancer</title>
|
||||
<para>
|
||||
It is assumed that the Region Load Balancer is disabled while the
|
||||
<command>graceful_stop</command> script runs (otherwise the balancer
|
||||
and the decommission script will end up fighting over region deployments).
|
||||
Use the shell to disable the balancer:
|
||||
<programlisting>hbase(main):001:0> balance_switch false
|
||||
true
|
||||
0 row(s) in 0.3590 seconds</programlisting>
|
||||
This turns the balancer OFF. To reenable, do:
|
||||
<programlisting>hbase(main):001:0> balance_switch true
|
||||
false
|
||||
0 row(s) in 0.3590 seconds</programlisting>
|
||||
</para>
|
||||
</note>
|
||||
</para>
|
||||
<para>
|
||||
You can also ask this script to restart a RegionServer after the shutdown
|
||||
AND move its old regions back into place. The latter you might do to
|
||||
retain data locality. A primitive rolling restart might be effected by
|
||||
running something like the following:
|
||||
<programlisting>$ for i in `cat conf/regionservers|sort`; do ./bin/graceful_stop.sh --restart --reload --debug $i; done &amp;> /tmp/log.txt &amp;
|
||||
</programlisting>
|
||||
Tail the output of <filename>/tmp/log.txt</filename> to follow the script's
|
||||
progress.
|
||||
</para>
|
||||
</section>
|
||||
</appendix>
|
||||
|
||||
<appendix xml:id="compression">
|
||||
|
|
Loading…
Reference in New Issue