From 06a21879a45d6af230d01c02d088e9a374d2753f Mon Sep 17 00:00:00 2001 From: Michael Stack Date: Tue, 5 Apr 2011 04:08:24 +0000 Subject: [PATCH] HBASE-3071 Graceful decommissioning of a regionserver git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1088879 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 1 + bin/graceful_stop.sh | 83 +++++++++ bin/hbase | 8 +- bin/hbase-daemons.sh | 2 +- bin/region_mover.rb | 434 +++++++++++++++++++++++++++++++++++++++++++ src/docbkx/book.xml | 50 +++++ 6 files changed, 575 insertions(+), 3 deletions(-) create mode 100644 bin/graceful_stop.sh create mode 100644 bin/region_mover.rb diff --git a/CHANGES.txt b/CHANGES.txt index 47b9924d6e1..002ad1cf462 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -134,6 +134,7 @@ Release 0.91.0 - Unreleased HBASE-3559 Move report of split to master OFF the heartbeat channel HBASE-3573 Move shutdown messaging OFF hearbeat; prereq for fix of hbase-1502 + HBASE-3071 Graceful decommissioning of a regionserver NEW FEATURES diff --git a/bin/graceful_stop.sh b/bin/graceful_stop.sh new file mode 100644 index 00000000000..d51b86f9e9e --- /dev/null +++ b/bin/graceful_stop.sh @@ -0,0 +1,83 @@ +#!/usr/bin/env bash +# +#/** +# * Copyright 2011 The Apache Software Foundation +# * +# * Licensed to the Apache Software Foundation (ASF) under one +# * or more contributor license agreements. See the NOTICE file +# * distributed with this work for additional information +# * regarding copyright ownership. The ASF licenses this file +# * to you under the Apache License, Version 2.0 (the +# * "License"); you may not use this file except in compliance +# * with the License. You may obtain a copy of the License at +# * +# * http://www.apache.org/licenses/LICENSE-2.0 +# * +# * Unless required by applicable law or agreed to in writing, software +# * distributed under the License is distributed on an "AS IS" BASIS, +# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# * See the License for the specific language governing permissions and
+# * limitations under the License.
+# */
+
+# Move regions off a server then stop it. Optionally restart and reload.
+# Turn off the balancer before running this script.
+function usage {
+  echo "Usage: graceful_stop.sh [--config <conf-dir>] [--restart] [--reload] <hostname>"
+  echo " restart     If we should restart after graceful stop"
+  echo " reload      Move offloaded regions back on to the stopped server"
+  echo " debug       Display extra debug logging"
+  echo " hostname    Hostname of server we are to stop"
+  exit 1
+}
+
+if [ $# -lt 1 ]; then
+  usage
+fi
+
+bin=`dirname "$0"`
+bin=`cd "$bin">/dev/null; pwd`
+# This will set HBASE_HOME, etc.
+. "$bin"/hbase-config.sh
+# Get arguments
+restart=
+reload=
+debug=
+while [ $# -gt 0 ]
+do
+  case "$1" in
+    --restart)  restart=true; shift;;
+    --reload)   reload=true; shift;;
+    --debug)    debug="--debug"; shift;;
+    --) shift; break;;
+    -*) usage ;;
+    *)  break;; # terminate while loop
+  esac
+done
+
+# "$@" contains the rest. Must be at least the hostname left.
+if [ $# -lt 1 ]; then
+  usage
+fi
+
+hostname=$1
+filename="/tmp/$hostname"
+# Run the region mover script.
+# $debug is deliberately left unquoted: when empty it must expand to no argument at all.
+echo "Unloading $hostname region(s)"
+HBASE_NOEXEC=true "$bin"/hbase org.jruby.Main "$bin"/region_mover.rb --file="$filename" $debug unload "$hostname"
+echo "Unloaded $hostname region(s)"
+# Stop the server. Have to put hostname into its own little file for hbase-daemons.sh
+hosts=$(mktemp "/tmp/$(basename "$0").XXXXXX") || exit 1
+# Remove only our own hosts file, on every exit path from here on.
+trap 'rm -f "$hosts"' EXIT
+echo "$hostname" > "$hosts"
+"$bin"/hbase-daemons.sh --hosts "${hosts}" stop regionserver
+if [ "$restart" != "" ]; then
+  "$bin"/hbase-daemons.sh --hosts "${hosts}" start regionserver
+  if [ "$reload" != "" ]; then
+    echo "Reloading $hostname region(s)"
+    HBASE_NOEXEC=true "$bin"/hbase org.jruby.Main "$bin"/region_mover.rb --file="$filename" $debug load "$hostname"
+    echo "Reloaded $hostname region(s)"
+  fi
+fi
diff --git a/bin/hbase b/bin/hbase
index 674dfa36ac3..aac8da6186c 100755
--- a/bin/hbase
+++ b/bin/hbase
@@ -276,5 +276,9 @@ if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then
   HBASE_OPTS="$HBASE_OPTS -Djava.library.path=$JAVA_LIBRARY_PATH"
 fi
 
-# run it
-exec "$JAVA" $JAVA_HEAP_MAX $HBASE_OPTS -classpath "$CLASSPATH" $CLASS "$@"
+# Exec unless HBASE_NOEXEC is set.
+if [ "${HBASE_NOEXEC}" != "" ]; then
+  "$JAVA" $JAVA_HEAP_MAX $HBASE_OPTS -classpath "$CLASSPATH" $CLASS "$@"
+else
+  exec "$JAVA" $JAVA_HEAP_MAX $HBASE_OPTS -classpath "$CLASSPATH" $CLASS "$@"
+fi
diff --git a/bin/hbase-daemons.sh b/bin/hbase-daemons.sh
index 6a447554bc9..843eaaa74ff 100755
--- a/bin/hbase-daemons.sh
+++ b/bin/hbase-daemons.sh
@@ -38,7 +38,7 @@ bin=`cd "$bin">/dev/null; pwd`
 . $bin/hbase-config.sh
 
 remote_cmd="cd ${HBASE_HOME}; $bin/hbase-daemon.sh --config ${HBASE_CONF_DIR} $@"
-args="--config ${HBASE_CONF_DIR} $remote_cmd"
+args="--hosts ${HBASE_REGIONSERVERS} --config ${HBASE_CONF_DIR} $remote_cmd"
 
 command=$2
 case $command in
diff --git a/bin/region_mover.rb b/bin/region_mover.rb
new file mode 100644
index 00000000000..87421903364
--- /dev/null
+++ b/bin/region_mover.rb
@@ -0,0 +1,434 @@
+# Copyright 2011 The Apache Software Foundation
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Moves regions. Will confirm region access in current location and will
+# not move a new region until successful confirm of region loading in new
+# location. Presumes balancer is disabled when we run (not harmful if its
+# on but this script and balancer will end up fighting each other).
+# Does not work for case of multiple regionservers all running on the
+# one node.
+require 'optparse'
+include Java
+import org.apache.hadoop.hbase.HConstants
+import org.apache.hadoop.hbase.HBaseConfiguration
+import org.apache.hadoop.hbase.client.HBaseAdmin
+import org.apache.hadoop.hbase.client.Get
+import org.apache.hadoop.hbase.client.Scan
+import org.apache.hadoop.hbase.client.HTable
+import org.apache.hadoop.hbase.client.HConnectionManager
+import org.apache.hadoop.hbase.filter.FirstKeyOnlyFilter;
+import org.apache.hadoop.hbase.HServerAddress
+import org.apache.hadoop.hbase.util.Bytes
+import org.apache.hadoop.hbase.util.Writables
+import org.apache.hadoop.conf.Configuration
+import org.apache.commons.logging.Log
+import org.apache.commons.logging.LogFactory
+
+# Name of this script
+NAME = "region_mover"
+
+# Get root table reference
+def getRootTable(config)
+  # Keep root reference in ruby global
+  $ROOT ||= HTable.new(config, HConstants::ROOT_TABLE_NAME)
+  return $ROOT
+end
+
+# Get meta table reference
+def getMetaTable(config)
+  # Keep meta reference in ruby global
+  $META ||= HTable.new(config, HConstants::META_TABLE_NAME)
+  return $META
+end
+
+# Get table instance.
+# Maintains cache of table instances.
+def getTable(config, name)
+  # Keep dictionary of tables in ruby global, keyed by table name as a String
+  $TABLES ||= {}
+  key = Bytes.toString(name)
+  $TABLES[key] ||= HTable.new(config, name)
+  return $TABLES[key]
+end
+
+
+# Returns true if passed region is still on 'original' when we look at .META.
+def isSameServer(admin, r, original)
+  server = getServerNameForRegion(admin, r)
+  return false unless server
+  return true unless original
+  return server == original
+end
+
+# Abortable that only prints the abort reason; handed to RootRegionTracker below.
+class RubyAbortable
+  include org.apache.hadoop.hbase.Abortable
+  def abort(why, e)
+    puts "ABORTED! why=" + why + ", e=" + e.to_s
+  end
+end
+
+# Get servername that is up in .META.; this is hostname + port + startcode comma-delimited.
+# Can return nil
+def getServerNameForRegion(admin, r)
+  if r.isRootRegion()
+    # Hack
+    tracker = org.apache.hadoop.hbase.zookeeper.RootRegionTracker.new(admin.getConnection().getZooKeeperWatcher(), RubyAbortable.new())
+    tracker.start()
+    while not tracker.isLocationAvailable()
+      sleep 0.1
+    end
+    # Make a fake servername by appending ','
+    rootServer = tracker.getRootRegionLocation().toString() + ","
+    tracker.stop()
+    return rootServer
+  end
+  table = nil
+  if r.isMetaRegion()
+    table = getRootTable(admin.getConfiguration())
+  else
+    table = getMetaTable(admin.getConfiguration())
+  end
+  g = Get.new(r.getRegionName())
+  g.addColumn(HConstants::CATALOG_FAMILY, HConstants::SERVER_QUALIFIER)
+  g.addColumn(HConstants::CATALOG_FAMILY, HConstants::STARTCODE_QUALIFIER)
+  result = table.get(g)
+  server = result.getValue(HConstants::CATALOG_FAMILY, HConstants::SERVER_QUALIFIER)
+  startcode = result.getValue(HConstants::CATALOG_FAMILY, HConstants::STARTCODE_QUALIFIER)
+  return nil unless server
+  return java.lang.String.new(Bytes.toString(server)).replaceFirst(":", ",") + "," + Bytes.toLong(startcode).to_s
+end
+
+# Trys to scan a row from passed region
+# Throws exception if can't; the row returned may be nil and is NOT a success flag.
+def isSuccessfulScan(admin, r)
+  scan = Scan.new(r.getStartKey())
+  scan.setBatch(1)
+  scan.setCaching(1)
+  scan.setFilter(FirstKeyOnlyFilter.new())
+  table = getTable(admin.getConfiguration(), r.getTableDesc().getName())
+  scanner = table.getScanner(scan)
+  begin
+    scanner.next()
+    # We might scan into next region, this might be an empty table.
+    # But if no exception, presume scanning is working.
+  ensure
+    scanner.close()
+    # NOTE(review): table comes from the getTable cache yet is closed here -- confirm reuse after close is safe
+    table.close()
+  end
+end
+
+# Check region has moved successful and is indeed hosted on another server
+# Wait until that is the case.
+def move(admin, r, newServer, original)
+  # Now move it. Do it in a loop so can retry if fail. Have seen issue where
+  # we tried move region but failed and retry put it back on old location;
+  # retry in this case.
+  retries = admin.getConfiguration.getInt("hbase.move.retries.max", 5)
+  count = 0
+  same = true
+  while count < retries and same
+    if count > 0
+      $LOG.info("Retry " + count.to_s + " of maximum " + retries.to_s)
+    end
+    count = count + 1
+    begin
+      admin.move(Bytes.toBytes(r.getEncodedName()), Bytes.toBytes(newServer))
+    rescue java.lang.reflect.UndeclaredThrowableException => e
+      $LOG.info("Exception moving " + r.getEncodedName() +
+        "; split/moved? Continuing: " + e.to_s)
+      return
+    end
+    # Wait till its up on new server before moving on
+    maxWaitInSeconds = admin.getConfiguration.getInt("hbase.move.wait.max", 60)
+    maxWait = Time.now + maxWaitInSeconds
+    while Time.now < maxWait
+      same = isSameServer(admin, r, original)
+      break unless same
+      sleep 0.1
+    end
+  end
+  raise RuntimeError, "Region stuck on #{original}, newserver=#{newServer}" if same
+  # Assert can Scan from new location.
+  isSuccessfulScan(admin, r)
+end
+
+# Return the hostname:port out of a servername (first two ','-separated fields)
+def getHostnamePortFromServerName(serverName)
+  parts = serverName.split(',')
+  return parts[0] + ":" + parts[1]
+end
+
+# Return the hostname portion of a servername (all up to first ',')
+def getHostnameFromServerName(serverName)
+  return serverName.split(',')[0]
+end
+
+# Return array of servernames where servername is hostname+port+startcode
+# comma-delimited
+def getServers(admin)
+  serverInfos = admin.getClusterStatus().getServerInfo()
+  servers = []
+  for server in serverInfos
+    servers << server.getServerName()
+  end
+  return servers
+end
+
+# Remove the servername whose hostname portion matches from the passed
+# array of servers. Returns as side-effect the servername removed.
+def stripServer(servers, hostname)
+  count = servers.length
+  servername = nil
+  for server in servers.dup # iterate a copy: we delete from 'servers' in the loop
+    if getHostnameFromServerName(server) == hostname
+      servername = servers.delete(server)
+    end
+  end
+  # Check server to exclude is actually present
+  raise RuntimeError, "Server %s not online" % hostname unless servers.length < count
+  return servername
+end
+
+# Return servername that matches passed hostname
+def getServerName(servers, hostname)
+  servername = nil
+  for server in servers
+    if getHostnameFromServerName(server) == hostname
+      servername = server
+      break
+    end
+  end
+  raise ArgumentError, "Server %s not online" % hostname unless servername
+  return servername
+end
+
+# Create a logger and disable the DEBUG-level annoying client logging
+def configureLogging(options)
+  apacheLogger = LogFactory.getLog(NAME)
+  # Configure log4j to not spew so much
+  unless (options[:debug])
+    logger = org.apache.log4j.Logger.getLogger("org.apache.hadoop.hbase.client")
+    logger.setLevel(org.apache.log4j.Level::INFO)
+  end
+  return apacheLogger
+end
+
+# Get configuration instance
+def getConfiguration()
+  config = HBaseConfiguration.create()
+  # No prefetching on .META.
+  config.setInt("hbase.client.prefetch.limit", 1)
+  # Make a config that retries at short intervals many times
+  config.setInt("hbase.client.pause", 500)
+  config.setInt("hbase.client.retries.number", 100)
+  return config
+end
+
+# Now get list of regions on targetServer
+def getRegions(config, servername)
+  connection = HConnectionManager::getConnection(config)
+  hsa = HServerAddress.new(getHostnamePortFromServerName(servername))
+  rs = connection.getHRegionConnection(hsa)
+  return rs.getOnlineRegions()
+end
+
+# Delete the passed file if it exists
+def deleteFile(filename)
+  f = java.io.File.new(filename)
+  f.delete() if f.exists()
+end
+
+# Write HRegionInfo to file
+# Need to serialize in case non-printable characters.
+# Format is count of regionnames followed by serialized regionnames.
+def writeFile(filename, regions)
+  fos = java.io.FileOutputStream.new(filename)
+  dos = java.io.DataOutputStream.new(fos)
+  # Write out a count of region names
+  dos.writeInt(regions.size())
+  # Write actual region names.
+  for r in regions
+    bytes = Writables.getBytes(r)
+    Bytes.writeByteArray(dos, bytes)
+  end
+  dos.close()
+end
+
+# See writeFile above.
+# Returns array of HRegionInfos
+def readFile(filename)
+  f = java.io.File.new(filename)
+  return java.util.ArrayList.new() unless f.exists()
+  fis = java.io.FileInputStream.new(f)
+  dis = java.io.DataInputStream.new(fis)
+  # Read count of regions
+  count = dis.readInt()
+  regions = java.util.ArrayList.new(count)
+  index = 0
+  while index < count
+    regions.add(Writables.getHRegionInfo(Bytes.readByteArray(dis)))
+    index = index + 1
+  end
+  dis.close()
+  return regions
+end
+
+# Move regions off the passed hostname
+def unloadRegions(options, hostname)
+  # Get configuration
+  config = getConfiguration()
+  # Clean up any old files.
+  filename = getFilename(options, hostname)
+  deleteFile(filename)
+  # Get an admin instance
+  admin = HBaseAdmin.new(config)
+  servers = getServers(admin)
+  # Remove the server we are unloading from from list of servers.
+  # Side-effect is the servername that matches this hostname
+  servername = stripServer(servers, hostname)
+  movedRegions = java.util.ArrayList.new()
+  while true
+    rs = getRegions(config, servername)
+    break if rs.length == 0
+    count = 0
+    $LOG.info("Moving " + rs.length.to_s + " region(s) from " + servername +
+      " during this cycle");
+    for r in rs
+      # Get a random server to move the region to.
+      server = servers[rand(servers.length)]
+      $LOG.info("Moving region " + r.getEncodedName() + " (" + count.to_s +
+        " of " + rs.length.to_s + ") to server=" + server);
+      count = count + 1
+      # Assert we can scan region in its current location
+      isSuccessfulScan(admin, r)
+      # Now move it.
+      move(admin, r, server, servername)
+      movedRegions.add(r)
+    end
+  end
+  if movedRegions.size() > 0
+    # Write out file of regions moved
+    writeFile(filename, movedRegions)
+    $LOG.info("Wrote list of moved regions to " + filename)
+  end
+end
+
+# Move regions to the passed hostname
+def loadRegions(options, hostname)
+  # Get configuration
+  config = getConfiguration()
+  # Get an admin instance
+  admin = HBaseAdmin.new(config)
+  filename = getFilename(options, hostname)
+  regions = readFile(filename)
+  return if regions.isEmpty()
+  servername = nil
+  # Wait till server is up
+  maxWaitInSeconds = admin.getConfiguration.getInt("hbase.serverstart.wait.max", 180)
+  maxWait = Time.now + maxWaitInSeconds
+  while Time.now < maxWait
+    servers = getServers(admin)
+    begin
+      servername = getServerName(servers, hostname)
+    rescue ArgumentError => e
+      $LOG.info("hostname=" + hostname.to_s + " is not up yet, waiting");
+    end
+    break if servername
+    sleep 0.5
+  end
+  raise RuntimeError, "Server %s not online" % hostname unless servername
+  $LOG.info("Moving " + regions.size().to_s + " regions to " + servername)
+  count = 0
+  for r in regions
+    exists = false
+    begin
+      # No exception means the region is being served, even if the scan returned no row.
+      isSuccessfulScan(admin, r)
+      exists = true
+    rescue org.apache.hadoop.hbase.NotServingRegionException => e
+      $LOG.info("Failed scan of " + e.message)
+    end
+    count = count + 1
+    next unless exists
+    currentServer = getServerNameForRegion(admin, r)
+    if currentServer and currentServer == servername
+      $LOG.info("Region " + r.getRegionNameAsString() + " (" + count.to_s +
+        " of " + regions.length.to_s + ") already on target server=" + servername)
+      next
+    end
+    $LOG.info("Moving region " + r.getEncodedName() + " (" + count.to_s +
+      " of " + regions.length.to_s + ") to server=" + servername);
+    move(admin, r, servername, currentServer)
+  end
+end
+
+# Return the file the region list is saved to / read from:
+# options[:file] when given, else "/tmp/" + targetServer.
+def getFilename(options, targetServer)
+  filename = options[:file]
+  if not filename
+    filename = "/tmp/" + targetServer
+  end
+  return filename
+end
+
+
+# Do command-line parsing
+options = {}
+optparse = OptionParser.new do |opts|
+  opts.banner = "Usage: #{NAME}.rb [options] load|unload <hostname>"
+  opts.separator 'Load or unload regions by moving one at a time'
+  options[:file] = nil
+  opts.on('-f', '--filename=FILE', 'File to save regions list into unloading, or read from loading; default /tmp/<hostname>') do |file|
+    options[:file] = file
+  end
+  opts.on('-h', '--help', 'Display usage information') do
+    puts opts
+    exit
+  end
+  options[:debug] = false
+  opts.on('-d', '--debug', 'Display extra debug logging') do
+    options[:debug] = true
+  end
+end
+optparse.parse!
+
+# Check ARGVs
+if ARGV.length < 2
+  puts optparse
+  exit 1
+end
+hostname = ARGV[1]
+if not hostname
+  puts optparse
+  exit 2
+end
+# Create a logger and save it to ruby global
+$LOG = configureLogging(options)
+case ARGV[0]
+  when 'load'
+    loadRegions(options, hostname)
+  when 'unload'
+    unloadRegions(options, hostname)
+  else
+    puts optparse
+    exit 3
+end
diff --git a/src/docbkx/book.xml b/src/docbkx/book.xml
index fa2b143a083..5ae34e74056 100644
--- a/src/docbkx/book.xml
+++ b/src/docbkx/book.xml
@@ -1076,6 +1076,56 @@ public static byte[][] getHexSplits(String startKey, String endKey, int numRegio
Compression Tool See Compression Tool.
+
Node Decommission + You can have a node gradually shed its load and then shut down using the + graceful_stop.sh script. Here is its usage: + $ ./bin/graceful_stop.sh +Usage: graceful_stop.sh [--config &lt;conf-dir&gt;] [--restart] [--reload] &lt;hostname&gt; + restart If we should restart after graceful stop + reload Move offloaded regions back on to the stopped server + debug Display extra debug logging + hostname Hostname of server we are to stop + + + To decommission a loaded regionserver, run the following: + $ ./bin/graceful_stop.sh HOSTNAME + where HOSTNAME is the host carrying the RegionServer + you would decommission. The script will move the regions off the + decommissioned regionserver one at a time to minimize region churn. + It will verify the region is deployed in the new location before it + moves the next region and so on until the decommissioned server + is carrying zero regions. At this point, the graceful_stop script + tells the RegionServer to stop. The master will at this point notice the + RegionServer is gone but all regions will have already been redeployed + and because the RegionServer went down cleanly, there will be no + WAL logs to split. + Load Balancer + + It is assumed that the Region Load Balancer is disabled while the + graceful_stop script runs (otherwise the balancer + and the decommission script will end up fighting over region deployments). + Use the shell to disable the balancer: + hbase(main):001:0> balance_switch false +true +0 row(s) in 0.3590 seconds +This turns the balancer OFF. To reenable, do: + hbase(main):001:0> balance_switch true +false +0 row(s) in 0.3590 seconds + + + + + You can also ask this script to restart a RegionServer after the shutdown + AND move its old regions back into place. The latter you might do to + retain data locality. 
A primitive rolling restart might be effected by + running something like the following: + $ for i in `cat conf/regionservers|sort`; do ./bin/graceful_stop.sh --restart --reload --debug $i; done &> /tmp/log.txt & + + Tail the output of /tmp/log.txt to follow the script's + progress. + +