HBASE-7525 A canary monitoring program specifically for regionserver (takeshi.miao)

git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1535846 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
eclark 2013-10-25 20:02:50 +00:00
parent 4205ba6777
commit fcfefd81b4
1 changed files with 529 additions and 83 deletions

View File

@ -19,34 +19,45 @@
package org.apache.hadoop.hbase.tool; package org.apache.hadoop.hbase.tool;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.time.StopWatch; import org.apache.commons.lang.time.StopWatch;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HRegionInfo; import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HTableDescriptor; import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.HColumnDescriptor; import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.HBaseConfiguration; import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.TableNotFoundException; import org.apache.hadoop.hbase.TableNotFoundException;
import org.apache.hadoop.hbase.client.Get; import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.HBaseAdmin; import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.ResultScanner; import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan; import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/** /**
* HBase Canary Tool, that that can be used to do * HBase Canary Tool, that that can be used to do
* "canary monitoring" of a running HBase cluster. * "canary monitoring" of a running HBase cluster.
* *
* Foreach region tries to get one row per column family * Here are two modes
* 1. region mode - Foreach region tries to get one row per column family
* and outputs some information about failure or latency. * and outputs some information about failure or latency.
*
* 2. regionserver mode - Foreach regionserver tries to get one row from one table
* selected randomly and outputs some information about failure or latency.
*/ */
public final class Canary implements Tool { public final class Canary implements Tool {
// Sink interface used by the canary to outputs information // Sink interface used by the canary to outputs information
@ -55,6 +66,12 @@ public final class Canary implements Tool {
public void publishReadFailure(HRegionInfo region, HColumnDescriptor column, Exception e); public void publishReadFailure(HRegionInfo region, HColumnDescriptor column, Exception e);
public void publishReadTiming(HRegionInfo region, HColumnDescriptor column, long msTime); public void publishReadTiming(HRegionInfo region, HColumnDescriptor column, long msTime);
} }
// new extended sink for output regionserver mode info
// do not change the Sink interface directly due to maintaining the API
public interface ExtendedSink extends Sink {
public void publishReadFailure(String table, String server);
public void publishReadTiming(String table, String server, long msTime);
}
// Simple implementation of canary sink that allows to plot on // Simple implementation of canary sink that allows to plot on
// file or standard output timings or failures. // file or standard output timings or failures.
@ -76,18 +93,40 @@ public final class Canary implements Tool {
region.getRegionNameAsString(), column.getNameAsString(), msTime)); region.getRegionNameAsString(), column.getNameAsString(), msTime));
} }
} }
// a ExtendedSink implementation
public static class RegionServerStdOutSink extends StdOutSink implements ExtendedSink {
@Override
public void publishReadFailure(String table, String server) {
LOG.error(String.format("Read from table:%s on region server:%s",
table, server));
}
@Override
public void publishReadTiming(String table, String server, long msTime) {
LOG.info(String.format("Read from table:%s on region server:%s in %dms",
table, server, msTime));
}
}
private static final long DEFAULT_INTERVAL = 6000; private static final long DEFAULT_INTERVAL = 6000;
private static final long DEFAULT_TIMEOUT = 600000; // 10 mins
private static final Log LOG = LogFactory.getLog(Canary.class); private static final Log LOG = LogFactory.getLog(Canary.class);
private Configuration conf = null; private Configuration conf = null;
private HBaseAdmin admin = null;
private long interval = 0; private long interval = 0;
private Sink sink = null; private Sink sink = null;
private boolean useRegExp;
private long timeout = DEFAULT_TIMEOUT;
private boolean failOnError = true;
private boolean regionServerMode = false;
public Canary() { public Canary() {
this(new StdOutSink()); this(new RegionServerStdOutSink());
} }
public Canary(Sink sink) { public Canary(Sink sink) {
@ -106,14 +145,14 @@ public final class Canary implements Tool {
@Override @Override
public int run(String[] args) throws Exception { public int run(String[] args) throws Exception {
int tables_index = -1; int index = -1;
// Process command line args // Process command line args
for (int i = 0; i < args.length; i++) { for (int i = 0; i < args.length; i++) {
String cmd = args[i]; String cmd = args[i];
if (cmd.startsWith("-")) { if (cmd.startsWith("-")) {
if (tables_index >= 0) { if (index >= 0) {
// command line args must be in the form: [opts] [table 1 [table 2 ...]] // command line args must be in the form: [opts] [table 1 [table 2 ...]]
System.err.println("Invalid command line options"); System.err.println("Invalid command line options");
printUsageAndExit(); printUsageAndExit();
@ -140,72 +179,271 @@ public final class Canary implements Tool {
System.err.println("-interval needs a numeric value argument."); System.err.println("-interval needs a numeric value argument.");
printUsageAndExit(); printUsageAndExit();
} }
} else if(cmd.equals("-regionserver")) {
this.regionServerMode = true;
} else if (cmd.equals("-e")) {
this.useRegExp = true;
} else if (cmd.equals("-t")) {
i++;
if (i == args.length) {
System.err.println("-t needs a numeric value argument.");
printUsageAndExit();
}
try {
this.timeout = Long.parseLong(args[i]);
} catch (NumberFormatException e) {
System.err.println("-t needs a numeric value argument.");
printUsageAndExit();
}
} else if (cmd.equals("-f")) {
i++;
if (i == args.length) {
System.err
.println("-f needs a boolean value argument (true|false).");
printUsageAndExit();
}
this.failOnError = Boolean.parseBoolean(args[i]);
} else { } else {
// no options match // no options match
System.err.println(cmd + " options is invalid."); System.err.println(cmd + " options is invalid.");
printUsageAndExit(); printUsageAndExit();
} }
} else if (tables_index < 0) { } else if (index < 0) {
// keep track of first table name specified by the user // keep track of first table name specified by the user
tables_index = i; index = i;
} }
} }
// initialize HBase conf and admin // start to prepare the stuffs
if (conf == null) conf = HBaseConfiguration.create(); Monitor monitor = null;
admin = new HBaseAdmin(conf); Thread monitorThread = null;
try { long startTime = 0;
// lets the canary monitor the cluster long currentTimeLength = 0;
do { do {
if (admin.isAborted()) { // do monitor !!
LOG.error("HBaseAdmin aborted"); monitor = this.newMonitor(index, args);
return(1); monitorThread = new Thread(monitor);
startTime = System.currentTimeMillis();
monitorThread.start();
while (!monitor.isDone()) {
// wait for 1 sec
Thread.sleep(1000);
// exit if any error occurs
if (this.failOnError && monitor.hasError()) {
monitorThread.interrupt();
System.exit(1);
}
currentTimeLength = System.currentTimeMillis() - startTime;
if (currentTimeLength > this.timeout) {
LOG.error("The monitor is running too long (" + currentTimeLength
+ ") after timeout limit:" + this.timeout
+ " will be killed itself !!");
monitorThread.interrupt();
monitor.setError(true);
break;
}
} }
if (tables_index >= 0) { if (this.failOnError && monitor.hasError()) {
for (int i = tables_index; i < args.length; i++) { monitorThread.interrupt();
sniff(admin, sink, TableName.valueOf(args[i])); System.exit(1);
}
} else {
sniff();
} }
Thread.sleep(interval); Thread.sleep(interval);
} while (interval > 0); } while (interval > 0);
} finally {
this.admin.close();
}
return(0); return(monitor.hasError()? 1: 0);
} }
private void printUsageAndExit() { private void printUsageAndExit() {
System.err.printf("Usage: bin/hbase %s [opts] [table 1 [table 2...]]%n", getClass().getName()); System.err.printf(
"Usage: bin/hbase %s [opts] [table1 [table2]...] | [regionserver1 [regionserver2]..]%n",
getClass().getName());
System.err.println(" where [opts] are:"); System.err.println(" where [opts] are:");
System.err.println(" -help Show this help and exit."); System.err.println(" -help Show this help and exit.");
System.err.println(" -regionserver replace the table argument to regionserver,");
System.err.println(" which means to enable regionserver mode");
System.err.println(" -daemon Continuous check at defined intervals."); System.err.println(" -daemon Continuous check at defined intervals.");
System.err.println(" -interval <N> Interval between checks (sec)"); System.err.println(" -interval <N> Interval between checks (sec)");
System.err.println(" -e Use region/regionserver as regular expression");
System.err.println(" which means the region/regionserver is regular expression pattern");
System.err.println(" -f <B> stop whole program if first error occurs," +
" default is true");
System.err.println(" -t <N> timeout for a check, default is 600000 (milisecs)");
System.exit(1); System.exit(1);
} }
/**
* a Factory method for {@link Monitor}.
* Can be overrided by user.
* @param index a start index for monitor target
* @param args args passed from user
* @return a Monitor instance
*/
public Monitor newMonitor(int index, String[] args) {
Monitor monitor = null;
String[] monitorTargets = null;
if(index >= 0) {
int length = args.length - index;
monitorTargets = new String[length];
System.arraycopy(args, index, monitorTargets, 0, length);
}
if(this.regionServerMode) {
monitor = new RegionServerMonitor(this.conf, monitorTargets,
this.useRegExp, (ExtendedSink)this.sink);
} else {
monitor = new RegionMonitor(this.conf, monitorTargets, this.useRegExp, this.sink);
}
return monitor;
}
// a Monitor super-class can be extended by users
public static abstract class Monitor implements Runnable {
protected Configuration config;
protected HBaseAdmin admin;
protected String[] targets;
protected boolean useRegExp;
protected boolean done = false;
protected boolean error = false;
protected Sink sink;
public boolean isDone() {
return done;
}
public boolean hasError() {
return error;
}
public void setError(boolean error) {
this.error = error;
}
protected Monitor(Configuration config, String[] monitorTargets,
boolean useRegExp, Sink sink) {
if (null == config)
throw new IllegalArgumentException("config shall not be null");
this.config = config;
this.targets = monitorTargets;
this.useRegExp = useRegExp;
this.sink = sink;
}
public abstract void run();
protected boolean initAdmin() {
if (null == this.admin) {
try {
this.admin = new HBaseAdmin(config);
} catch (Exception e) {
LOG.error("Initial HBaseAdmin failed...", e);
this.error = true;
}
}
if (admin.isAborted()) {
LOG.error("HBaseAdmin aborted");
this.error = true;
}
return !this.error;
}
}
// a monitor for region mode
private static class RegionMonitor extends Monitor {
public RegionMonitor(Configuration config, String[] monitorTargets,
boolean useRegExp, Sink sink) {
super(config, monitorTargets, useRegExp, sink);
}
@Override
public void run() {
if(this.initAdmin()) {
try {
if (this.targets != null && this.targets.length > 0) {
String[] tables = generateMonitorTables(this.targets);
for (String table : tables) {
Canary.sniff(admin, sink, table);
}
} else {
sniff();
}
} catch (Exception e) {
LOG.error("Run regionMonitor failed", e);
this.error = true;
}
}
this.done = true;
}
private String[] generateMonitorTables(String[] monitorTargets) throws IOException {
String[] returnTables = null;
if(this.useRegExp) {
Pattern pattern = null;
HTableDescriptor[] tds = null;
Set<String> tmpTables = new TreeSet<String>();
try {
for(int a = 0; a < monitorTargets.length; a++) {
pattern = Pattern.compile(monitorTargets[a]);
tds = this.admin.listTables(pattern);
if(tds != null) {
for(HTableDescriptor td : tds) {
tmpTables.add(td.getNameAsString());
}
}
}
} catch(IOException e) {
LOG.error("Communicate with admin failed", e);
throw e;
}
if(tmpTables.size() > 0) {
returnTables = tmpTables.toArray(new String[]{});
} else {
String msg = "No any HTable found, tablePattern:"
+ Arrays.toString(monitorTargets);
LOG.error(msg);
this.error = true;
new TableNotFoundException(msg);
}
} else {
returnTables = monitorTargets;
}
return returnTables;
}
/* /*
* canary entry point to monitor all the tables. * canary entry point to monitor all the tables.
*/ */
private void sniff() throws Exception { private void sniff() throws Exception {
for (HTableDescriptor table : admin.listTables()) { for (HTableDescriptor table : admin.listTables()) {
sniff(admin, sink, table); Canary.sniff(admin, sink, table);
} }
} }
}
/** /**
* Canary entry point for specified table. * Canary entry point for specified table.
* @param admin * @param admin
* @param tableName * @param tableName
* @throws Exception * @throws Exception
*/ */
public static void sniff(final HBaseAdmin admin, TableName tableName) public static void sniff(final HBaseAdmin admin, TableName tableName) throws Exception {
throws Exception { sniff(admin, new StdOutSink(), tableName.getNameAsString());
sniff(admin, new StdOutSink(), tableName);
} }
/** /**
@ -215,80 +453,288 @@ public final class Canary implements Tool {
* @param tableName * @param tableName
* @throws Exception * @throws Exception
*/ */
private static void sniff(final HBaseAdmin admin, final Sink sink, TableName tableName) private static void sniff(final HBaseAdmin admin, final Sink sink, String tableName)
throws Exception { throws Exception {
if (admin.isTableAvailable(tableName)) { if (admin.isTableAvailable(tableName)) {
sniff(admin, sink, admin.getTableDescriptor(tableName)); sniff(admin, sink, admin.getTableDescriptor(tableName.getBytes()));
} else { } else {
LOG.warn(String.format("Table %s is not available", tableName)); LOG.warn(String.format("Table %s is not available", tableName));
} }
} }
/* /*
* Loops over regions that owns this table, * Loops over regions that owns this table, and output some information abouts the state.
* and output some information abouts the state.
*/ */
private static void sniff(final HBaseAdmin admin, final Sink sink, HTableDescriptor tableDesc) private static void sniff(final HBaseAdmin admin, final Sink sink, HTableDescriptor tableDesc)
throws Exception { throws Exception {
HTable table = null; HTable table = null;
try { try {
table = new HTable(admin.getConfiguration(), tableDesc.getTableName()); table = new HTable(admin.getConfiguration(), tableDesc.getName());
} catch (TableNotFoundException e) { } catch (TableNotFoundException e) {
return; return;
} }
for (HRegionInfo region : admin.getTableRegions(tableDesc.getTableName())) { for (HRegionInfo region : admin.getTableRegions(tableDesc.getName())) {
try { try {
sniffRegion(admin, sink, region, table); sniffRegion(admin, sink, region, table);
} catch (Exception e) { } catch (Exception e) {
sink.publishReadFailure(region, e); sink.publishReadFailure(region, e);
LOG.debug("sniffRegion failed", e);
} }
} }
} }
/* /*
* For each column family of the region tries to get one row * For each column family of the region tries to get one row and outputs the latency, or the
* and outputs the latency, or the failure. * failure.
*/ */
private static void sniffRegion(final HBaseAdmin admin, final Sink sink, HRegionInfo region, private static void
HTable table) sniffRegion(final HBaseAdmin admin, final Sink sink, HRegionInfo region, HTable table)
throws Exception { throws Exception {
HTableDescriptor tableDesc = table.getTableDescriptor(); HTableDescriptor tableDesc = table.getTableDescriptor();
byte[] startKey = null;
Get get = null;
Scan scan = null;
ResultScanner rs = null;
StopWatch stopWatch = new StopWatch(); StopWatch stopWatch = new StopWatch();
for (HColumnDescriptor column : tableDesc.getColumnFamilies()) { for (HColumnDescriptor column : tableDesc.getColumnFamilies()) {
stopWatch.reset(); stopWatch.reset();
byte [] startKey = region.getStartKey(); startKey = region.getStartKey();
if (startKey == null || startKey.length <= 0) {
// Can't do a get on empty start row so do a Scan of first element if any instead. // Can't do a get on empty start row so do a Scan of first element if any instead.
Scan scan = new Scan(); if (startKey.length > 0) {
scan.addFamily(column.getName()); get = new Get(startKey);
scan.setBatch(1);
ResultScanner scanner = null;
try {
stopWatch.start();
scanner = table.getScanner(scan);
scanner.next();
stopWatch.stop();
sink.publishReadTiming(region, column, stopWatch.getTime());
} catch (Exception e) {
sink.publishReadFailure(region, column, e);
} finally {
if (scanner != null) scanner.close();
}
} else {
Get get = new Get(region.getStartKey());
get.addFamily(column.getName()); get.addFamily(column.getName());
} else {
scan = new Scan();
scan.setCaching(1);
scan.addFamily(column.getName());
scan.setMaxResultSize(1L);
}
try { try {
if (startKey.length > 0) {
stopWatch.start(); stopWatch.start();
table.get(get); table.get(get);
stopWatch.stop(); stopWatch.stop();
sink.publishReadTiming(region, column, stopWatch.getTime()); sink.publishReadTiming(region, column, stopWatch.getTime());
} else {
stopWatch.start();
rs = table.getScanner(scan);
stopWatch.stop();
sink.publishReadTiming(region, column, stopWatch.getTime());
}
} catch (Exception e) { } catch (Exception e) {
sink.publishReadFailure(region, column, e); sink.publishReadFailure(region, column, e);
} finally {
if (rs != null) {
rs.close();
}
scan = null;
get = null;
startKey = null;
} }
} }
} }
//a monitor for regionserver mode
private static class RegionServerMonitor extends Monitor {
public RegionServerMonitor(Configuration config, String[] monitorTargets,
boolean useRegExp, ExtendedSink sink) {
super(config, monitorTargets, useRegExp, sink);
}
private ExtendedSink getSink() {
return (ExtendedSink) this.sink;
}
@Override
public void run() {
if (this.initAdmin() && this.checkNoTableNames()) {
Map<String, List<HRegionInfo>> rsAndRMap = null;
rsAndRMap = this.filterRegionServerByName();
this.monitorRegionServers(rsAndRMap);
}
this.done = true;
}
private boolean checkNoTableNames() {
boolean pass = true;
List<String> foundTableNames = new ArrayList<String>();
TableName[] tableNames = null;
if (this.targets == null || this.targets.length == 0) return pass;
try {
tableNames = this.admin.listTableNames();
} catch (IOException e) {
LOG.error("Get listTableNames failed", e);
this.error = true;
return false;
}
for (String target : this.targets) {
for (TableName tableName : tableNames) {
if (target.equals(tableName.getNameAsString())) {
pass = false;
foundTableNames.add(target);
}
}
}
if (!pass) {
System.err
.println("Cannot pass a tablename when using the -regionserver option, tablenames:"
+ foundTableNames.toString());
this.error = true;
}
return pass;
}
private void monitorRegionServers(Map<String, List<HRegionInfo>> rsAndRMap) {
String serverName = null;
String tableName = null;
HRegionInfo region = null;
HTable table = null;
Get get = null;
byte[] startKey = null;
Scan scan = null;
StopWatch stopWatch = new StopWatch();
// monitor one region on every region server
for (Map.Entry<String, List<HRegionInfo>> entry : rsAndRMap.entrySet()) {
stopWatch.reset();
serverName = entry.getKey();
// always get the first region
region = entry.getValue().get(0);
try {
tableName = region.getTable().getNameAsString();
table = new HTable(this.admin.getConfiguration(), tableName);
startKey = region.getStartKey();
// Can't do a get on empty start row so do a Scan of first element if any instead.
if(startKey.length > 0) {
get = new Get(startKey);
stopWatch.start();
table.get(get);
stopWatch.stop();
} else {
scan = new Scan();
scan.setCaching(1);
scan.setMaxResultSize(1L);
stopWatch.start();
table.getScanner(scan);
stopWatch.stop();
}
this.getSink().publishReadTiming(tableName, serverName, stopWatch.getTime());
} catch (IOException e) {
this.getSink().publishReadFailure(tableName, serverName);
LOG.error(e);
this.error = true;
} finally {
if (table != null) {
try {
table.close();
} catch (IOException e) {/* DO NOTHING */
}
}
scan = null;
get = null;
startKey = null;
}
}
}
private Map<String, List<HRegionInfo>> filterRegionServerByName() {
Map<String, List<HRegionInfo>> regionServerAndRegionsMap = this.
getAllRegionServerByName();
regionServerAndRegionsMap = this.doFilterRegionServerByName(
regionServerAndRegionsMap);
return regionServerAndRegionsMap;
}
private Map<String, List<HRegionInfo>> getAllRegionServerByName() {
Map<String, List<HRegionInfo>> rsAndRMap = new HashMap<String, List<HRegionInfo>>();
HTable table = null;
try {
HTableDescriptor[] tableDescs = this.admin.listTables();
List<HRegionInfo> regions = null;
for (HTableDescriptor tableDesc : tableDescs) {
table = new HTable(this.admin.getConfiguration(), tableDesc.getName());
for (Map.Entry<HRegionInfo, ServerName> entry : table
.getRegionLocations().entrySet()) {
ServerName rs = entry.getValue();
String rsName = rs.getHostname();
HRegionInfo r = entry.getKey();
if (rsAndRMap.containsKey(rsName)) {
regions = rsAndRMap.get(rsName);
} else {
regions = new ArrayList<HRegionInfo>();
rsAndRMap.put(rsName, regions);
}
regions.add(r);
}
table.close();
}
} catch (IOException e) {
String msg = "Get HTables info failed";
LOG.error(msg, e);
this.error = true;
} finally {
if (table != null) {
try {
table.close();
} catch (IOException e) {
LOG.warn("Close table failed", e);
}
}
}
return rsAndRMap;
}
private Map<String, List<HRegionInfo>> doFilterRegionServerByName(
Map<String, List<HRegionInfo>> fullRsAndRMap) {
Map<String, List<HRegionInfo>> filteredRsAndRMap = null;
if (this.targets != null && this.targets.length > 0) {
filteredRsAndRMap = new HashMap<String, List<HRegionInfo>>();
Pattern pattern = null;
Matcher matcher = null;
boolean regExpFound = false;
for (String rsName : this.targets) {
if (this.useRegExp) {
regExpFound = false;
pattern = Pattern.compile(rsName);
for (String tmpRsName : fullRsAndRMap.keySet()) {
matcher = pattern.matcher(tmpRsName);
if (matcher.matches()) {
filteredRsAndRMap.put(tmpRsName, fullRsAndRMap.get(tmpRsName));
regExpFound = true;
}
}
if (!regExpFound) {
LOG.error("No any RegionServerInfo found, regionServerPattern:"
+ rsName);
this.error = true;
}
} else {
if (fullRsAndRMap.containsKey(rsName)) {
filteredRsAndRMap.put(rsName, fullRsAndRMap.get(rsName));
} else {
LOG.error("No any RegionServerInfo found, regionServerName:"
+ rsName);
this.error = true;
}
}
}
} else {
filteredRsAndRMap = fullRsAndRMap;
}
return filteredRsAndRMap;
}
} }
public static void main(String[] args) throws Exception { public static void main(String[] args) throws Exception {