From fcfefd81b451035db86a11e7de0438834a4dd471 Mon Sep 17 00:00:00 2001 From: eclark Date: Fri, 25 Oct 2013 20:02:50 +0000 Subject: [PATCH] HBASE-7525 A canary monitoring program specifically for regionserver (takeshi.miao) git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1535846 13f79535-47bb-0310-9956-ffa450edef68 --- .../org/apache/hadoop/hbase/tool/Canary.java | 612 +++++++++++++++--- 1 file changed, 529 insertions(+), 83 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java index 5239e1f8215..ed98a750a51 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java @@ -19,34 +19,45 @@ package org.apache.hadoop.hbase.tool; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + import org.apache.commons.lang.time.StopWatch; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; - -import org.apache.hadoop.hbase.TableName; -import org.apache.hadoop.util.Tool; -import org.apache.hadoop.util.ToolRunner; - import org.apache.hadoop.conf.Configuration; - +import org.apache.hadoop.hbase.HColumnDescriptor; import org.apache.hadoop.hbase.HRegionInfo; import org.apache.hadoop.hbase.HTableDescriptor; -import org.apache.hadoop.hbase.HColumnDescriptor; -import org.apache.hadoop.hbase.HBaseConfiguration; +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.TableNotFoundException; - import org.apache.hadoop.hbase.client.Get; -import org.apache.hadoop.hbase.client.HTable; import org.apache.hadoop.hbase.client.HBaseAdmin; +import org.apache.hadoop.hbase.client.HTable; import org.apache.hadoop.hbase.client.ResultScanner; import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; /** * HBase Canary Tool, that that can be used to do * "canary monitoring" of a running HBase cluster. * - * Foreach region tries to get one row per column family + * Here are two modes + * 1. region mode - Foreach region tries to get one row per column family * and outputs some information about failure or latency. + * + * 2. regionserver mode - Foreach regionserver tries to get one row from one table + * selected randomly and outputs some information about failure or latency. */ public final class Canary implements Tool { // Sink interface used by the canary to outputs information @@ -55,6 +66,12 @@ public final class Canary implements Tool { public void publishReadFailure(HRegionInfo region, HColumnDescriptor column, Exception e); public void publishReadTiming(HRegionInfo region, HColumnDescriptor column, long msTime); } + // new extended sink for output regionserver mode info + // do not change the Sink interface directly due to maintaining the API + public interface ExtendedSink extends Sink { + public void publishReadFailure(String table, String server); + public void publishReadTiming(String table, String server, long msTime); + } // Simple implementation of canary sink that allows to plot on // file or standard output timings or failures. @@ -76,18 +93,40 @@ public final class Canary implements Tool { region.getRegionNameAsString(), column.getNameAsString(), msTime)); } } + // a ExtendedSink implementation + public static class RegionServerStdOutSink extends StdOutSink implements ExtendedSink { + + @Override + public void publishReadFailure(String table, String server) { + LOG.error(String.format("Read from table:%s on region server:%s", + table, server)); + } + + @Override + public void publishReadTiming(String table, String server, long msTime) { + LOG.info(String.format("Read from table:%s on region server:%s in %dms", + table, server, msTime)); + } + } + private static final long DEFAULT_INTERVAL = 6000; + private static final long DEFAULT_TIMEOUT = 600000; // 10 mins + private static final Log LOG = LogFactory.getLog(Canary.class); private Configuration conf = null; - private HBaseAdmin admin = null; private long interval = 0; private Sink sink = null; + private boolean useRegExp; + private long timeout = DEFAULT_TIMEOUT; + private boolean failOnError = true; + private boolean regionServerMode = false; + public Canary() { - this(new StdOutSink()); + this(new RegionServerStdOutSink()); } public Canary(Sink sink) { @@ -106,14 +145,14 @@ public final class Canary implements Tool { @Override public int run(String[] args) throws Exception { - int tables_index = -1; + int index = -1; // Process command line args for (int i = 0; i < args.length; i++) { String cmd = args[i]; if (cmd.startsWith("-")) { - if (tables_index >= 0) { + if (index >= 0) { // command line args must be in the form: [opts] [table 1 [table 2 ...]] System.err.println("Invalid command line options"); printUsageAndExit(); @@ -140,61 +179,261 @@ public final class Canary implements Tool { System.err.println("-interval needs a numeric value argument."); printUsageAndExit(); } + } else if(cmd.equals("-regionserver")) { + this.regionServerMode = true; + } else if (cmd.equals("-e")) { + this.useRegExp = true; + } else if (cmd.equals("-t")) { + i++; + + if (i == args.length) { + System.err.println("-t needs a numeric value argument."); + printUsageAndExit(); + } + + try { + this.timeout = Long.parseLong(args[i]); + } catch (NumberFormatException e) { + System.err.println("-t needs a numeric value argument."); + printUsageAndExit(); + } + + } else if (cmd.equals("-f")) { + i++; + + if (i == args.length) { + System.err + .println("-f needs a boolean value argument (true|false)."); + printUsageAndExit(); + } + + this.failOnError = Boolean.parseBoolean(args[i]); } else { // no options match System.err.println(cmd + " options is invalid."); printUsageAndExit(); } - } else if (tables_index < 0) { + } else if (index < 0) { // keep track of first table name specified by the user - tables_index = i; + index = i; } } - // initialize HBase conf and admin - if (conf == null) conf = HBaseConfiguration.create(); - admin = new HBaseAdmin(conf); - try { - // lets the canary monitor the cluster - do { - if (admin.isAborted()) { - LOG.error("HBaseAdmin aborted"); - return(1); - } + // start to prepare the stuffs + Monitor monitor = null; + Thread monitorThread = null; + long startTime = 0; + long currentTimeLength = 0; - if (tables_index >= 0) { - for (int i = tables_index; i < args.length; i++) { - sniff(admin, sink, TableName.valueOf(args[i])); - } - } else { - sniff(); + do { + // do monitor !! + monitor = this.newMonitor(index, args); + monitorThread = new Thread(monitor); + startTime = System.currentTimeMillis(); + monitorThread.start(); + while (!monitor.isDone()) { + // wait for 1 sec + Thread.sleep(1000); + // exit if any error occurs + if (this.failOnError && monitor.hasError()) { + monitorThread.interrupt(); + System.exit(1); } + currentTimeLength = System.currentTimeMillis() - startTime; + if (currentTimeLength > this.timeout) { + LOG.error("The monitor is running too long (" + currentTimeLength + + ") after timeout limit:" + this.timeout + + " will be killed itself !!"); + monitorThread.interrupt(); + monitor.setError(true); + break; + } + } + + if (this.failOnError && monitor.hasError()) { + monitorThread.interrupt(); + System.exit(1); + } Thread.sleep(interval); - } while (interval > 0); - } finally { - this.admin.close(); - } + } while (interval > 0); - return(0); + return(monitor.hasError()? 1: 0); } private void printUsageAndExit() { - System.err.printf("Usage: bin/hbase %s [opts] [table 1 [table 2...]]%n", getClass().getName()); + System.err.printf( + "Usage: bin/hbase %s [opts] [table1 [table2]...] | [regionserver1 [regionserver2]..]%n", + getClass().getName()); System.err.println(" where [opts] are:"); System.err.println(" -help Show this help and exit."); + System.err.println(" -regionserver replace the table argument to regionserver,"); + System.err.println(" which means to enable regionserver mode"); System.err.println(" -daemon Continuous check at defined intervals."); System.err.println(" -interval Interval between checks (sec)"); + System.err.println(" -e Use region/regionserver as regular expression"); + System.err.println(" which means the region/regionserver is regular expression pattern"); + System.err.println(" -f stop whole program if first error occurs," + + " default is true"); + System.err.println(" -t timeout for a check, default is 600000 (milisecs)"); System.exit(1); } - /* - * canary entry point to monitor all the tables. + /** + * a Factory method for {@link Monitor}. + * Can be overrided by user. + * @param index a start index for monitor target + * @param args args passed from user + * @return a Monitor instance */ - private void sniff() throws Exception { - for (HTableDescriptor table : admin.listTables()) { - sniff(admin, sink, table); + public Monitor newMonitor(int index, String[] args) { + Monitor monitor = null; + String[] monitorTargets = null; + + if(index >= 0) { + int length = args.length - index; + monitorTargets = new String[length]; + System.arraycopy(args, index, monitorTargets, 0, length); } + + if(this.regionServerMode) { + monitor = new RegionServerMonitor(this.conf, monitorTargets, + this.useRegExp, (ExtendedSink)this.sink); + } else { + monitor = new RegionMonitor(this.conf, monitorTargets, this.useRegExp, this.sink); + } + return monitor; + } + + // a Monitor super-class can be extended by users + public static abstract class Monitor implements Runnable { + + protected Configuration config; + protected HBaseAdmin admin; + protected String[] targets; + protected boolean useRegExp; + + protected boolean done = false; + protected boolean error = false; + protected Sink sink; + + public boolean isDone() { + return done; + } + + public boolean hasError() { + return error; + } + + public void setError(boolean error) { + this.error = error; + } + + protected Monitor(Configuration config, String[] monitorTargets, + boolean useRegExp, Sink sink) { + if (null == config) + throw new IllegalArgumentException("config shall not be null"); + + this.config = config; + this.targets = monitorTargets; + this.useRegExp = useRegExp; + this.sink = sink; + } + + public abstract void run(); + + protected boolean initAdmin() { + if (null == this.admin) { + try { + this.admin = new HBaseAdmin(config); + } catch (Exception e) { + LOG.error("Initial HBaseAdmin failed...", e); + this.error = true; + } + } + if (admin.isAborted()) { + LOG.error("HBaseAdmin aborted"); + this.error = true; + } + return !this.error; + } + } + + // a monitor for region mode + private static class RegionMonitor extends Monitor { + + public RegionMonitor(Configuration config, String[] monitorTargets, + boolean useRegExp, Sink sink) { + super(config, monitorTargets, useRegExp, sink); + } + + @Override + public void run() { + if(this.initAdmin()) { + try { + if (this.targets != null && this.targets.length > 0) { + String[] tables = generateMonitorTables(this.targets); + for (String table : tables) { + Canary.sniff(admin, sink, table); + } + } else { + sniff(); + } + } catch (Exception e) { + LOG.error("Run regionMonitor failed", e); + this.error = true; + } + } + this.done = true; + } + + private String[] generateMonitorTables(String[] monitorTargets) throws IOException { + String[] returnTables = null; + + if(this.useRegExp) { + Pattern pattern = null; + HTableDescriptor[] tds = null; + Set tmpTables = new TreeSet(); + try { + for(int a = 0; a < monitorTargets.length; a++) { + pattern = Pattern.compile(monitorTargets[a]); + tds = this.admin.listTables(pattern); + if(tds != null) { + for(HTableDescriptor td : tds) { + tmpTables.add(td.getNameAsString()); + } + } + } + } catch(IOException e) { + LOG.error("Communicate with admin failed", e); + throw e; + } + + if(tmpTables.size() > 0) { + returnTables = tmpTables.toArray(new String[]{}); + } else { + String msg = "No any HTable found, tablePattern:" + + Arrays.toString(monitorTargets); + LOG.error(msg); + this.error = true; + new TableNotFoundException(msg); + } + } else { + returnTables = monitorTargets; + } + + return returnTables; + } + + /* + * canary entry point to monitor all the tables. + */ + private void sniff() throws Exception { + for (HTableDescriptor table : admin.listTables()) { + Canary.sniff(admin, sink, table); + } + } + } /** @@ -203,9 +442,8 @@ public final class Canary implements Tool { * @param tableName * @throws Exception */ - public static void sniff(final HBaseAdmin admin, TableName tableName) - throws Exception { - sniff(admin, new StdOutSink(), tableName); + public static void sniff(final HBaseAdmin admin, TableName tableName) throws Exception { + sniff(admin, new StdOutSink(), tableName.getNameAsString()); } /** @@ -215,79 +453,287 @@ public final class Canary implements Tool { * @param tableName * @throws Exception */ - private static void sniff(final HBaseAdmin admin, final Sink sink, TableName tableName) - throws Exception { + private static void sniff(final HBaseAdmin admin, final Sink sink, String tableName) + throws Exception { if (admin.isTableAvailable(tableName)) { - sniff(admin, sink, admin.getTableDescriptor(tableName)); + sniff(admin, sink, admin.getTableDescriptor(tableName.getBytes())); } else { LOG.warn(String.format("Table %s is not available", tableName)); } } /* - * Loops over regions that owns this table, - * and output some information abouts the state. + * Loops over regions that owns this table, and output some information abouts the state. */ private static void sniff(final HBaseAdmin admin, final Sink sink, HTableDescriptor tableDesc) - throws Exception { + throws Exception { HTable table = null; try { - table = new HTable(admin.getConfiguration(), tableDesc.getTableName()); + table = new HTable(admin.getConfiguration(), tableDesc.getName()); } catch (TableNotFoundException e) { return; } - for (HRegionInfo region : admin.getTableRegions(tableDesc.getTableName())) { + for (HRegionInfo region : admin.getTableRegions(tableDesc.getName())) { try { sniffRegion(admin, sink, region, table); } catch (Exception e) { sink.publishReadFailure(region, e); + LOG.debug("sniffRegion failed", e); } } } /* - * For each column family of the region tries to get one row - * and outputs the latency, or the failure. + * For each column family of the region tries to get one row and outputs the latency, or the + * failure. */ - private static void sniffRegion(final HBaseAdmin admin, final Sink sink, HRegionInfo region, - HTable table) - throws Exception { + private static void + sniffRegion(final HBaseAdmin admin, final Sink sink, HRegionInfo region, HTable table) + throws Exception { HTableDescriptor tableDesc = table.getTableDescriptor(); + byte[] startKey = null; + Get get = null; + Scan scan = null; + ResultScanner rs = null; StopWatch stopWatch = new StopWatch(); for (HColumnDescriptor column : tableDesc.getColumnFamilies()) { stopWatch.reset(); - byte [] startKey = region.getStartKey(); - if (startKey == null || startKey.length <= 0) { - // Can't do a get on empty start row so do a Scan of first element if any instead. - Scan scan = new Scan(); - scan.addFamily(column.getName()); - scan.setBatch(1); - ResultScanner scanner = null; - try { - stopWatch.start(); - scanner = table.getScanner(scan); - scanner.next(); - stopWatch.stop(); - sink.publishReadTiming(region, column, stopWatch.getTime()); - } catch (Exception e) { - sink.publishReadFailure(region, column, e); - } finally { - if (scanner != null) scanner.close(); - } - } else { - Get get = new Get(region.getStartKey()); + startKey = region.getStartKey(); + // Can't do a get on empty start row so do a Scan of first element if any instead. + if (startKey.length > 0) { + get = new Get(startKey); get.addFamily(column.getName()); - try { + } else { + scan = new Scan(); + scan.setCaching(1); + scan.addFamily(column.getName()); + scan.setMaxResultSize(1L); + } + + try { + if (startKey.length > 0) { stopWatch.start(); table.get(get); stopWatch.stop(); sink.publishReadTiming(region, column, stopWatch.getTime()); - } catch (Exception e) { - sink.publishReadFailure(region, column, e); + } else { + stopWatch.start(); + rs = table.getScanner(scan); + stopWatch.stop(); + sink.publishReadTiming(region, column, stopWatch.getTime()); + } + } catch (Exception e) { + sink.publishReadFailure(region, column, e); + } finally { + if (rs != null) { + rs.close(); + } + scan = null; + get = null; + startKey = null; + } + } + } + //a monitor for regionserver mode + private static class RegionServerMonitor extends Monitor { + + public RegionServerMonitor(Configuration config, String[] monitorTargets, + boolean useRegExp, ExtendedSink sink) { + super(config, monitorTargets, useRegExp, sink); + } + + private ExtendedSink getSink() { + return (ExtendedSink) this.sink; + } + + @Override + public void run() { + if (this.initAdmin() && this.checkNoTableNames()) { + Map> rsAndRMap = null; + rsAndRMap = this.filterRegionServerByName(); + this.monitorRegionServers(rsAndRMap); + } + this.done = true; + } + + private boolean checkNoTableNames() { + boolean pass = true; + List foundTableNames = new ArrayList(); + TableName[] tableNames = null; + + if (this.targets == null || this.targets.length == 0) return pass; + + try { + tableNames = this.admin.listTableNames(); + } catch (IOException e) { + LOG.error("Get listTableNames failed", e); + this.error = true; + return false; + } + + for (String target : this.targets) { + for (TableName tableName : tableNames) { + if (target.equals(tableName.getNameAsString())) { + pass = false; + foundTableNames.add(target); + } } } + + if (!pass) { + System.err + .println("Cannot pass a tablename when using the -regionserver option, tablenames:" + + foundTableNames.toString()); + this.error = true; + } + return pass; + } + + private void monitorRegionServers(Map> rsAndRMap) { + String serverName = null; + String tableName = null; + HRegionInfo region = null; + HTable table = null; + Get get = null; + byte[] startKey = null; + Scan scan = null; + StopWatch stopWatch = new StopWatch(); + // monitor one region on every region server + for (Map.Entry> entry : rsAndRMap.entrySet()) { + stopWatch.reset(); + serverName = entry.getKey(); + // always get the first region + region = entry.getValue().get(0); + try { + tableName = region.getTable().getNameAsString(); + table = new HTable(this.admin.getConfiguration(), tableName); + startKey = region.getStartKey(); + // Can't do a get on empty start row so do a Scan of first element if any instead. + if(startKey.length > 0) { + get = new Get(startKey); + stopWatch.start(); + table.get(get); + stopWatch.stop(); + } else { + scan = new Scan(); + scan.setCaching(1); + scan.setMaxResultSize(1L); + stopWatch.start(); + table.getScanner(scan); + stopWatch.stop(); + } + this.getSink().publishReadTiming(tableName, serverName, stopWatch.getTime()); + } catch (IOException e) { + this.getSink().publishReadFailure(tableName, serverName); + LOG.error(e); + this.error = true; + } finally { + if (table != null) { + try { + table.close(); + } catch (IOException e) {/* DO NOTHING */ + } + } + scan = null; + get = null; + startKey = null; + } + } + } + + private Map> filterRegionServerByName() { + Map> regionServerAndRegionsMap = this. + getAllRegionServerByName(); + regionServerAndRegionsMap = this.doFilterRegionServerByName( + regionServerAndRegionsMap); + return regionServerAndRegionsMap; + } + + private Map> getAllRegionServerByName() { + Map> rsAndRMap = new HashMap>(); + HTable table = null; + try { + HTableDescriptor[] tableDescs = this.admin.listTables(); + List regions = null; + for (HTableDescriptor tableDesc : tableDescs) { + table = new HTable(this.admin.getConfiguration(), tableDesc.getName()); + + for (Map.Entry entry : table + .getRegionLocations().entrySet()) { + ServerName rs = entry.getValue(); + String rsName = rs.getHostname(); + HRegionInfo r = entry.getKey(); + + if (rsAndRMap.containsKey(rsName)) { + regions = rsAndRMap.get(rsName); + } else { + regions = new ArrayList(); + rsAndRMap.put(rsName, regions); + } + regions.add(r); + } + table.close(); + } + + } catch (IOException e) { + String msg = "Get HTables info failed"; + LOG.error(msg, e); + this.error = true; + } finally { + if (table != null) { + try { + table.close(); + } catch (IOException e) { + LOG.warn("Close table failed", e); + } + } + } + + return rsAndRMap; + } + + private Map> doFilterRegionServerByName( + Map> fullRsAndRMap) { + + Map> filteredRsAndRMap = null; + + if (this.targets != null && this.targets.length > 0) { + filteredRsAndRMap = new HashMap>(); + Pattern pattern = null; + Matcher matcher = null; + boolean regExpFound = false; + for (String rsName : this.targets) { + if (this.useRegExp) { + regExpFound = false; + pattern = Pattern.compile(rsName); + for (String tmpRsName : fullRsAndRMap.keySet()) { + matcher = pattern.matcher(tmpRsName); + if (matcher.matches()) { + filteredRsAndRMap.put(tmpRsName, fullRsAndRMap.get(tmpRsName)); + regExpFound = true; + } + } + if (!regExpFound) { + LOG.error("No any RegionServerInfo found, regionServerPattern:" + + rsName); + this.error = true; + } + } else { + if (fullRsAndRMap.containsKey(rsName)) { + filteredRsAndRMap.put(rsName, fullRsAndRMap.get(rsName)); + } else { + LOG.error("No any RegionServerInfo found, regionServerName:" + + rsName); + this.error = true; + } + } + } + } else { + filteredRsAndRMap = fullRsAndRMap; + } + return filteredRsAndRMap; } }