diff --git a/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java b/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java index 284e925f816..42288c56ec3 100644 --- a/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java +++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java @@ -1362,6 +1362,7 @@ public final class HConstants { "hbase.regionserver.region.split.threads.max"; /** Canary config keys */ + // TODO: Move these defines to Canary Class public static final String HBASE_CANARY_WRITE_DATA_TTL_KEY = "hbase.canary.write.data.ttl"; public static final String HBASE_CANARY_WRITE_PERSERVER_REGIONS_LOWERLIMIT_KEY = diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java index 7a549fce22c..40f4aa6654f 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java @@ -90,6 +90,7 @@ import org.apache.hadoop.hbase.zookeeper.ZKConfig; import org.apache.hadoop.util.GenericOptionsParser; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; +import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting; import org.apache.yetus.audience.InterfaceAudience; import org.apache.zookeeper.KeeperException; import org.apache.zookeeper.ZooKeeper; @@ -101,39 +102,45 @@ import org.slf4j.LoggerFactory; import org.apache.hbase.thirdparty.com.google.common.collect.Lists; /** - * HBase Canary Tool, that that can be used to do - * "canary monitoring" of a running HBase cluster. + * HBase Canary Tool for "canary monitoring" of a running HBase cluster. * - * Here are three modes - * 1. region mode - Foreach region tries to get one row per column family - * and outputs some information about failure or latency. + * There are three modes: + *
<ol> + * <li>region mode (Default): For each region, try to get one row per column family outputting + * information on failure (ERROR) or else the latency. + * </li>
 * - * 2. regionserver mode - Foreach regionserver tries to get one row from one table - * selected randomly and outputs some information about failure or latency. + * <li>regionserver mode: For each regionserver try to get one row from one table selected + * randomly outputting information on failure (ERROR) or else the latency. + * </li>
 * - * 3. zookeeper mode - for each zookeeper instance, selects a zNode and - * outputs some information about failure or latency. + * <li>zookeeper mode: for each zookeeper instance, selects a znode outputting information on + * failure (ERROR) or else the latency. + * </li>
 + * </ol>
*/ @InterfaceAudience.Private public final class Canary implements Tool { - // Sink interface used by the canary to outputs information + /** + * Sink interface used by the canary to output information + */ public interface Sink { - public long getReadFailureCount(); - public long incReadFailureCount(); - public Map getReadFailures(); - public void updateReadFailures(String regionName, String serverName); - public long getWriteFailureCount(); - public long incWriteFailureCount(); - public Map getWriteFailures(); - public void updateWriteFailures(String regionName, String serverName); + long getReadFailureCount(); + long incReadFailureCount(); + Map getReadFailures(); + void updateReadFailures(String regionName, String serverName); + long getWriteFailureCount(); + long incWriteFailureCount(); + Map getWriteFailures(); + void updateWriteFailures(String regionName, String serverName); } - // Simple implementation of canary sink that allows to plot on - // file or standard output timings or failures. + /** + * Simple implementation of canary sink that allows plotting to a file or standard output. + */ public static class StdOutSink implements Sink { private AtomicLong readFailureCount = new AtomicLong(0), writeFailureCount = new AtomicLong(0); - private Map readFailures = new ConcurrentHashMap<>(); private Map writeFailures = new ConcurrentHashMap<>(); @@ -178,67 +185,75 @@ public final class Canary implements Tool { } } + /** + * By RegionServer, for 'regionserver' mode. 
+ */ public static class RegionServerStdOutSink extends StdOutSink { - public void publishReadFailure(String table, String server) { incReadFailureCount(); - LOG.error(String.format("Read from table:%s on region server:%s", table, server)); + LOG.error("Read from {} on {}", table, server); } public void publishReadTiming(String table, String server, long msTime) { - LOG.info(String.format("Read from table:%s on region server:%s in %dms", - table, server, msTime)); + LOG.info("Read from {} on {} in {}ms", table, server, msTime); } } + /** + * Output for 'zookeeper' mode. + */ public static class ZookeeperStdOutSink extends StdOutSink { - - public void publishReadFailure(String zNode, String server) { + public void publishReadFailure(String znode, String server) { incReadFailureCount(); - LOG.error(String.format("Read from zNode:%s on zookeeper instance:%s", zNode, server)); + LOG.error("Read from {} on {}", znode, server); } public void publishReadTiming(String znode, String server, long msTime) { - LOG.info(String.format("Read from zNode:%s on zookeeper instance:%s in %dms", - znode, server, msTime)); + LOG.info("Read from {} on {} in {}ms", znode, server, msTime); } } + /** + * By Region, for 'region' mode. 
+ */ public static class RegionStdOutSink extends StdOutSink { - private Map perTableReadLatency = new HashMap<>(); private LongAdder writeLatency = new LongAdder(); public void publishReadFailure(ServerName serverName, RegionInfo region, Exception e) { incReadFailureCount(); - LOG.error(String.format("read from region %s on regionserver %s failed", region.getRegionNameAsString(), serverName), e); + LOG.error("Read from {} on {} failed", region.getRegionNameAsString(), serverName, e); } - public void publishReadFailure(ServerName serverName, RegionInfo region, ColumnFamilyDescriptor column, Exception e) { + public void publishReadFailure(ServerName serverName, RegionInfo region, + ColumnFamilyDescriptor column, Exception e) { incReadFailureCount(); - LOG.error(String.format("read from region %s on regionserver %s column family %s failed", - region.getRegionNameAsString(), serverName, column.getNameAsString()), e); + LOG.error("Read from {} on {} {} failed", region.getRegionNameAsString(), serverName, + column.getNameAsString(), e); } - public void publishReadTiming(ServerName serverName, RegionInfo region, ColumnFamilyDescriptor column, long msTime) { - LOG.info(String.format("read from region %s on regionserver %s column family %s in %dms", - region.getRegionNameAsString(), serverName, column.getNameAsString(), msTime)); + public void publishReadTiming(ServerName serverName, RegionInfo region, + ColumnFamilyDescriptor column, long msTime) { + LOG.info("Read from {} on {} {} in {}ms", region.getRegionNameAsString(), serverName, + column.getNameAsString(), msTime); } public void publishWriteFailure(ServerName serverName, RegionInfo region, Exception e) { incWriteFailureCount(); - LOG.error(String.format("write to region %s on regionserver %s failed", region.getRegionNameAsString(), serverName), e); + LOG.error("Write to {} on {} failed", region.getRegionNameAsString(), serverName, e); } - public void publishWriteFailure(ServerName serverName, RegionInfo region, 
ColumnFamilyDescriptor column, Exception e) { + public void publishWriteFailure(ServerName serverName, RegionInfo region, + ColumnFamilyDescriptor column, Exception e) { incWriteFailureCount(); - LOG.error(String.format("write to region %s on regionserver %s column family %s failed", - region.getRegionNameAsString(), serverName, column.getNameAsString()), e); + LOG.error("Write to {} on {} {} failed", region.getRegionNameAsString(), serverName, + column.getNameAsString(), e); } - public void publishWriteTiming(ServerName serverName, RegionInfo region, ColumnFamilyDescriptor column, long msTime) { - LOG.info(String.format("write to region %s on regionserver %s column family %s in %dms", - region.getRegionNameAsString(), serverName, column.getNameAsString(), msTime)); + public void publishWriteTiming(ServerName serverName, RegionInfo region, + ColumnFamilyDescriptor column, long msTime) { + LOG.info("Write to {} on {} {} in {}ms", + region.getRegionNameAsString(), serverName, column.getNameAsString(), msTime); } public Map getReadLatencyMap() { @@ -260,6 +275,9 @@ public final class Canary implements Tool { } } + /** + * Run a single zookeeper Task and then exit. + */ static class ZookeeperTask implements Callable { private final Connection connection; private final String host; @@ -298,8 +316,8 @@ public final class Canary implements Tool { } /** - * For each column family of the region tries to get one row and outputs the latency, or the - * failure. + * Run a single Region Task and then exit. For each column family of the Region, get one row and + * output latency or failure. 
*/ static class RegionTask implements Callable { public enum TaskType{ @@ -313,8 +331,8 @@ public final class Canary implements Tool { private ServerName serverName; private LongAdder readWriteLatency; - RegionTask(Connection connection, RegionInfo region, ServerName serverName, RegionStdOutSink sink, - TaskType taskType, boolean rawScanEnabled, LongAdder rwLatency) { + RegionTask(Connection connection, RegionInfo region, ServerName serverName, + RegionStdOutSink sink, TaskType taskType, boolean rawScanEnabled, LongAdder rwLatency) { this.connection = connection; this.region = region; this.serverName = serverName; @@ -340,14 +358,11 @@ public final class Canary implements Tool { Table table = null; TableDescriptor tableDesc = null; try { - if (LOG.isDebugEnabled()) { - LOG.debug(String.format("reading table descriptor for table %s", - region.getTable())); - } + LOG.debug("Reading table descriptor for table {}", region.getTable()); table = connection.getTable(region.getTable()); tableDesc = table.getDescriptor(); } catch (IOException e) { - LOG.debug("sniffRegion failed", e); + LOG.debug("sniffRegion {} of {} failed", region.getEncodedName(), region.getTable(), e); sink.publishReadFailure(serverName, region, e); if (table != null) { try { @@ -375,10 +390,7 @@ public final class Canary implements Tool { get.addFamily(column.getName()); } else { scan = new Scan(); - if (LOG.isDebugEnabled()) { - LOG.debug(String.format("rawScan : %s for table: %s", rawScanEnabled, - tableDesc.getTableName())); - } + LOG.debug("rawScan {} for {}", rawScanEnabled, tableDesc.getTableName()); scan.setRaw(rawScanEnabled); scan.setCaching(1); scan.setCacheBlocks(false); @@ -387,12 +399,9 @@ public final class Canary implements Tool { scan.setMaxResultSize(1L); scan.setOneRowLimit(); } - - if (LOG.isDebugEnabled()) { - LOG.debug(String.format("reading from table %s region %s column family %s and key %s", - tableDesc.getTableName(), region.getRegionNameAsString(), column.getNameAsString(), 
Bytes.toStringBinary(startKey))); - } + LOG.debug("Reading from {} {} {} {}", tableDesc.getTableName(), + region.getRegionNameAsString(), column.getNameAsString(), + Bytes.toStringBinary(startKey)); try { stopWatch.start(); if (startKey.length > 0) { @@ -425,7 +434,6 @@ public final class Canary implements Tool { /** * Check writes for the canary table - * @return */ private Void write() { Table table = null; @@ -445,11 +453,9 @@ public final class Canary implements Tool { Bytes.random(value); put.addColumn(column.getName(), HConstants.EMPTY_BYTE_ARRAY, value); - if (LOG.isDebugEnabled()) { - LOG.debug(String.format("writing to table %s region %s column family %s and key %s", - tableDesc.getTableName(), region.getRegionNameAsString(), column.getNameAsString(), - Bytes.toStringBinary(rowToCheck))); - } + LOG.debug("Writing to {} {} {} {}", + tableDesc.getTableName(), region.getRegionNameAsString(), column.getNameAsString(), + Bytes.toStringBinary(rowToCheck)); try { long startTime = System.currentTimeMillis(); table.put(put); @@ -470,7 +476,8 @@ public final class Canary implements Tool { } /** - * Get one row from a region on the regionserver and outputs the latency, or the failure. + * Run a single RegionServer Task and then exit. + * Get one row from a region on the regionserver and output latency or the failure. */ static class RegionServerTask implements Callable { private Connection connection; @@ -503,11 +510,9 @@ public final class Canary implements Tool { table = connection.getTable(tableName); startKey = region.getStartKey(); // Can't do a get on empty start row so do a Scan of first element if any instead. 
- if (LOG.isDebugEnabled()) { - LOG.debug(String.format("reading from region server %s table %s region %s and key %s", - serverName, region.getTable(), region.getRegionNameAsString(), - Bytes.toStringBinary(startKey))); - } + LOG.debug("Reading from {} {} {} {}", + serverName, region.getTable(), region.getRegionNameAsString(), + Bytes.toStringBinary(startKey)); if (startKey.length > 0) { get = new Get(startKey); get.setCacheBlocks(false); @@ -584,23 +589,43 @@ public final class Canary implements Tool { private boolean useRegExp; private long timeout = DEFAULT_TIMEOUT; private boolean failOnError = true; + + /** + * True if we are to run in 'regionServer' mode. + */ private boolean regionServerMode = false; + + /** + * True if we are to run in zookeeper 'mode'. + */ private boolean zookeeperMode = false; + private long permittedFailures = 0; private boolean regionServerAllRegions = false; private boolean writeSniffing = false; private long configuredWriteTableTimeout = DEFAULT_TIMEOUT; private boolean treatFailureAsError = false; private TableName writeTableName = DEFAULT_WRITE_TABLE_NAME; + + /** + * This is a Map of table to timeout. The timeout is for reading all regions in the table; i.e. + * we aggregate time to fetch each region and it needs to be less than this value else we + * log an ERROR. 
+ */ private HashMap configuredReadTableTimeouts = new HashMap<>(); private ExecutorService executor; // threads to retrieve data from regionservers public Canary() { - this(new ScheduledThreadPoolExecutor(1), new RegionServerStdOutSink()); + this(new ScheduledThreadPoolExecutor(1)); } - public Canary(ExecutorService executor, Sink sink) { + public Canary(ExecutorService executor) { + this(executor, null); + } + + @VisibleForTesting + Canary(ExecutorService executor, Sink sink) { this.executor = executor; this.sink = sink; } @@ -628,7 +653,7 @@ public final class Canary implements Tool { printUsageAndExit(); } - if (cmd.equals("-help")) { + if (cmd.equals("-help") || cmd.equals("-h")) { // user asked for help, print the help and quit. printUsageAndExit(); } else if (cmd.equals("-daemon") && interval == 0) { @@ -639,7 +664,7 @@ public final class Canary implements Tool { i++; if (i == args.length) { - System.err.println("-interval needs a numeric value argument."); + System.err.println("-interval takes a numeric seconds value argument."); printUsageAndExit(); } @@ -657,7 +682,7 @@ public final class Canary implements Tool { this.regionServerAllRegions = true; } else if(cmd.equals("-writeSniffing")) { this.writeSniffing = true; - } else if(cmd.equals("-treatFailureAsError")) { + } else if(cmd.equals("-treatFailureAsError") || cmd.equals("-failureAsError")) { this.treatFailureAsError = true; } else if (cmd.equals("-e")) { this.useRegExp = true; @@ -665,35 +690,35 @@ public final class Canary implements Tool { i++; if (i == args.length) { - System.err.println("-t needs a numeric value argument."); + System.err.println("-t takes a numeric milliseconds value argument."); printUsageAndExit(); } try { this.timeout = Long.parseLong(args[i]); } catch (NumberFormatException e) { - System.err.println("-t needs a numeric value argument."); + System.err.println("-t takes a numeric milliseconds value argument."); printUsageAndExit(); } } else if(cmd.equals("-writeTableTimeout")) 
{ i++; if (i == args.length) { - System.err.println("-writeTableTimeout needs a numeric value argument."); + System.err.println("-writeTableTimeout takes a numeric milliseconds value argument."); printUsageAndExit(); } try { this.configuredWriteTableTimeout = Long.parseLong(args[i]); } catch (NumberFormatException e) { - System.err.println("-writeTableTimeout needs a numeric value argument."); + System.err.println("-writeTableTimeout takes a numeric milliseconds value argument."); printUsageAndExit(); } } else if (cmd.equals("-writeTable")) { i++; if (i == args.length) { - System.err.println("-writeTable needs a string value argument."); + System.err.println("-writeTable takes a string tablename value argument."); printUsageAndExit(); } this.writeTableName = TableName.valueOf(args[i]); @@ -711,14 +736,16 @@ public final class Canary implements Tool { i++; if (i == args.length) { - System.err.println("-readTableTimeouts needs a comma-separated list of read timeouts per table (without spaces)."); + System.err.println("-readTableTimeouts needs a comma-separated list of read " + + "millisecond timeouts per table (without spaces)."); printUsageAndExit(); } String [] tableTimeouts = args[i].split(","); for (String tT: tableTimeouts) { String [] nameTimeout = tT.split("="); if (nameTimeout.length < 2) { - System.err.println("Each -readTableTimeouts argument must be of the form =."); + System.err.println("Each -readTableTimeouts argument must be of the form " + + "= (without spaces)."); printUsageAndExit(); } long timeoutVal = 0L; @@ -856,41 +883,56 @@ public final class Canary implements Tool { private void printUsageAndExit() { System.err.println( - "Usage: hbase canary [opts] [table1 [table2]...] 
| [regionserver1 [regionserver2]..]"); - System.err.println(" where [opts] are:"); - System.err.println(" -help Show this help and exit."); - System.err.println(" -regionserver replace the table argument to regionserver,"); - System.err.println(" which means to enable regionserver mode"); - System.err.println(" -allRegions Tries all regions on a regionserver,"); - System.err.println(" only works in regionserver mode."); - System.err.println(" -zookeeper Tries to grab zookeeper.znode.parent "); - System.err.println(" on each zookeeper instance"); - System.err.println(" -permittedZookeeperFailures Ignore first N failures when attempting to "); - System.err.println(" connect to individual zookeeper nodes in the ensemble"); - System.err.println(" -daemon Continuous check at defined intervals."); - System.err.println(" -interval Interval between checks (sec)"); - System.err.println(" -e Use table/regionserver as regular expression"); - System.err.println(" which means the table/regionserver is regular expression pattern"); - System.err.println(" -f stop whole program if first error occurs," + - " default is true"); - System.err.println(" -t timeout for a check, default is 600000 (millisecs)"); - System.err.println(" -writeTableTimeout write timeout for the writeTable, default is 600000 (millisecs)"); - System.err.println(" -readTableTimeouts =,=, ... " - + "comma-separated list of read timeouts per table (no spaces), default is 600000 (millisecs)"); - System.err.println(" -writeSniffing enable the write sniffing in canary"); - System.err.println(" -treatFailureAsError treats read / write failure as error"); - System.err.println(" -writeTable The table used for write sniffing." 
- + " Default is hbase:canary"); - System.err.println(" -Dhbase.canary.read.raw.enabled= Use this flag to enable or disable raw scan during read canary test" - + " Default is false and raw is not enabled during scan"); - System.err - .println(" -D= assigning or override the configuration params"); + "Usage: canary [OPTIONS] [ [ [ interval between checks in seconds"); + System.err.println(" -e consider table/regionserver argument as regular " + + "expression"); + System.err.println(" -f exit on first error; default=true"); + System.err.println(" -failureAsError treat read/write failure as error"); + System.err.println(" -t timeout for canary-test run; default=600000ms"); + System.err.println(" -writeSniffing enable write sniffing"); + System.err.println(" -writeTable the table used for write sniffing; default=hbase:canary"); + System.err.println(" -writeTableTimeout timeout for writeTable; default=600000ms"); + System.err.println(" -readTableTimeouts =," + + "=,..."); + System.err.println(" comma-separated list of table read timeouts " + + "(no spaces);"); + System.err.println(" logs 'ERROR' if takes longer. 
default=600000ms"); + System.err.println(" -permittedZookeeperFailures Ignore first N failures attempting to "); + System.err.println(" connect to individual zookeeper nodes in ensemble"); + System.err.println(""); + System.err.println(" -D= to assign or override configuration params"); + System.err.println(" -Dhbase.canary.read.raw.enabled= Set to enable/disable " + + "raw scan; default=false"); + System.err.println(""); + System.err.println("Canary runs in one of three modes: region (default), regionserver, or " + + "zookeeper."); + System.err.println("To sniff/probe all regions, pass no arguments."); + System.err.println("To sniff/probe all regions of a table, pass tablename."); + System.err.println("To sniff/probe regionservers, pass -regionserver, etc."); + System.err.println("See http://hbase.apache.org/book.html#_canary for Canary documentation."); System.exit(USAGE_EXIT_CODE); } + Sink getSink(Configuration configuration, Class clazz) { + // In test context, this.sink might be set. Use it if non-null. For testing. + return this.sink != null? this.sink: + (Sink)ReflectionUtils.newInstance(configuration.getClass("hbase.canary.sink.class", + clazz, Sink.class)); + } + /** * A Factory method for {@link Monitor}. - * Can be overridden by user. + * Makes a RegionServerMonitor, or a ZooKeeperMonitor, or a RegionMonitor. 
* @param index a start index for monitor target * @param args args passed from user * @return a Monitor instance @@ -899,37 +941,45 @@ public final class Canary implements Tool { Monitor monitor = null; String[] monitorTargets = null; - if(index >= 0) { + if (index >= 0) { int length = args.length - index; monitorTargets = new String[length]; System.arraycopy(args, index, monitorTargets, 0, length); } - if (this.sink instanceof RegionServerStdOutSink || this.regionServerMode) { + if (this.regionServerMode) { monitor = new RegionServerMonitor(connection, monitorTargets, this.useRegExp, - (StdOutSink) this.sink, this.executor, this.regionServerAllRegions, + getSink(connection.getConfiguration(), RegionServerStdOutSink.class), + this.executor, this.regionServerAllRegions, this.treatFailureAsError, this.permittedFailures); - } else if (this.sink instanceof ZookeeperStdOutSink || this.zookeeperMode) { + } else if (this.zookeeperMode) { monitor = new ZookeeperMonitor(connection, monitorTargets, this.useRegExp, - (StdOutSink) this.sink, this.executor, this.treatFailureAsError, + getSink(connection.getConfiguration(), ZookeeperStdOutSink.class), + this.executor, this.treatFailureAsError, this.permittedFailures); } else { monitor = new RegionMonitor(connection, monitorTargets, this.useRegExp, - (StdOutSink) this.sink, this.executor, this.writeSniffing, + getSink(connection.getConfiguration(), RegionStdOutSink.class), + this.executor, this.writeSniffing, this.writeTableName, this.treatFailureAsError, this.configuredReadTableTimeouts, this.configuredWriteTableTimeout, this.permittedFailures); } return monitor; } - // a Monitor super-class can be extended by users + /** + * A Monitor super-class can be extended by users + */ public static abstract class Monitor implements Runnable, Closeable { - protected Connection connection; protected Admin admin; + /** + * 'Target' dependent on 'mode'. Could be Tables or RegionServers or ZNodes. + * Passed on the command-line as arguments. 
+ */ protected String[] targets; protected boolean useRegExp; protected boolean treatFailureAsError; @@ -999,7 +1049,9 @@ public final class Canary implements Tool { } } - // a monitor for region mode + /** + * A monitor for region mode. + */ private static class RegionMonitor extends Monitor { // 10 minutes private static final int DEFAULT_WRITE_TABLE_CHECK_PERIOD = 10 * 60 * 1000; @@ -1014,14 +1066,22 @@ public final class Canary implements Tool { private float regionsUpperLimit; private int checkPeriod; private boolean rawScanEnabled; + + /** + * This is a timeout per table. If read of each region in the table aggregated takes longer + * than what is configured here, we log an ERROR rather than just an INFO. + */ private HashMap configuredReadTableTimeouts; + private long configuredWriteTableTimeout; public RegionMonitor(Connection connection, String[] monitorTargets, boolean useRegExp, - StdOutSink sink, ExecutorService executor, boolean writeSniffing, TableName writeTableName, - boolean treatFailureAsError, HashMap configuredReadTableTimeouts, long configuredWriteTableTimeout, + Sink sink, ExecutorService executor, boolean writeSniffing, TableName writeTableName, + boolean treatFailureAsError, HashMap configuredReadTableTimeouts, + long configuredWriteTableTimeout, long allowedFailures) { - super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError, allowedFailures); + super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError, + allowedFailures); Configuration conf = connection.getConfiguration(); this.writeSniffing = writeSniffing; this.writeTableName = writeTableName; @@ -1054,9 +1114,12 @@ public final class Canary implements Tool { RegionStdOutSink regionSink = this.getSink(); if (this.targets != null && this.targets.length > 0) { String[] tables = generateMonitorTables(this.targets); - // Check to see that each table name passed in the -readTableTimeouts argument is also passed as a monitor target. - if (! 
new HashSet<>(Arrays.asList(tables)).containsAll(this.configuredReadTableTimeouts.keySet())) { - LOG.error("-readTableTimeouts can only specify read timeouts for monitor targets passed via command line."); + // Check to see that each table name passed in the -readTableTimeouts argument is also + // passed as a monitor target. + if (!new HashSet<>(Arrays.asList(tables)). + containsAll(this.configuredReadTableTimeouts.keySet())) { + LOG.error("-readTableTimeouts can only specify read timeouts for monitor targets " + + "passed via command line."); this.errorCode = USAGE_EXIT_CODE; return; } @@ -1082,7 +1145,7 @@ public final class Canary implements Tool { // sniff canary table with write operation regionSink.initializeWriteLatency(); LongAdder writeTableLatency = regionSink.getWriteLatency(); - taskFutures.addAll(Canary.sniff(admin, regionSink, admin.getTableDescriptor(writeTableName), + taskFutures.addAll(Canary.sniff(admin, regionSink, admin.getDescriptor(writeTableName), executor, TaskType.WRITE, this.rawScanEnabled, writeTableLatency)); } @@ -1099,23 +1162,26 @@ public final class Canary implements Tool { if (actualReadTableLatency.containsKey(tableName)) { Long actual = actualReadTableLatency.get(tableName).longValue(); Long configured = entry.getValue(); - LOG.info("Read operation for " + tableName + " took " + actual + - " ms. 
The configured read timeout was " + configured + " ms."); if (actual > configured) { - LOG.error("Read operation for " + tableName + " exceeded the configured read timeout."); + LOG.error("Read operation for {} took {}ms (Configured read timeout {}ms).", + tableName, actual, configured); + } else { + LOG.info("Read operation for {} took {}ms (Configured read timeout {}ms).", + tableName, actual, configured); } } else { - LOG.error("Read operation for " + tableName + " failed!"); + LOG.error("Read operation for {} failed!", tableName); } } if (this.writeSniffing) { String writeTableStringName = this.writeTableName.getNameAsString(); long actualWriteLatency = regionSink.getWriteLatency().longValue(); - LOG.info("Write operation for " + writeTableStringName + " took " + actualWriteLatency + " ms. The configured write timeout was " + - this.configuredWriteTableTimeout + " ms."); + LOG.info("Write operation for {} took {}ms. Configured write timeout {}ms.", + writeTableStringName, actualWriteLatency, this.configuredWriteTableTimeout); // Check that the writeTable write operation latency does not exceed the configured timeout. if (actualWriteLatency > this.configuredWriteTableTimeout) { - LOG.error("Write operation for " + writeTableStringName + " exceeded the configured write timeout."); + LOG.error("Write operation for {} exceeded the configured write timeout.", + writeTableStringName); } } } catch (Exception e) { @@ -1123,31 +1189,32 @@ public final class Canary implements Tool { this.errorCode = ERROR_EXIT_CODE; } finally { this.done = true; - } + } } this.done = true; } + /** + * @return List of tables to use in test. 
+ */ private String[] generateMonitorTables(String[] monitorTargets) throws IOException { String[] returnTables = null; if (this.useRegExp) { Pattern pattern = null; - HTableDescriptor[] tds = null; + TableDescriptor[] tds = null; Set tmpTables = new TreeSet<>(); try { - if (LOG.isDebugEnabled()) { - LOG.debug(String.format("reading list of tables")); - } + LOG.debug("Reading list of tables"); tds = this.admin.listTables(pattern); if (tds == null) { - tds = new HTableDescriptor[0]; + tds = new TableDescriptor[0]; } for (String monitorTarget : monitorTargets) { pattern = Pattern.compile(monitorTarget); - for (HTableDescriptor td : tds) { - if (pattern.matcher(td.getNameAsString()).matches()) { - tmpTables.add(td.getNameAsString()); + for (TableDescriptor td : tds) { + if (pattern.matcher(td.getTableName().getNameAsString()).matches()) { + tmpTables.add(td.getTableName().getNameAsString()); } } } @@ -1172,18 +1239,19 @@ public final class Canary implements Tool { } /* - * canary entry point to monitor all the tables. + * Canary entry point to monitor all the tables. 
*/ - private List> sniff(TaskType taskType, RegionStdOutSink regionSink) throws Exception { - if (LOG.isDebugEnabled()) { - LOG.debug(String.format("reading list of tables")); - } + private List> sniff(TaskType taskType, RegionStdOutSink regionSink) + throws Exception { + LOG.debug("Reading list of tables"); List> taskFutures = new LinkedList<>(); - for (HTableDescriptor table : admin.listTables()) { - if (admin.isTableEnabled(table.getTableName()) - && (!table.getTableName().equals(writeTableName))) { - LongAdder readLatency = regionSink.initializeAndGetReadLatencyForTable(table.getNameAsString()); - taskFutures.addAll(Canary.sniff(admin, sink, table, executor, taskType, this.rawScanEnabled, readLatency)); + for (TableDescriptor td: admin.listTableDescriptors()) { + if (admin.isTableEnabled(td.getTableName()) && + (!td.getTableName().equals(writeTableName))) { + LongAdder readLatency = + regionSink.initializeAndGetReadLatencyForTable(td.getTableName().getNameAsString()); + taskFutures.addAll(Canary.sniff(admin, sink, td, executor, taskType, this.rawScanEnabled, + readLatency)); } } return taskFutures; @@ -1231,11 +1299,10 @@ public final class Canary implements Tool { private void createWriteTable(int numberOfServers) throws IOException { int numberOfRegions = (int)(numberOfServers * regionsLowerLimit); - LOG.info("Number of live regionservers: " + numberOfServers + ", " - + "pre-splitting the canary table into " + numberOfRegions + " regions " - + "(current lower limit of regions per server is " + regionsLowerLimit - + " and you can change it by config: " - + HConstants.HBASE_CANARY_WRITE_PERSERVER_REGIONS_LOWERLIMIT_KEY + " )"); + LOG.info("Number of live regionservers {}, pre-splitting the canary table into {} regions " + + "(current lower limit of regions per server is {} and you can change it with config {}).", + numberOfServers, numberOfRegions, regionsLowerLimit, + HConstants.HBASE_CANARY_WRITE_PERSERVER_REGIONS_LOWERLIMIT_KEY); HTableDescriptor desc = new 
HTableDescriptor(writeTableName); HColumnDescriptor family = new HColumnDescriptor(CANARY_TABLE_FAMILY_NAME); family.setMaxVersions(1); @@ -1252,59 +1319,40 @@ public final class Canary implements Tool { * @throws Exception */ private static List> sniff(final Admin admin, final Sink sink, String tableName, - ExecutorService executor, TaskType taskType, boolean rawScanEnabled, LongAdder readLatency) throws Exception { - if (LOG.isDebugEnabled()) { - LOG.debug(String.format("checking table is enabled and getting table descriptor for table %s", - tableName)); - } + ExecutorService executor, TaskType taskType, boolean rawScanEnabled, LongAdder readLatency) + throws Exception { + LOG.debug("Checking table is enabled and getting table descriptor for table {}", tableName); if (admin.isTableEnabled(TableName.valueOf(tableName))) { - return Canary.sniff(admin, sink, admin.getTableDescriptor(TableName.valueOf(tableName)), + return Canary.sniff(admin, sink, admin.getDescriptor(TableName.valueOf(tableName)), executor, taskType, rawScanEnabled, readLatency); } else { - LOG.warn(String.format("Table %s is not enabled", tableName)); + LOG.warn("Table {} is not enabled", tableName); } return new LinkedList<>(); } /* - * Loops over regions that owns this table, and output some information about the state. + * Loops over regions of this table, and outputs information about the state. 
*/ private static List> sniff(final Admin admin, final Sink sink, - HTableDescriptor tableDesc, ExecutorService executor, TaskType taskType, + TableDescriptor tableDesc, ExecutorService executor, TaskType taskType, boolean rawScanEnabled, LongAdder rwLatency) throws Exception { - - if (LOG.isDebugEnabled()) { - LOG.debug(String.format("reading list of regions for table %s", tableDesc.getTableName())); - } - - Table table = null; - try { - table = admin.getConnection().getTable(tableDesc.getTableName()); + LOG.debug("Reading list of regions for table {}", tableDesc.getTableName()); + try (Table table = admin.getConnection().getTable(tableDesc.getTableName())) { + List tasks = new ArrayList<>(); + try (RegionLocator regionLocator = + admin.getConnection().getRegionLocator(tableDesc.getTableName())) { + for (HRegionLocation location: regionLocator.getAllRegionLocations()) { + ServerName rs = location.getServerName(); + RegionInfo region = location.getRegion(); + tasks.add(new RegionTask(admin.getConnection(), region, rs, (RegionStdOutSink)sink, + taskType, rawScanEnabled, rwLatency)); + } + return executor.invokeAll(tasks); + } } catch (TableNotFoundException e) { - return new ArrayList<>(); + return Collections.EMPTY_LIST; } - finally { - if (table !=null) { - table.close(); - } - } - - List tasks = new ArrayList<>(); - RegionLocator regionLocator = null; - try { - regionLocator = admin.getConnection().getRegionLocator(tableDesc.getTableName()); - for (HRegionLocation location : regionLocator.getAllRegionLocations()) { - ServerName rs = location.getServerName(); - RegionInfo region = location.getRegionInfo(); - tasks.add(new RegionTask(admin.getConnection(), region, rs, (RegionStdOutSink) sink, taskType, rawScanEnabled, - rwLatency)); - } - } finally { - if (regionLocator != null) { - regionLocator.close(); - } - } - return executor.invokeAll(tasks); } // monitor for zookeeper mode @@ -1314,8 +1362,9 @@ public final class Canary implements Tool { private final int 
timeout; protected ZookeeperMonitor(Connection connection, String[] monitorTargets, boolean useRegExp, - StdOutSink sink, ExecutorService executor, boolean treatFailureAsError, long allowedFailures) { - super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError, allowedFailures); + Sink sink, ExecutorService executor, boolean treatFailureAsError, long allowedFailures) { + super(connection, monitorTargets, useRegExp, + sink, executor, treatFailureAsError, allowedFailures); Configuration configuration = connection.getConfiguration(); znode = configuration.get(ZOOKEEPER_ZNODE_PARENT, @@ -1374,15 +1423,17 @@ public final class Canary implements Tool { } - // a monitor for regionserver mode + /** + * A monitor for regionserver mode + */ private static class RegionServerMonitor extends Monitor { - private boolean allRegions; public RegionServerMonitor(Connection connection, String[] monitorTargets, boolean useRegExp, - StdOutSink sink, ExecutorService executor, boolean allRegions, + Sink sink, ExecutorService executor, boolean allRegions, boolean treatFailureAsError, long allowedFailures) { - super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError, allowedFailures); + super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError, + allowedFailures); this.allRegions = allRegions; } @@ -1413,10 +1464,7 @@ public final class Canary implements Tool { private boolean checkNoTableNames() { List foundTableNames = new ArrayList<>(); TableName[] tableNames = null; - - if (LOG.isDebugEnabled()) { - LOG.debug(String.format("reading list of tables")); - } + LOG.debug("Reading list of tables"); try { tableNames = this.admin.listTableNames(); } catch (IOException e) { @@ -1452,7 +1500,7 @@ public final class Canary implements Tool { AtomicLong successes = new AtomicLong(0); successMap.put(serverName, successes); if (entry.getValue().isEmpty()) { - LOG.error(String.format("Regionserver not serving any regions - %s", 
serverName)); + LOG.error("Regionserver not serving any regions - {}", serverName); } else if (this.allRegions) { for (RegionInfo region : entry.getValue()) { tasks.add(new RegionServerTask(this.connection, @@ -1483,8 +1531,8 @@ public final class Canary implements Tool { if (this.allRegions) { for (Map.Entry> entry : rsAndRMap.entrySet()) { String serverName = entry.getKey(); - LOG.info("Successfully read " + successMap.get(serverName) + " regions out of " - + entry.getValue().size() + " on regionserver:" + serverName); + LOG.info("Successfully read {} regions out of {} on regionserver {}", + successMap.get(serverName), entry.getValue().size(), serverName); } } } catch (InterruptedException e) { @@ -1501,36 +1549,30 @@ public final class Canary implements Tool { private Map> getAllRegionServerByName() { Map> rsAndRMap = new HashMap<>(); - Table table = null; - RegionLocator regionLocator = null; try { - if (LOG.isDebugEnabled()) { - LOG.debug(String.format("reading list of tables and locations")); - } - HTableDescriptor[] tableDescs = this.admin.listTables(); + LOG.debug("Reading list of tables and locations"); + List tableDescs = this.admin.listTableDescriptors(); List regions = null; - for (HTableDescriptor tableDesc : tableDescs) { - table = this.admin.getConnection().getTable(tableDesc.getTableName()); - regionLocator = this.admin.getConnection().getRegionLocator(tableDesc.getTableName()); - - for (HRegionLocation location : regionLocator.getAllRegionLocations()) { - ServerName rs = location.getServerName(); - String rsName = rs.getHostname(); - RegionInfo r = location.getRegionInfo(); - - if (rsAndRMap.containsKey(rsName)) { - regions = rsAndRMap.get(rsName); - } else { - regions = new ArrayList<>(); - rsAndRMap.put(rsName, regions); + for (TableDescriptor tableDesc: tableDescs) { + try (RegionLocator regionLocator = + this.admin.getConnection().getRegionLocator(tableDesc.getTableName())) { + for (HRegionLocation location : 
regionLocator.getAllRegionLocations()) { + ServerName rs = location.getServerName(); + String rsName = rs.getHostname(); + RegionInfo r = location.getRegion(); + if (rsAndRMap.containsKey(rsName)) { + regions = rsAndRMap.get(rsName); + } else { + regions = new ArrayList<>(); + rsAndRMap.put(rsName, regions); + } + regions.add(r); } - regions.add(r); } - table.close(); } // get any live regionservers not serving any regions - for (ServerName rs : this.admin.getClusterMetrics(EnumSet.of(Option.LIVE_SERVERS)) + for (ServerName rs: this.admin.getClusterMetrics(EnumSet.of(Option.LIVE_SERVERS)) .getLiveServerMetrics().keySet()) { String rsName = rs.getHostname(); if (!rsAndRMap.containsKey(rsName)) { @@ -1538,19 +1580,9 @@ public final class Canary implements Tool { } } } catch (IOException e) { - String msg = "Get HTables info failed"; - LOG.error(msg, e); + LOG.error("Get HTables info failed", e); this.errorCode = INIT_ERROR_EXIT_CODE; - } finally { - if (table != null) { - try { - table.close(); - } catch (IOException e) { - LOG.warn("Close table failed", e); - } - } } - return rsAndRMap; } @@ -1576,13 +1608,13 @@ public final class Canary implements Tool { } } if (!regExpFound) { - LOG.info("No RegionServerInfo found, regionServerPattern:" + rsName); + LOG.info("No RegionServerInfo found, regionServerPattern {}", rsName); } } else { if (fullRsAndRMap.containsKey(rsName)) { filteredRsAndRMap.put(rsName, fullRsAndRMap.get(rsName)); } else { - LOG.info("No RegionServerInfo found, regionServerName:" + rsName); + LOG.info("No RegionServerInfo found, regionServerName {}", rsName); } } } @@ -1596,20 +1628,19 @@ public final class Canary implements Tool { public static void main(String[] args) throws Exception { final Configuration conf = HBaseConfiguration.create(); - // loading the generic options to conf + // Loading the generic options to conf new GenericOptionsParser(conf, args); int numThreads = conf.getInt("hbase.canary.threads.num", MAX_THREADS_NUM); - 
LOG.info("Number of execution threads " + numThreads); + LOG.info("Execution thread count={}", numThreads); + int exitCode = 0; ExecutorService executor = new ScheduledThreadPoolExecutor(numThreads); - - Class sinkClass = - conf.getClass("hbase.canary.sink.class", RegionServerStdOutSink.class, Sink.class); - Sink sink = ReflectionUtils.newInstance(sinkClass); - - int exitCode = ToolRunner.run(conf, new Canary(executor, sink), args); - executor.shutdown(); + try { + exitCode = ToolRunner.run(conf, new Canary(executor), args); + } finally { + executor.shutdown(); + } System.exit(exitCode); } } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/tool/TestCanaryTool.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/tool/TestCanaryTool.java index cdbf42623c7..6c23764a46f 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/tool/TestCanaryTool.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/tool/TestCanaryTool.java @@ -35,6 +35,7 @@ import org.apache.hadoop.hbase.*; import org.apache.hadoop.hbase.HBaseClassTestRule; import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor; import org.apache.hadoop.hbase.client.Put; +import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.client.Table; import org.apache.hadoop.hbase.testclassification.MediumTests; import org.apache.hadoop.hbase.util.Bytes; @@ -114,11 +115,11 @@ public class TestCanaryTool { ExecutorService executor = new ScheduledThreadPoolExecutor(1); Canary.RegionStdOutSink sink = spy(new Canary.RegionStdOutSink()); Canary canary = new Canary(executor, sink); - String[] args = { "-writeSniffing", "-t", "10000", name.getMethodName() }; + String[] args = { "-writeSniffing", "-t", "10000", tableName.getNameAsString() }; assertEquals(0, ToolRunner.run(testingUtility.getConfiguration(), canary, args)); assertEquals("verify no read error count", 0, canary.getReadFailures().size()); assertEquals("verify no write error count", 0, 
canary.getWriteFailures().size()); - verify(sink, atLeastOnce()).publishReadTiming(isA(ServerName.class), isA(HRegionInfo.class), + verify(sink, atLeastOnce()).publishReadTiming(isA(ServerName.class), isA(RegionInfo.class), isA(ColumnFamilyDescriptor.class), anyLong()); } @@ -144,7 +145,8 @@ public class TestCanaryTool { Canary canary = new Canary(executor, sink); String configuredTimeoutStr = tableNames[0].getNameAsString() + "=" + Long.MAX_VALUE + "," + tableNames[1].getNameAsString() + "=0"; - String[] args = { "-readTableTimeouts", configuredTimeoutStr, name.getMethodName() + "1", name.getMethodName() + "2"}; + String[] args = {"-readTableTimeouts", configuredTimeoutStr, name.getMethodName() + "1", + name.getMethodName() + "2"}; assertEquals(0, ToolRunner.run(testingUtility.getConfiguration(), canary, args)); verify(sink, times(tableNames.length)).initializeAndGetReadLatencyForTable(isA(String.class)); for (int i=0; i<2; i++) { @@ -231,7 +233,7 @@ public class TestCanaryTool { conf.setBoolean(HConstants.HBASE_CANARY_READ_RAW_SCAN_KEY, true); assertEquals(0, ToolRunner.run(conf, canary, args)); verify(sink, atLeastOnce()) - .publishReadTiming(isA(ServerName.class), isA(HRegionInfo.class), + .publishReadTiming(isA(ServerName.class), isA(RegionInfo.class), isA(ColumnFamilyDescriptor.class), anyLong()); assertEquals("verify no read error count", 0, canary.getReadFailures().size()); } diff --git a/src/main/asciidoc/_chapters/ops_mgt.adoc b/src/main/asciidoc/_chapters/ops_mgt.adoc index 0d0df588947..5645af52b82 100644 --- a/src/main/asciidoc/_chapters/ops_mgt.adoc +++ b/src/main/asciidoc/_chapters/ops_mgt.adoc @@ -82,45 +82,54 @@ Others, such as `hbase shell` (<>), `hbase upgrade` (<>), and === Canary -There is a Canary class can help users to canary-test the HBase cluster status, with every column-family for every regions or RegionServer's granularity. -To see the usage, use the `-help` parameter. 
+The Canary tool can help users "canary-test" the HBase cluster status. +The default "region mode" fetches a row from every column-family of every region. +In "regionserver mode", the Canary tool will fetch a row from a random +region on each of the cluster's RegionServers. In "zookeeper mode", the +Canary will read the root znode on each member of the zookeeper ensemble. + +To see usage, pass the `-help` parameter (if you pass no +parameters, the Canary tool starts executing in the default +"region mode", fetching a row from every region in the cluster). ---- -$ ${HBASE_HOME}/bin/hbase canary -help +2018-10-16 13:11:27,037 INFO [main] tool.Canary: Execution thread count=16 +Usage: canary [OPTIONS] [ [ [ interval between checks in seconds + -e consider table/regionserver argument as regular expression + -f exit on first error; default=true + -failureAsError treat read/write failure as error + -t timeout for canary-test run; default=600000ms + -writeSniffing enable write sniffing + -writeTable the table used for write sniffing; default=hbase:canary + -writeTableTimeout timeout for writeTable; default=600000ms + -readTableTimeouts =,=,... + comma-separated list of table read timeouts (no spaces); + logs 'ERROR' if takes longer. default=600000ms + -permittedZookeeperFailures Ignore first N failures attempting to + connect to individual zookeeper nodes in ensemble -Usage: hbase canary [opts] [table1 [table2]...] | [regionserver1 [regionserver2]..] - where [opts] are: - -help Show this help and exit. - -regionserver replace the table argument to regionserver, - which means to enable regionserver mode - -allRegions Tries all regions on a regionserver, - only works in regionserver mode. - -zookeeper Tries to grab zookeeper.znode.parent - on each zookeeper instance - -daemon Continuous check at defined intervals. 
- -interval Interval between checks (sec) - -e Use table/regionserver as regular expression - which means the table/regionserver is regular expression pattern - -f stop whole program if first error occurs, default is true - -t timeout for a check, default is 600000 (millisecs) - -writeTableTimeout write timeout for the writeTable, default is 600000 (millisecs) - -readTableTimeouts =,=, ... comma-separated list of read timeouts per table (no spaces), default is 600000 (millisecs) - -writeSniffing enable the write sniffing in canary - -treatFailureAsError treats read / write failure as error - -writeTable The table used for write sniffing. Default is hbase:canary - -Dhbase.canary.read.raw.enabled= Use this flag to enable or disable raw scan during read canary test Default is false and raw is not enabled during scan - -D= assigning or override the configuration params + -D= to assign or override configuration params + -Dhbase.canary.read.raw.enabled= Set to enable/disable raw scan; default=false + +Canary runs in one of three modes: region (default), regionserver, or zookeeper. +To sniff/probe all regions, pass no arguments. +To sniff/probe all regions of a table, pass tablename. +To sniff/probe regionservers, pass -regionserver, etc. +See http://hbase.apache.org/book.html#_canary for Canary documentation. ---- [NOTE] -The `Sink` class is instantiated using the `hbase.canary.sink.class` configuration property which -will also determine the used Monitor class. In the absence of this property RegionServerStdOutSink -will be used. You need to use the Sink according to the passed parameters to the _canary_ command. -As an example you have to set `hbase.canary.sink.class` property to -`org.apache.hadoop.hbase.tool.Canary$RegionStdOutSink` for using table parameters. +The `Sink` class is instantiated using the `hbase.canary.sink.class` configuration property. -This tool will return non zero error codes to user for collaborating with other monitoring tools, such as Nagios. 
-The error code definitions are: +This tool will return non-zero error codes to the user for collaborating with other monitoring tools, +such as Nagios. The error code definitions are: [source,java] ---- @@ -131,9 +140,9 @@ private static final int ERROR_EXIT_CODE = 4; private static final int FAILURE_EXIT_CODE = 5; ---- -Here are some examples based on the following given case. -There are two Table objects called test-01 and test-02, they have two column family cf1 and cf2 respectively, and deployed on the 3 RegionServers. -see following table. +Here are some examples based on the following given case: given two Table objects called test-01 +and test-02 each with two column families, cf1 and cf2, deployed on 3 RegionServers. +See the following table. [cols="1,1,1", options="header"] |=== @@ -145,7 +154,7 @@ see following table. | rs3 | r2 | r1 |=== -Following are some examples based on the previous given case. +Following are some example outputs based on the previous given case. ==== Canary test for every column family (store) of every region of every table @@ -163,12 +172,13 @@ $ ${HBASE_HOME}/bin/hbase canary 13/12/09 03:26:32 INFO tool.Canary: read from region test-02,0004883,1386559511167.cbda32d5e2e276520712d84eaaa29d84. column family cf2 in 8ms ---- -So you can see, table test-01 has two regions and two column families, so the Canary tool will pick 4 small piece of data from 4 (2 region * 2 store) different stores. -This is a default behavior of the this tool does. +So you can see, table test-01 has two regions and two column families, so the Canary tool in the +default "region mode" will pick 4 small pieces of data from 4 (2 region * 2 store) different stores. +This is the default behavior. -==== Canary test for every column family (store) of every region of specific table(s) +==== Canary test for every column family (store) of every region of a specific table(s) -You can also test one or more specific tables. 
+You can also test one or more specific tables by passing table names. ---- $ ${HBASE_HOME}/bin/hbase canary test-01 test-02 @@ -176,7 +186,9 @@ $ ${HBASE_HOME}/bin/hbase canary test-01 test-02 ==== Canary test with RegionServer granularity -This will pick one small piece of data from each RegionServer, and can also put your RegionServer name as input options for canary-test specific RegionServer. +In "regionserver mode", the Canary tool will pick one small piece of data +from each RegionServer (you can also pass one or more RegionServer names as arguments +to the canary-test when in "regionserver mode"). ---- $ ${HBASE_HOME}/bin/hbase canary -regionserver @@ -188,22 +200,25 @@ $ ${HBASE_HOME}/bin/hbase canary -regionserver ==== Canary test with regular expression pattern -This will test both table test-01 and test-02. +You can pass regexes for table names when in "region mode" or for servernames when +in "regionserver mode". The below will test both table test-01 and test-02. ---- $ ${HBASE_HOME}/bin/hbase canary -e test-0[1-2] ---- -==== Run canary test as daemon mode +==== Run canary test as a "daemon" -Run repeatedly with interval defined in option `-interval` whose default value is 60 seconds. -This daemon will stop itself and return non-zero error code if any error occurs, due to the default value of option -f is true. +Run repeatedly with an interval defined via the option `-interval` (default value is 60 seconds). +This daemon will stop itself and return a non-zero error code if any error occurs. To have +the daemon keep running across errors, pass the -f flag with its value set to false +(see usage above). ---- $ ${HBASE_HOME}/bin/hbase canary -daemon ---- -Run repeatedly with 5 second intervals and will not stop itself even if errors occur in the test. +To run repeatedly with 5 second intervals and not stop on errors, do the following. 
---- $ ${HBASE_HOME}/bin/hbase canary -daemon -interval 5 -f false @@ -211,9 +226,11 @@ $ ${HBASE_HOME}/bin/hbase canary -daemon -interval 5 -f false ==== Force timeout if canary test stuck -In some cases the request is stuck and no response is sent back to the client. This can happen with dead RegionServers which the master has not yet noticed. -Because of this we provide a timeout option to kill the canary test and return a non-zero error code. -This run sets the timeout value to 60 seconds, the default value is 600 seconds. +In some cases the request is stuck and no response is sent back to the client. This +can happen with dead RegionServers which the master has not yet noticed. +Because of this we provide a timeout option to kill the canary test and return a +non-zero error code. The below sets the timeout value to 60 seconds (the default value +is 600 seconds). ---- $ ${HBASE_HOME}/bin/hbase canary -t 60000 @@ -221,36 +238,37 @@ $ ${HBASE_HOME}/bin/hbase canary -t 60000 ==== Enable write sniffing in canary -By default, the canary tool only check the read operations, it's hard to find the problem in the -write path. To enable the write sniffing, you can run canary with the `-writeSniffing` option. -When the write sniffing is enabled, the canary tool will create an hbase table and make sure the -regions of the table distributed on all region servers. In each sniffing period, the canary will -try to put data to these regions to check the write availability of each region server. +By default, the canary tool only checks read operations. To enable the write sniffing, +you can run the canary with the `-writeSniffing` option set. When write sniffing is +enabled, the canary tool will create an hbase table and make sure the +regions of the table are distributed to all region servers. In each sniffing period, +the canary will try to put data to these regions to check the write availability of +each region server. 
---- $ ${HBASE_HOME}/bin/hbase canary -writeSniffing ---- -The default write table is `hbase:canary` and can be specified by the option `-writeTable`. +The default write table is `hbase:canary` and can be specified with the option `-writeTable`. ---- $ ${HBASE_HOME}/bin/hbase canary -writeSniffing -writeTable ns:canary ---- -The default value size of each put is 10 bytes and you can set it by the config key: +The default value size of each put is 10 bytes. You can set it via the config key: `hbase.canary.write.value.size`. ==== Treat read / write failure as error -By default, the canary tool only logs read failure, due to e.g. RetriesExhaustedException, -while returning normal exit code. To treat read / write failure as error, you can run canary -with the `-treatFailureAsError` option. When enabled, read / write failure would result in error -exit code. +By default, the canary tool only logs read failures -- due to e.g. RetriesExhaustedException, etc. -- +and will return the 'normal' exit code. To treat read/write failure as errors, you can run canary +with the `-treatFailureAsError` option. When enabled, read/write failures will result in an +error exit code. ---- $ ${HBASE_HOME}/bin/hbase canary -treatFailureAsError ---- ==== Running Canary in a Kerberos-enabled Cluster -To run Canary in a Kerberos-enabled cluster, configure the following two properties in _hbase-site.xml_: +To run the Canary in a Kerberos-enabled cluster, configure the following two properties in _hbase-site.xml_: * `hbase.client.keytab.file` * `hbase.client.kerberos.principal`