diff --git a/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java b/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java
index 284e925f816..42288c56ec3 100644
--- a/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java
+++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java
@@ -1362,6 +1362,7 @@ public final class HConstants {
"hbase.regionserver.region.split.threads.max";
/** Canary config keys */
+ // TODO: Move these defines to Canary Class
public static final String HBASE_CANARY_WRITE_DATA_TTL_KEY = "hbase.canary.write.data.ttl";
public static final String HBASE_CANARY_WRITE_PERSERVER_REGIONS_LOWERLIMIT_KEY =
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java
index 7a549fce22c..40f4aa6654f 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java
@@ -90,6 +90,7 @@ import org.apache.hadoop.hbase.zookeeper.ZKConfig;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
+import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;
import org.apache.yetus.audience.InterfaceAudience;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.ZooKeeper;
@@ -101,39 +102,45 @@ import org.slf4j.LoggerFactory;
import org.apache.hbase.thirdparty.com.google.common.collect.Lists;
/**
- * HBase Canary Tool, that that can be used to do
- * "canary monitoring" of a running HBase cluster.
+ * HBase Canary Tool for "canary monitoring" of a running HBase cluster.
*
- * Here are three modes
- * 1. region mode - Foreach region tries to get one row per column family
- * and outputs some information about failure or latency.
+ * There are three modes:
+ *
+ * - region mode (Default): For each region, try to get one row per column family outputting
+ * information on failure (ERROR) or else the latency.
+ *
*
- * 2. regionserver mode - Foreach regionserver tries to get one row from one table
- * selected randomly and outputs some information about failure or latency.
+ * - regionserver mode: For each regionserver try to get one row from one table selected
+ * randomly outputting information on failure (ERROR) or else the latency.
+ *
*
- * 3. zookeeper mode - for each zookeeper instance, selects a zNode and
- * outputs some information about failure or latency.
+ * - zookeeper mode: for each zookeeper instance, selects a znode outputting information on
+ * failure (ERROR) or else the latency.
+ *
+ *
*/
@InterfaceAudience.Private
public final class Canary implements Tool {
- // Sink interface used by the canary to outputs information
+ /**
+ * Sink interface used by the canary to output information
+ */
public interface Sink {
- public long getReadFailureCount();
- public long incReadFailureCount();
- public Map getReadFailures();
- public void updateReadFailures(String regionName, String serverName);
- public long getWriteFailureCount();
- public long incWriteFailureCount();
- public Map getWriteFailures();
- public void updateWriteFailures(String regionName, String serverName);
+ long getReadFailureCount();
+ long incReadFailureCount();
+ Map getReadFailures();
+ void updateReadFailures(String regionName, String serverName);
+ long getWriteFailureCount();
+ long incWriteFailureCount();
+ Map getWriteFailures();
+ void updateWriteFailures(String regionName, String serverName);
}
- // Simple implementation of canary sink that allows to plot on
- // file or standard output timings or failures.
+ /**
+ * Simple implementation of canary sink that allows plotting to a file or standard output.
+ */
public static class StdOutSink implements Sink {
private AtomicLong readFailureCount = new AtomicLong(0),
writeFailureCount = new AtomicLong(0);
-
private Map readFailures = new ConcurrentHashMap<>();
private Map writeFailures = new ConcurrentHashMap<>();
@@ -178,67 +185,75 @@ public final class Canary implements Tool {
}
}
+ /**
+ * By RegionServer, for 'regionserver' mode.
+ */
public static class RegionServerStdOutSink extends StdOutSink {
-
public void publishReadFailure(String table, String server) {
incReadFailureCount();
- LOG.error(String.format("Read from table:%s on region server:%s", table, server));
+ LOG.error("Read from {} on {}", table, server);
}
public void publishReadTiming(String table, String server, long msTime) {
- LOG.info(String.format("Read from table:%s on region server:%s in %dms",
- table, server, msTime));
+ LOG.info("Read from {} on {} in {}ms", table, server, msTime);
}
}
+ /**
+ * Output for 'zookeeper' mode.
+ */
public static class ZookeeperStdOutSink extends StdOutSink {
-
- public void publishReadFailure(String zNode, String server) {
+ public void publishReadFailure(String znode, String server) {
incReadFailureCount();
- LOG.error(String.format("Read from zNode:%s on zookeeper instance:%s", zNode, server));
+ LOG.error("Read from {} on {}", znode, server);
}
public void publishReadTiming(String znode, String server, long msTime) {
- LOG.info(String.format("Read from zNode:%s on zookeeper instance:%s in %dms",
- znode, server, msTime));
+ LOG.info("Read from {} on {} in {}ms", znode, server, msTime);
}
}
+ /**
+ * By Region, for 'region' mode.
+ */
public static class RegionStdOutSink extends StdOutSink {
-
private Map perTableReadLatency = new HashMap<>();
private LongAdder writeLatency = new LongAdder();
public void publishReadFailure(ServerName serverName, RegionInfo region, Exception e) {
incReadFailureCount();
- LOG.error(String.format("read from region %s on regionserver %s failed", region.getRegionNameAsString(), serverName), e);
+ LOG.error("Read from {} on {} failed", region.getRegionNameAsString(), serverName, e);
}
- public void publishReadFailure(ServerName serverName, RegionInfo region, ColumnFamilyDescriptor column, Exception e) {
+ public void publishReadFailure(ServerName serverName, RegionInfo region,
+ ColumnFamilyDescriptor column, Exception e) {
incReadFailureCount();
- LOG.error(String.format("read from region %s on regionserver %s column family %s failed",
- region.getRegionNameAsString(), serverName, column.getNameAsString()), e);
+ LOG.error("Read from {} on {} {} failed", region.getRegionNameAsString(), serverName,
+ column.getNameAsString(), e);
}
- public void publishReadTiming(ServerName serverName, RegionInfo region, ColumnFamilyDescriptor column, long msTime) {
- LOG.info(String.format("read from region %s on regionserver %s column family %s in %dms",
- region.getRegionNameAsString(), serverName, column.getNameAsString(), msTime));
+ public void publishReadTiming(ServerName serverName, RegionInfo region,
+ ColumnFamilyDescriptor column, long msTime) {
+ LOG.info("Read from {} on {} {} in {}ms", region.getRegionNameAsString(), serverName,
+ column.getNameAsString(), msTime);
}
public void publishWriteFailure(ServerName serverName, RegionInfo region, Exception e) {
incWriteFailureCount();
- LOG.error(String.format("write to region %s on regionserver %s failed", region.getRegionNameAsString(), serverName), e);
+ LOG.error("Write to {} on {} failed", region.getRegionNameAsString(), serverName, e);
}
- public void publishWriteFailure(ServerName serverName, RegionInfo region, ColumnFamilyDescriptor column, Exception e) {
+ public void publishWriteFailure(ServerName serverName, RegionInfo region,
+ ColumnFamilyDescriptor column, Exception e) {
incWriteFailureCount();
- LOG.error(String.format("write to region %s on regionserver %s column family %s failed",
- region.getRegionNameAsString(), serverName, column.getNameAsString()), e);
+ LOG.error("Write to {} on {} {} failed", region.getRegionNameAsString(), serverName,
+ column.getNameAsString(), e);
}
- public void publishWriteTiming(ServerName serverName, RegionInfo region, ColumnFamilyDescriptor column, long msTime) {
- LOG.info(String.format("write to region %s on regionserver %s column family %s in %dms",
- region.getRegionNameAsString(), serverName, column.getNameAsString(), msTime));
+ public void publishWriteTiming(ServerName serverName, RegionInfo region,
+ ColumnFamilyDescriptor column, long msTime) {
+ LOG.info("Write to {} on {} {} in {}ms",
+ region.getRegionNameAsString(), serverName, column.getNameAsString(), msTime);
}
public Map getReadLatencyMap() {
@@ -260,6 +275,9 @@ public final class Canary implements Tool {
}
}
+ /**
+ * Run a single zookeeper Task and then exit.
+ */
static class ZookeeperTask implements Callable {
private final Connection connection;
private final String host;
@@ -298,8 +316,8 @@ public final class Canary implements Tool {
}
/**
- * For each column family of the region tries to get one row and outputs the latency, or the
- * failure.
+ * Run a single Region Task and then exit. For each column family of the Region, get one row and
+ * output latency or failure.
*/
static class RegionTask implements Callable {
public enum TaskType{
@@ -313,8 +331,8 @@ public final class Canary implements Tool {
private ServerName serverName;
private LongAdder readWriteLatency;
- RegionTask(Connection connection, RegionInfo region, ServerName serverName, RegionStdOutSink sink,
- TaskType taskType, boolean rawScanEnabled, LongAdder rwLatency) {
+ RegionTask(Connection connection, RegionInfo region, ServerName serverName,
+ RegionStdOutSink sink, TaskType taskType, boolean rawScanEnabled, LongAdder rwLatency) {
this.connection = connection;
this.region = region;
this.serverName = serverName;
@@ -340,14 +358,11 @@ public final class Canary implements Tool {
Table table = null;
TableDescriptor tableDesc = null;
try {
- if (LOG.isDebugEnabled()) {
- LOG.debug(String.format("reading table descriptor for table %s",
- region.getTable()));
- }
+ LOG.debug("Reading table descriptor for table {}", region.getTable());
table = connection.getTable(region.getTable());
tableDesc = table.getDescriptor();
} catch (IOException e) {
- LOG.debug("sniffRegion failed", e);
+ LOG.debug("sniffRegion {} of {} failed", region.getEncodedName(), e);
sink.publishReadFailure(serverName, region, e);
if (table != null) {
try {
@@ -375,10 +390,7 @@ public final class Canary implements Tool {
get.addFamily(column.getName());
} else {
scan = new Scan();
- if (LOG.isDebugEnabled()) {
- LOG.debug(String.format("rawScan : %s for table: %s", rawScanEnabled,
- tableDesc.getTableName()));
- }
+ LOG.debug("rawScan {} for {}", rawScanEnabled, tableDesc.getTableName());
scan.setRaw(rawScanEnabled);
scan.setCaching(1);
scan.setCacheBlocks(false);
@@ -387,12 +399,9 @@ public final class Canary implements Tool {
scan.setMaxResultSize(1L);
scan.setOneRowLimit();
}
-
- if (LOG.isDebugEnabled()) {
- LOG.debug(String.format("reading from table %s region %s column family %s and key %s",
- tableDesc.getTableName(), region.getRegionNameAsString(), column.getNameAsString(),
- Bytes.toStringBinary(startKey)));
- }
+ LOG.debug("Reading from {} {} {} {}", tableDesc.getTableName(),
+ region.getRegionNameAsString(), column.getNameAsString(),
+ Bytes.toStringBinary(startKey));
try {
stopWatch.start();
if (startKey.length > 0) {
@@ -425,7 +434,6 @@ public final class Canary implements Tool {
/**
* Check writes for the canary table
- * @return
*/
private Void write() {
Table table = null;
@@ -445,11 +453,9 @@ public final class Canary implements Tool {
Bytes.random(value);
put.addColumn(column.getName(), HConstants.EMPTY_BYTE_ARRAY, value);
- if (LOG.isDebugEnabled()) {
- LOG.debug(String.format("writing to table %s region %s column family %s and key %s",
- tableDesc.getTableName(), region.getRegionNameAsString(), column.getNameAsString(),
- Bytes.toStringBinary(rowToCheck)));
- }
+ LOG.debug("Writing to {} {} {} {}",
+ tableDesc.getTableName(), region.getRegionNameAsString(), column.getNameAsString(),
+ Bytes.toStringBinary(rowToCheck));
try {
long startTime = System.currentTimeMillis();
table.put(put);
@@ -470,7 +476,8 @@ public final class Canary implements Tool {
}
/**
- * Get one row from a region on the regionserver and outputs the latency, or the failure.
+ * Run a single RegionServer Task and then exit.
+ * Get one row from a region on the regionserver and output latency or the failure.
*/
static class RegionServerTask implements Callable {
private Connection connection;
@@ -503,11 +510,9 @@ public final class Canary implements Tool {
table = connection.getTable(tableName);
startKey = region.getStartKey();
// Can't do a get on empty start row so do a Scan of first element if any instead.
- if (LOG.isDebugEnabled()) {
- LOG.debug(String.format("reading from region server %s table %s region %s and key %s",
- serverName, region.getTable(), region.getRegionNameAsString(),
- Bytes.toStringBinary(startKey)));
- }
+ LOG.debug("Reading from {} {} {} {}",
+ serverName, region.getTable(), region.getRegionNameAsString(),
+ Bytes.toStringBinary(startKey));
if (startKey.length > 0) {
get = new Get(startKey);
get.setCacheBlocks(false);
@@ -584,23 +589,43 @@ public final class Canary implements Tool {
private boolean useRegExp;
private long timeout = DEFAULT_TIMEOUT;
private boolean failOnError = true;
+
+ /**
+ * True if we are to run in 'regionServer' mode.
+ */
private boolean regionServerMode = false;
+
+ /**
+ * True if we are to run in zookeeper 'mode'.
+ */
private boolean zookeeperMode = false;
+
private long permittedFailures = 0;
private boolean regionServerAllRegions = false;
private boolean writeSniffing = false;
private long configuredWriteTableTimeout = DEFAULT_TIMEOUT;
private boolean treatFailureAsError = false;
private TableName writeTableName = DEFAULT_WRITE_TABLE_NAME;
+
+ /**
+ * This is a Map of table to timeout. The timeout is for reading all regions in the table; i.e.
+ * we aggregate time to fetch each region and it needs to be less than this value else we
+ * log an ERROR.
+ */
private HashMap configuredReadTableTimeouts = new HashMap<>();
private ExecutorService executor; // threads to retrieve data from regionservers
public Canary() {
- this(new ScheduledThreadPoolExecutor(1), new RegionServerStdOutSink());
+ this(new ScheduledThreadPoolExecutor(1));
}
- public Canary(ExecutorService executor, Sink sink) {
+ public Canary(ExecutorService executor) {
+ this(executor, null);
+ }
+
+ @VisibleForTesting
+ Canary(ExecutorService executor, Sink sink) {
this.executor = executor;
this.sink = sink;
}
@@ -628,7 +653,7 @@ public final class Canary implements Tool {
printUsageAndExit();
}
- if (cmd.equals("-help")) {
+ if (cmd.equals("-help") || cmd.equals("-h")) {
// user asked for help, print the help and quit.
printUsageAndExit();
} else if (cmd.equals("-daemon") && interval == 0) {
@@ -639,7 +664,7 @@ public final class Canary implements Tool {
i++;
if (i == args.length) {
- System.err.println("-interval needs a numeric value argument.");
+ System.err.println("-interval takes a numeric seconds value argument.");
printUsageAndExit();
}
@@ -657,7 +682,7 @@ public final class Canary implements Tool {
this.regionServerAllRegions = true;
} else if(cmd.equals("-writeSniffing")) {
this.writeSniffing = true;
- } else if(cmd.equals("-treatFailureAsError")) {
+ } else if(cmd.equals("-treatFailureAsError") || cmd.equals("-failureAsError")) {
this.treatFailureAsError = true;
} else if (cmd.equals("-e")) {
this.useRegExp = true;
@@ -665,35 +690,35 @@ public final class Canary implements Tool {
i++;
if (i == args.length) {
- System.err.println("-t needs a numeric value argument.");
+ System.err.println("-t takes a numeric milliseconds value argument.");
printUsageAndExit();
}
try {
this.timeout = Long.parseLong(args[i]);
} catch (NumberFormatException e) {
- System.err.println("-t needs a numeric value argument.");
+ System.err.println("-t takes a numeric milliseconds value argument.");
printUsageAndExit();
}
} else if(cmd.equals("-writeTableTimeout")) {
i++;
if (i == args.length) {
- System.err.println("-writeTableTimeout needs a numeric value argument.");
+ System.err.println("-writeTableTimeout takes a numeric milliseconds value argument.");
printUsageAndExit();
}
try {
this.configuredWriteTableTimeout = Long.parseLong(args[i]);
} catch (NumberFormatException e) {
- System.err.println("-writeTableTimeout needs a numeric value argument.");
+ System.err.println("-writeTableTimeout takes a numeric milliseconds value argument.");
printUsageAndExit();
}
} else if (cmd.equals("-writeTable")) {
i++;
if (i == args.length) {
- System.err.println("-writeTable needs a string value argument.");
+ System.err.println("-writeTable takes a string tablename value argument.");
printUsageAndExit();
}
this.writeTableName = TableName.valueOf(args[i]);
@@ -711,14 +736,16 @@ public final class Canary implements Tool {
i++;
if (i == args.length) {
- System.err.println("-readTableTimeouts needs a comma-separated list of read timeouts per table (without spaces).");
+ System.err.println("-readTableTimeouts needs a comma-separated list of read " +
+ "millisecond timeouts per table (without spaces).");
printUsageAndExit();
}
String [] tableTimeouts = args[i].split(",");
for (String tT: tableTimeouts) {
String [] nameTimeout = tT.split("=");
if (nameTimeout.length < 2) {
- System.err.println("Each -readTableTimeouts argument must be of the form =.");
+ System.err.println("Each -readTableTimeouts argument must be of the form " +
+ "= (without spaces).");
printUsageAndExit();
}
long timeoutVal = 0L;
@@ -856,41 +883,56 @@ public final class Canary implements Tool {
private void printUsageAndExit() {
System.err.println(
- "Usage: hbase canary [opts] [table1 [table2]...] | [regionserver1 [regionserver2]..]");
- System.err.println(" where [opts] are:");
- System.err.println(" -help Show this help and exit.");
- System.err.println(" -regionserver replace the table argument to regionserver,");
- System.err.println(" which means to enable regionserver mode");
- System.err.println(" -allRegions Tries all regions on a regionserver,");
- System.err.println(" only works in regionserver mode.");
- System.err.println(" -zookeeper Tries to grab zookeeper.znode.parent ");
- System.err.println(" on each zookeeper instance");
- System.err.println(" -permittedZookeeperFailures Ignore first N failures when attempting to ");
- System.err.println(" connect to individual zookeeper nodes in the ensemble");
- System.err.println(" -daemon Continuous check at defined intervals.");
- System.err.println(" -interval Interval between checks (sec)");
- System.err.println(" -e Use table/regionserver as regular expression");
- System.err.println(" which means the table/regionserver is regular expression pattern");
- System.err.println(" -f stop whole program if first error occurs," +
- " default is true");
- System.err.println(" -t timeout for a check, default is 600000 (millisecs)");
- System.err.println(" -writeTableTimeout write timeout for the writeTable, default is 600000 (millisecs)");
- System.err.println(" -readTableTimeouts =,=, ... "
- + "comma-separated list of read timeouts per table (no spaces), default is 600000 (millisecs)");
- System.err.println(" -writeSniffing enable the write sniffing in canary");
- System.err.println(" -treatFailureAsError treats read / write failure as error");
- System.err.println(" -writeTable The table used for write sniffing."
- + " Default is hbase:canary");
- System.err.println(" -Dhbase.canary.read.raw.enabled= Use this flag to enable or disable raw scan during read canary test"
- + " Default is false and raw is not enabled during scan");
- System.err
- .println(" -D= assigning or override the configuration params");
+ "Usage: canary [OPTIONS] [ [ [ interval between checks in seconds");
+ System.err.println(" -e consider table/regionserver argument as regular " +
+ "expression");
+ System.err.println(" -f exit on first error; default=true");
+ System.err.println(" -failureAsError treat read/write failure as error");
+ System.err.println(" -t timeout for canary-test run; default=600000ms");
+ System.err.println(" -writeSniffing enable write sniffing");
+ System.err.println(" -writeTable the table used for write sniffing; default=hbase:canary");
+ System.err.println(" -writeTableTimeout timeout for writeTable; default=600000ms");
+ System.err.println(" -readTableTimeouts =," +
+ "=,...");
+ System.err.println(" comma-separated list of table read timeouts " +
+ "(no spaces);");
+ System.err.println(" logs 'ERROR' if takes longer. default=600000ms");
+ System.err.println(" -permittedZookeeperFailures Ignore first N failures attempting to ");
+ System.err.println(" connect to individual zookeeper nodes in ensemble");
+ System.err.println("");
+ System.err.println(" -D= to assign or override configuration params");
+ System.err.println(" -Dhbase.canary.read.raw.enabled= Set to enable/disable " +
+ "raw scan; default=false");
+ System.err.println("");
+ System.err.println("Canary runs in one of three modes: region (default), regionserver, or " +
+ "zookeeper.");
+ System.err.println("To sniff/probe all regions, pass no arguments.");
+ System.err.println("To sniff/probe all regions of a table, pass tablename.");
+ System.err.println("To sniff/probe regionservers, pass -regionserver, etc.");
+ System.err.println("See http://hbase.apache.org/book.html#_canary for Canary documentation.");
System.exit(USAGE_EXIT_CODE);
}
+ Sink getSink(Configuration configuration, Class clazz) {
+ // In test context, this.sink might be set. Use it if non-null. For testing.
+ return this.sink != null? this.sink:
+ (Sink)ReflectionUtils.newInstance(configuration.getClass("hbase.canary.sink.class",
+ clazz, Sink.class));
+ }
+
/**
* A Factory method for {@link Monitor}.
- * Can be overridden by user.
+ * Makes a RegionServerMonitor, or a ZooKeeperMonitor, or a RegionMonitor.
* @param index a start index for monitor target
* @param args args passed from user
* @return a Monitor instance
@@ -899,37 +941,45 @@ public final class Canary implements Tool {
Monitor monitor = null;
String[] monitorTargets = null;
- if(index >= 0) {
+ if (index >= 0) {
int length = args.length - index;
monitorTargets = new String[length];
System.arraycopy(args, index, monitorTargets, 0, length);
}
- if (this.sink instanceof RegionServerStdOutSink || this.regionServerMode) {
+ if (this.regionServerMode) {
monitor =
new RegionServerMonitor(connection, monitorTargets, this.useRegExp,
- (StdOutSink) this.sink, this.executor, this.regionServerAllRegions,
+ getSink(connection.getConfiguration(), RegionServerStdOutSink.class),
+ this.executor, this.regionServerAllRegions,
this.treatFailureAsError, this.permittedFailures);
- } else if (this.sink instanceof ZookeeperStdOutSink || this.zookeeperMode) {
+ } else if (this.zookeeperMode) {
monitor =
new ZookeeperMonitor(connection, monitorTargets, this.useRegExp,
- (StdOutSink) this.sink, this.executor, this.treatFailureAsError,
+ getSink(connection.getConfiguration(), ZookeeperStdOutSink.class),
+ this.executor, this.treatFailureAsError,
this.permittedFailures);
} else {
monitor =
new RegionMonitor(connection, monitorTargets, this.useRegExp,
- (StdOutSink) this.sink, this.executor, this.writeSniffing,
+ getSink(connection.getConfiguration(), RegionStdOutSink.class),
+ this.executor, this.writeSniffing,
this.writeTableName, this.treatFailureAsError, this.configuredReadTableTimeouts,
this.configuredWriteTableTimeout, this.permittedFailures);
}
return monitor;
}
- // a Monitor super-class can be extended by users
+ /**
+ * A Monitor super-class can be extended by users
+ */
public static abstract class Monitor implements Runnable, Closeable {
-
protected Connection connection;
protected Admin admin;
+ /**
+ * 'Target' dependent on 'mode'. Could be Tables or RegionServers or ZNodes.
+ * Passed on the command-line as arguments.
+ */
protected String[] targets;
protected boolean useRegExp;
protected boolean treatFailureAsError;
@@ -999,7 +1049,9 @@ public final class Canary implements Tool {
}
}
- // a monitor for region mode
+ /**
+ * A monitor for region mode.
+ */
private static class RegionMonitor extends Monitor {
// 10 minutes
private static final int DEFAULT_WRITE_TABLE_CHECK_PERIOD = 10 * 60 * 1000;
@@ -1014,14 +1066,22 @@ public final class Canary implements Tool {
private float regionsUpperLimit;
private int checkPeriod;
private boolean rawScanEnabled;
+
+ /**
+ * This is a timeout per table. If read of each region in the table aggregated takes longer
+ * than what is configured here, we log an ERROR rather than just an INFO.
+ */
private HashMap configuredReadTableTimeouts;
+
private long configuredWriteTableTimeout;
public RegionMonitor(Connection connection, String[] monitorTargets, boolean useRegExp,
- StdOutSink sink, ExecutorService executor, boolean writeSniffing, TableName writeTableName,
- boolean treatFailureAsError, HashMap configuredReadTableTimeouts, long configuredWriteTableTimeout,
+ Sink sink, ExecutorService executor, boolean writeSniffing, TableName writeTableName,
+ boolean treatFailureAsError, HashMap configuredReadTableTimeouts,
+ long configuredWriteTableTimeout,
long allowedFailures) {
- super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError, allowedFailures);
+ super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError,
+ allowedFailures);
Configuration conf = connection.getConfiguration();
this.writeSniffing = writeSniffing;
this.writeTableName = writeTableName;
@@ -1054,9 +1114,12 @@ public final class Canary implements Tool {
RegionStdOutSink regionSink = this.getSink();
if (this.targets != null && this.targets.length > 0) {
String[] tables = generateMonitorTables(this.targets);
- // Check to see that each table name passed in the -readTableTimeouts argument is also passed as a monitor target.
- if (! new HashSet<>(Arrays.asList(tables)).containsAll(this.configuredReadTableTimeouts.keySet())) {
- LOG.error("-readTableTimeouts can only specify read timeouts for monitor targets passed via command line.");
+ // Check to see that each table name passed in the -readTableTimeouts argument is also
+ // passed as a monitor target.
+ if (!new HashSet<>(Arrays.asList(tables)).
+ containsAll(this.configuredReadTableTimeouts.keySet())) {
+ LOG.error("-readTableTimeouts can only specify read timeouts for monitor targets " +
+ "passed via command line.");
this.errorCode = USAGE_EXIT_CODE;
return;
}
@@ -1082,7 +1145,7 @@ public final class Canary implements Tool {
// sniff canary table with write operation
regionSink.initializeWriteLatency();
LongAdder writeTableLatency = regionSink.getWriteLatency();
- taskFutures.addAll(Canary.sniff(admin, regionSink, admin.getTableDescriptor(writeTableName),
+ taskFutures.addAll(Canary.sniff(admin, regionSink, admin.getDescriptor(writeTableName),
executor, TaskType.WRITE, this.rawScanEnabled, writeTableLatency));
}
@@ -1099,23 +1162,26 @@ public final class Canary implements Tool {
if (actualReadTableLatency.containsKey(tableName)) {
Long actual = actualReadTableLatency.get(tableName).longValue();
Long configured = entry.getValue();
- LOG.info("Read operation for " + tableName + " took " + actual +
- " ms. The configured read timeout was " + configured + " ms.");
if (actual > configured) {
- LOG.error("Read operation for " + tableName + " exceeded the configured read timeout.");
+ LOG.error("Read operation for {} took {}ms (Configured read timeout {}ms.",
+ tableName, actual, configured);
+ } else {
+ LOG.info("Read operation for {} took {}ms (Configured read timeout {}ms.",
+ tableName, actual, configured);
}
} else {
- LOG.error("Read operation for " + tableName + " failed!");
+ LOG.error("Read operation for {} failed!", tableName);
}
}
if (this.writeSniffing) {
String writeTableStringName = this.writeTableName.getNameAsString();
long actualWriteLatency = regionSink.getWriteLatency().longValue();
- LOG.info("Write operation for " + writeTableStringName + " took " + actualWriteLatency + " ms. The configured write timeout was " +
- this.configuredWriteTableTimeout + " ms.");
+ LOG.info("Write operation for {} took {}ms. Configured write timeout {}ms.",
+ writeTableStringName, actualWriteLatency, this.configuredWriteTableTimeout);
// Check that the writeTable write operation latency does not exceed the configured timeout.
if (actualWriteLatency > this.configuredWriteTableTimeout) {
- LOG.error("Write operation for " + writeTableStringName + " exceeded the configured write timeout.");
+ LOG.error("Write operation for {} exceeded the configured write timeout.",
+ writeTableStringName);
}
}
} catch (Exception e) {
@@ -1123,31 +1189,32 @@ public final class Canary implements Tool {
this.errorCode = ERROR_EXIT_CODE;
} finally {
this.done = true;
- }
+ }
}
this.done = true;
}
+ /**
+ * @return List of tables to use in test.
+ */
private String[] generateMonitorTables(String[] monitorTargets) throws IOException {
String[] returnTables = null;
if (this.useRegExp) {
Pattern pattern = null;
- HTableDescriptor[] tds = null;
+ TableDescriptor[] tds = null;
Set tmpTables = new TreeSet<>();
try {
- if (LOG.isDebugEnabled()) {
- LOG.debug(String.format("reading list of tables"));
- }
+ LOG.debug(String.format("reading list of tables"));
tds = this.admin.listTables(pattern);
if (tds == null) {
- tds = new HTableDescriptor[0];
+ tds = new TableDescriptor[0];
}
for (String monitorTarget : monitorTargets) {
pattern = Pattern.compile(monitorTarget);
- for (HTableDescriptor td : tds) {
- if (pattern.matcher(td.getNameAsString()).matches()) {
- tmpTables.add(td.getNameAsString());
+ for (TableDescriptor td : tds) {
+ if (pattern.matcher(td.getTableName().getNameAsString()).matches()) {
+ tmpTables.add(td.getTableName().getNameAsString());
}
}
}
@@ -1172,18 +1239,19 @@ public final class Canary implements Tool {
}
/*
- * canary entry point to monitor all the tables.
+ * Canary entry point to monitor all the tables.
*/
- private List> sniff(TaskType taskType, RegionStdOutSink regionSink) throws Exception {
- if (LOG.isDebugEnabled()) {
- LOG.debug(String.format("reading list of tables"));
- }
+ private List> sniff(TaskType taskType, RegionStdOutSink regionSink)
+ throws Exception {
+ LOG.debug("Reading list of tables");
List> taskFutures = new LinkedList<>();
- for (HTableDescriptor table : admin.listTables()) {
- if (admin.isTableEnabled(table.getTableName())
- && (!table.getTableName().equals(writeTableName))) {
- LongAdder readLatency = regionSink.initializeAndGetReadLatencyForTable(table.getNameAsString());
- taskFutures.addAll(Canary.sniff(admin, sink, table, executor, taskType, this.rawScanEnabled, readLatency));
+ for (TableDescriptor td: admin.listTableDescriptors()) {
+ if (admin.isTableEnabled(td.getTableName()) &&
+ (!td.getTableName().equals(writeTableName))) {
+ LongAdder readLatency =
+ regionSink.initializeAndGetReadLatencyForTable(td.getTableName().getNameAsString());
+ taskFutures.addAll(Canary.sniff(admin, sink, td, executor, taskType, this.rawScanEnabled,
+ readLatency));
}
}
return taskFutures;
@@ -1231,11 +1299,10 @@ public final class Canary implements Tool {
private void createWriteTable(int numberOfServers) throws IOException {
int numberOfRegions = (int)(numberOfServers * regionsLowerLimit);
- LOG.info("Number of live regionservers: " + numberOfServers + ", "
- + "pre-splitting the canary table into " + numberOfRegions + " regions "
- + "(current lower limit of regions per server is " + regionsLowerLimit
- + " and you can change it by config: "
- + HConstants.HBASE_CANARY_WRITE_PERSERVER_REGIONS_LOWERLIMIT_KEY + " )");
+ LOG.info("Number of live regionservers {}, pre-splitting the canary table into {} regions " +
+ "(current lower limit of regions per server is {} and you can change it with config {}).",
+ numberOfServers, numberOfRegions, regionsLowerLimit,
+ HConstants.HBASE_CANARY_WRITE_PERSERVER_REGIONS_LOWERLIMIT_KEY);
HTableDescriptor desc = new HTableDescriptor(writeTableName);
HColumnDescriptor family = new HColumnDescriptor(CANARY_TABLE_FAMILY_NAME);
family.setMaxVersions(1);
@@ -1252,59 +1319,40 @@ public final class Canary implements Tool {
* @throws Exception
*/
private static List> sniff(final Admin admin, final Sink sink, String tableName,
- ExecutorService executor, TaskType taskType, boolean rawScanEnabled, LongAdder readLatency) throws Exception {
- if (LOG.isDebugEnabled()) {
- LOG.debug(String.format("checking table is enabled and getting table descriptor for table %s",
- tableName));
- }
+ ExecutorService executor, TaskType taskType, boolean rawScanEnabled, LongAdder readLatency)
+ throws Exception {
+ LOG.debug("Checking table is enabled and getting table descriptor for table {}", tableName);
if (admin.isTableEnabled(TableName.valueOf(tableName))) {
- return Canary.sniff(admin, sink, admin.getTableDescriptor(TableName.valueOf(tableName)),
+ return Canary.sniff(admin, sink, admin.getDescriptor(TableName.valueOf(tableName)),
executor, taskType, rawScanEnabled, readLatency);
} else {
- LOG.warn(String.format("Table %s is not enabled", tableName));
+ LOG.warn("Table {} is not enabled", tableName);
}
return new LinkedList<>();
}
/*
- * Loops over regions that owns this table, and output some information about the state.
+ * Loops over regions of this table, and outputs information about the state.
*/
private static List> sniff(final Admin admin, final Sink sink,
- HTableDescriptor tableDesc, ExecutorService executor, TaskType taskType,
+ TableDescriptor tableDesc, ExecutorService executor, TaskType taskType,
boolean rawScanEnabled, LongAdder rwLatency) throws Exception {
-
- if (LOG.isDebugEnabled()) {
- LOG.debug(String.format("reading list of regions for table %s", tableDesc.getTableName()));
- }
-
- Table table = null;
- try {
- table = admin.getConnection().getTable(tableDesc.getTableName());
+ LOG.debug("Reading list of regions for table {}", tableDesc.getTableName());
+ try (Table table = admin.getConnection().getTable(tableDesc.getTableName())) {
+ List tasks = new ArrayList<>();
+ try (RegionLocator regionLocator =
+ admin.getConnection().getRegionLocator(tableDesc.getTableName())) {
+ for (HRegionLocation location: regionLocator.getAllRegionLocations()) {
+ ServerName rs = location.getServerName();
+ RegionInfo region = location.getRegion();
+ tasks.add(new RegionTask(admin.getConnection(), region, rs, (RegionStdOutSink)sink,
+ taskType, rawScanEnabled, rwLatency));
+ }
+ return executor.invokeAll(tasks);
+ }
} catch (TableNotFoundException e) {
- return new ArrayList<>();
+ return Collections.EMPTY_LIST;
}
- finally {
- if (table !=null) {
- table.close();
- }
- }
-
- List tasks = new ArrayList<>();
- RegionLocator regionLocator = null;
- try {
- regionLocator = admin.getConnection().getRegionLocator(tableDesc.getTableName());
- for (HRegionLocation location : regionLocator.getAllRegionLocations()) {
- ServerName rs = location.getServerName();
- RegionInfo region = location.getRegionInfo();
- tasks.add(new RegionTask(admin.getConnection(), region, rs, (RegionStdOutSink) sink, taskType, rawScanEnabled,
- rwLatency));
- }
- } finally {
- if (regionLocator != null) {
- regionLocator.close();
- }
- }
- return executor.invokeAll(tasks);
}
// monitor for zookeeper mode
@@ -1314,8 +1362,9 @@ public final class Canary implements Tool {
private final int timeout;
protected ZookeeperMonitor(Connection connection, String[] monitorTargets, boolean useRegExp,
- StdOutSink sink, ExecutorService executor, boolean treatFailureAsError, long allowedFailures) {
- super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError, allowedFailures);
+ Sink sink, ExecutorService executor, boolean treatFailureAsError, long allowedFailures) {
+ super(connection, monitorTargets, useRegExp,
+ sink, executor, treatFailureAsError, allowedFailures);
Configuration configuration = connection.getConfiguration();
znode =
configuration.get(ZOOKEEPER_ZNODE_PARENT,
@@ -1374,15 +1423,17 @@ public final class Canary implements Tool {
}
- // a monitor for regionserver mode
+ /**
+ * A monitor for regionserver mode
+ */
private static class RegionServerMonitor extends Monitor {
-
private boolean allRegions;
public RegionServerMonitor(Connection connection, String[] monitorTargets, boolean useRegExp,
- StdOutSink sink, ExecutorService executor, boolean allRegions,
+ Sink sink, ExecutorService executor, boolean allRegions,
boolean treatFailureAsError, long allowedFailures) {
- super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError, allowedFailures);
+ super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError,
+ allowedFailures);
this.allRegions = allRegions;
}
@@ -1413,10 +1464,7 @@ public final class Canary implements Tool {
private boolean checkNoTableNames() {
List foundTableNames = new ArrayList<>();
TableName[] tableNames = null;
-
- if (LOG.isDebugEnabled()) {
- LOG.debug(String.format("reading list of tables"));
- }
+ LOG.debug("Reading list of tables");
try {
tableNames = this.admin.listTableNames();
} catch (IOException e) {
@@ -1452,7 +1500,7 @@ public final class Canary implements Tool {
AtomicLong successes = new AtomicLong(0);
successMap.put(serverName, successes);
if (entry.getValue().isEmpty()) {
- LOG.error(String.format("Regionserver not serving any regions - %s", serverName));
+ LOG.error("Regionserver not serving any regions - {}", serverName);
} else if (this.allRegions) {
for (RegionInfo region : entry.getValue()) {
tasks.add(new RegionServerTask(this.connection,
@@ -1483,8 +1531,8 @@ public final class Canary implements Tool {
if (this.allRegions) {
for (Map.Entry> entry : rsAndRMap.entrySet()) {
String serverName = entry.getKey();
- LOG.info("Successfully read " + successMap.get(serverName) + " regions out of "
- + entry.getValue().size() + " on regionserver:" + serverName);
+ LOG.info("Successfully read {} regions out of {} on regionserver {}",
+ successMap.get(serverName), entry.getValue().size(), serverName);
}
}
} catch (InterruptedException e) {
@@ -1501,36 +1549,30 @@ public final class Canary implements Tool {
private Map> getAllRegionServerByName() {
Map> rsAndRMap = new HashMap<>();
- Table table = null;
- RegionLocator regionLocator = null;
try {
- if (LOG.isDebugEnabled()) {
- LOG.debug(String.format("reading list of tables and locations"));
- }
- HTableDescriptor[] tableDescs = this.admin.listTables();
+ LOG.debug("Reading list of tables and locations");
+ List tableDescs = this.admin.listTableDescriptors();
List regions = null;
- for (HTableDescriptor tableDesc : tableDescs) {
- table = this.admin.getConnection().getTable(tableDesc.getTableName());
- regionLocator = this.admin.getConnection().getRegionLocator(tableDesc.getTableName());
-
- for (HRegionLocation location : regionLocator.getAllRegionLocations()) {
- ServerName rs = location.getServerName();
- String rsName = rs.getHostname();
- RegionInfo r = location.getRegionInfo();
-
- if (rsAndRMap.containsKey(rsName)) {
- regions = rsAndRMap.get(rsName);
- } else {
- regions = new ArrayList<>();
- rsAndRMap.put(rsName, regions);
+ for (TableDescriptor tableDesc: tableDescs) {
+ try (RegionLocator regionLocator =
+ this.admin.getConnection().getRegionLocator(tableDesc.getTableName())) {
+ for (HRegionLocation location : regionLocator.getAllRegionLocations()) {
+ ServerName rs = location.getServerName();
+ String rsName = rs.getHostname();
+ RegionInfo r = location.getRegion();
+ if (rsAndRMap.containsKey(rsName)) {
+ regions = rsAndRMap.get(rsName);
+ } else {
+ regions = new ArrayList<>();
+ rsAndRMap.put(rsName, regions);
+ }
+ regions.add(r);
}
- regions.add(r);
}
- table.close();
}
// get any live regionservers not serving any regions
- for (ServerName rs : this.admin.getClusterMetrics(EnumSet.of(Option.LIVE_SERVERS))
+ for (ServerName rs: this.admin.getClusterMetrics(EnumSet.of(Option.LIVE_SERVERS))
.getLiveServerMetrics().keySet()) {
String rsName = rs.getHostname();
if (!rsAndRMap.containsKey(rsName)) {
@@ -1538,19 +1580,9 @@ public final class Canary implements Tool {
}
}
} catch (IOException e) {
- String msg = "Get HTables info failed";
- LOG.error(msg, e);
+ LOG.error("Get HTables info failed", e);
this.errorCode = INIT_ERROR_EXIT_CODE;
- } finally {
- if (table != null) {
- try {
- table.close();
- } catch (IOException e) {
- LOG.warn("Close table failed", e);
- }
- }
}
-
return rsAndRMap;
}
@@ -1576,13 +1608,13 @@ public final class Canary implements Tool {
}
}
if (!regExpFound) {
- LOG.info("No RegionServerInfo found, regionServerPattern:" + rsName);
+ LOG.info("No RegionServerInfo found, regionServerPattern {}", rsName);
}
} else {
if (fullRsAndRMap.containsKey(rsName)) {
filteredRsAndRMap.put(rsName, fullRsAndRMap.get(rsName));
} else {
- LOG.info("No RegionServerInfo found, regionServerName:" + rsName);
+ LOG.info("No RegionServerInfo found, regionServerName {}", rsName);
}
}
}
@@ -1596,20 +1628,19 @@ public final class Canary implements Tool {
public static void main(String[] args) throws Exception {
final Configuration conf = HBaseConfiguration.create();
- // loading the generic options to conf
+ // Loading the generic options to conf
new GenericOptionsParser(conf, args);
int numThreads = conf.getInt("hbase.canary.threads.num", MAX_THREADS_NUM);
- LOG.info("Number of execution threads " + numThreads);
+ LOG.info("Execution thread count={}", numThreads);
+ int exitCode = 0;
ExecutorService executor = new ScheduledThreadPoolExecutor(numThreads);
-
- Class extends Sink> sinkClass =
- conf.getClass("hbase.canary.sink.class", RegionServerStdOutSink.class, Sink.class);
- Sink sink = ReflectionUtils.newInstance(sinkClass);
-
- int exitCode = ToolRunner.run(conf, new Canary(executor, sink), args);
- executor.shutdown();
+ try {
+ exitCode = ToolRunner.run(conf, new Canary(executor), args);
+ } finally {
+ executor.shutdown();
+ }
System.exit(exitCode);
}
}
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/tool/TestCanaryTool.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/tool/TestCanaryTool.java
index cdbf42623c7..6c23764a46f 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/tool/TestCanaryTool.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/tool/TestCanaryTool.java
@@ -35,6 +35,7 @@ import org.apache.hadoop.hbase.*;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor;
import org.apache.hadoop.hbase.client.Put;
+import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.testclassification.MediumTests;
import org.apache.hadoop.hbase.util.Bytes;
@@ -114,11 +115,11 @@ public class TestCanaryTool {
ExecutorService executor = new ScheduledThreadPoolExecutor(1);
Canary.RegionStdOutSink sink = spy(new Canary.RegionStdOutSink());
Canary canary = new Canary(executor, sink);
- String[] args = { "-writeSniffing", "-t", "10000", name.getMethodName() };
+ String[] args = { "-writeSniffing", "-t", "10000", tableName.getNameAsString() };
assertEquals(0, ToolRunner.run(testingUtility.getConfiguration(), canary, args));
assertEquals("verify no read error count", 0, canary.getReadFailures().size());
assertEquals("verify no write error count", 0, canary.getWriteFailures().size());
- verify(sink, atLeastOnce()).publishReadTiming(isA(ServerName.class), isA(HRegionInfo.class),
+ verify(sink, atLeastOnce()).publishReadTiming(isA(ServerName.class), isA(RegionInfo.class),
isA(ColumnFamilyDescriptor.class), anyLong());
}
@@ -144,7 +145,8 @@ public class TestCanaryTool {
Canary canary = new Canary(executor, sink);
String configuredTimeoutStr = tableNames[0].getNameAsString() + "=" + Long.MAX_VALUE + "," +
tableNames[1].getNameAsString() + "=0";
- String[] args = { "-readTableTimeouts", configuredTimeoutStr, name.getMethodName() + "1", name.getMethodName() + "2"};
+ String[] args = {"-readTableTimeouts", configuredTimeoutStr, name.getMethodName() + "1",
+ name.getMethodName() + "2"};
assertEquals(0, ToolRunner.run(testingUtility.getConfiguration(), canary, args));
verify(sink, times(tableNames.length)).initializeAndGetReadLatencyForTable(isA(String.class));
for (int i=0; i<2; i++) {
@@ -231,7 +233,7 @@ public class TestCanaryTool {
conf.setBoolean(HConstants.HBASE_CANARY_READ_RAW_SCAN_KEY, true);
assertEquals(0, ToolRunner.run(conf, canary, args));
verify(sink, atLeastOnce())
- .publishReadTiming(isA(ServerName.class), isA(HRegionInfo.class),
+ .publishReadTiming(isA(ServerName.class), isA(RegionInfo.class),
isA(ColumnFamilyDescriptor.class), anyLong());
assertEquals("verify no read error count", 0, canary.getReadFailures().size());
}
diff --git a/src/main/asciidoc/_chapters/ops_mgt.adoc b/src/main/asciidoc/_chapters/ops_mgt.adoc
index 0d0df588947..5645af52b82 100644
--- a/src/main/asciidoc/_chapters/ops_mgt.adoc
+++ b/src/main/asciidoc/_chapters/ops_mgt.adoc
@@ -82,45 +82,54 @@ Others, such as `hbase shell` (<>), `hbase upgrade` (<>), and
=== Canary
-There is a Canary class can help users to canary-test the HBase cluster status, with every column-family for every regions or RegionServer's granularity.
-To see the usage, use the `-help` parameter.
+The Canary tool can help users "canary-test" the HBase cluster status.
+The default "region mode" fetches a row from every column-family of every regions.
+In "regionserver mode", the Canary tool will fetch a row from a random
+region on each of the cluster's RegionServers. In "zookeeper mode", the
+Canary will read the root znode on each member of the zookeeper ensemble.
+
+To see usage, pass the `-help` parameter (if you pass no
+parameters, the Canary tool starts executing in the default
+region "mode" fetching a row from every region in the cluster).
----
-$ ${HBASE_HOME}/bin/hbase canary -help
+2018-10-16 13:11:27,037 INFO [main] tool.Canary: Execution thread count=16
+Usage: canary [OPTIONS] [ [ [ interval between checks in seconds
+ -e consider table/regionserver argument as regular expression
+ -f exit on first error; default=true
+ -failureAsError treat read/write failure as error
+ -t timeout for canary-test run; default=600000ms
+ -writeSniffing enable write sniffing
+ -writeTable the table used for write sniffing; default=hbase:canary
+ -writeTableTimeout timeout for writeTable; default=600000ms
+ -readTableTimeouts =,=,...
+ comma-separated list of table read timeouts (no spaces);
+ logs 'ERROR' if takes longer. default=600000ms
+ -permittedZookeeperFailures Ignore first N failures attempting to
+ connect to individual zookeeper nodes in ensemble
-Usage: hbase canary [opts] [table1 [table2]...] | [regionserver1 [regionserver2]..]
- where [opts] are:
- -help Show this help and exit.
- -regionserver replace the table argument to regionserver,
- which means to enable regionserver mode
- -allRegions Tries all regions on a regionserver,
- only works in regionserver mode.
- -zookeeper Tries to grab zookeeper.znode.parent
- on each zookeeper instance
- -daemon Continuous check at defined intervals.
- -interval Interval between checks (sec)
- -e Use table/regionserver as regular expression
- which means the table/regionserver is regular expression pattern
- -f stop whole program if first error occurs, default is true
- -t timeout for a check, default is 600000 (millisecs)
- -writeTableTimeout write timeout for the writeTable, default is 600000 (millisecs)
- -readTableTimeouts =,=, ... comma-separated list of read timeouts per table (no spaces), default is 600000 (millisecs)
- -writeSniffing enable the write sniffing in canary
- -treatFailureAsError treats read / write failure as error
- -writeTable The table used for write sniffing. Default is hbase:canary
- -Dhbase.canary.read.raw.enabled= Use this flag to enable or disable raw scan during read canary test Default is false and raw is not enabled during scan
- -D= assigning or override the configuration params
+ -D= to assign or override configuration params
+ -Dhbase.canary.read.raw.enabled= Set to enable/disable raw scan; default=false
+
+Canary runs in one of three modes: region (default), regionserver, or zookeeper.
+To sniff/probe all regions, pass no arguments.
+To sniff/probe all regions of a table, pass tablename.
+To sniff/probe regionservers, pass -regionserver, etc.
+See http://hbase.apache.org/book.html#_canary for Canary documentation.
----
[NOTE]
-The `Sink` class is instantiated using the `hbase.canary.sink.class` configuration property which
-will also determine the used Monitor class. In the absence of this property RegionServerStdOutSink
-will be used. You need to use the Sink according to the passed parameters to the _canary_ command.
-As an example you have to set `hbase.canary.sink.class` property to
-`org.apache.hadoop.hbase.tool.Canary$RegionStdOutSink` for using table parameters.
+The `Sink` class is instantiated using the `hbase.canary.sink.class` configuration property.
-This tool will return non zero error codes to user for collaborating with other monitoring tools, such as Nagios.
-The error code definitions are:
+This tool will return non zero error codes to user for collaborating with other monitoring tools,
+such as Nagios. The error code definitions are:
[source,java]
----
@@ -131,9 +140,9 @@ private static final int ERROR_EXIT_CODE = 4;
private static final int FAILURE_EXIT_CODE = 5;
----
-Here are some examples based on the following given case.
-There are two Table objects called test-01 and test-02, they have two column family cf1 and cf2 respectively, and deployed on the 3 RegionServers.
-see following table.
+Here are some examples based on the following given case: given two Table objects called test-01
+and test-02 each with two column family cf1 and cf2 respectively, deployed on 3 RegionServers.
+See the following table.
[cols="1,1,1", options="header"]
|===
@@ -145,7 +154,7 @@ see following table.
| rs3 | r2 | r1
|===
-Following are some examples based on the previous given case.
+Following are some example outputs based on the previous given case.
==== Canary test for every column family (store) of every region of every table
@@ -163,12 +172,13 @@ $ ${HBASE_HOME}/bin/hbase canary
13/12/09 03:26:32 INFO tool.Canary: read from region test-02,0004883,1386559511167.cbda32d5e2e276520712d84eaaa29d84. column family cf2 in 8ms
----
-So you can see, table test-01 has two regions and two column families, so the Canary tool will pick 4 small piece of data from 4 (2 region * 2 store) different stores.
-This is a default behavior of the this tool does.
+So you can see, table test-01 has two regions and two column families, so the Canary tool in the
+default "region mode" will pick 4 small piece of data from 4 (2 region * 2 store) different stores.
+This is a default behavior.
-==== Canary test for every column family (store) of every region of specific table(s)
+==== Canary test for every column family (store) of every region of a specific table(s)
-You can also test one or more specific tables.
+You can also test one or more specific tables by passing table names.
----
$ ${HBASE_HOME}/bin/hbase canary test-01 test-02
@@ -176,7 +186,9 @@ $ ${HBASE_HOME}/bin/hbase canary test-01 test-02
==== Canary test with RegionServer granularity
-This will pick one small piece of data from each RegionServer, and can also put your RegionServer name as input options for canary-test specific RegionServer.
+In "regionserver mode", the Canary tool will pick one small piece of data
+from each RegionServer (You can also pass one or more RegionServer names as arguments
+to the canary-test when in "regionserver mode").
----
$ ${HBASE_HOME}/bin/hbase canary -regionserver
@@ -188,22 +200,25 @@ $ ${HBASE_HOME}/bin/hbase canary -regionserver
==== Canary test with regular expression pattern
-This will test both table test-01 and test-02.
+You can pass regexes for table names when in "region mode" or for servernames when
+in "regionserver mode". The below will test both table test-01 and test-02.
----
$ ${HBASE_HOME}/bin/hbase canary -e test-0[1-2]
----
-==== Run canary test as daemon mode
+==== Run canary test as a "daemon"
-Run repeatedly with interval defined in option `-interval` whose default value is 60 seconds.
-This daemon will stop itself and return non-zero error code if any error occurs, due to the default value of option -f is true.
+Run repeatedly with an interval defined via the option `-interval` (default value is 60 seconds).
+This daemon will stop itself and return non-zero error code if any error occur. To have
+the daemon keep running across errors, pass the -f flag with its value set to false
+(see usage above).
----
$ ${HBASE_HOME}/bin/hbase canary -daemon
----
-Run repeatedly with 5 second intervals and will not stop itself even if errors occur in the test.
+To run repeatedly with 5 second intervals and not stop on errors, do the following.
----
$ ${HBASE_HOME}/bin/hbase canary -daemon -interval 5 -f false
@@ -211,9 +226,11 @@ $ ${HBASE_HOME}/bin/hbase canary -daemon -interval 5 -f false
==== Force timeout if canary test stuck
-In some cases the request is stuck and no response is sent back to the client. This can happen with dead RegionServers which the master has not yet noticed.
-Because of this we provide a timeout option to kill the canary test and return a non-zero error code.
-This run sets the timeout value to 60 seconds, the default value is 600 seconds.
+In some cases the request is stuck and no response is sent back to the client. This
+can happen with dead RegionServers which the master has not yet noticed.
+Because of this we provide a timeout option to kill the canary test and return a
+non-zero error code. The below sets the timeout value to 60 seconds (the default value
+is 600 seconds).
----
$ ${HBASE_HOME}/bin/hbase canary -t 60000
@@ -221,36 +238,37 @@ $ ${HBASE_HOME}/bin/hbase canary -t 60000
==== Enable write sniffing in canary
-By default, the canary tool only check the read operations, it's hard to find the problem in the
-write path. To enable the write sniffing, you can run canary with the `-writeSniffing` option.
-When the write sniffing is enabled, the canary tool will create an hbase table and make sure the
-regions of the table distributed on all region servers. In each sniffing period, the canary will
-try to put data to these regions to check the write availability of each region server.
+By default, the canary tool only checks read operations. To enable the write sniffing,
+you can run the canary with the `-writeSniffing` option set. When write sniffing is
+enabled, the canary tool will create an hbase table and make sure the
+regions of the table are distributed to all region servers. In each sniffing period,
+the canary will try to put data to these regions to check the write availability of
+each region server.
----
$ ${HBASE_HOME}/bin/hbase canary -writeSniffing
----
-The default write table is `hbase:canary` and can be specified by the option `-writeTable`.
+The default write table is `hbase:canary` and can be specified with the option `-writeTable`.
----
$ ${HBASE_HOME}/bin/hbase canary -writeSniffing -writeTable ns:canary
----
-The default value size of each put is 10 bytes and you can set it by the config key:
+The default value size of each put is 10 bytes. You can set it via the config key:
`hbase.canary.write.value.size`.
==== Treat read / write failure as error
-By default, the canary tool only logs read failure, due to e.g. RetriesExhaustedException,
-while returning normal exit code. To treat read / write failure as error, you can run canary
-with the `-treatFailureAsError` option. When enabled, read / write failure would result in error
-exit code.
+By default, the canary tool only logs read failures -- due to e.g. RetriesExhaustedException, etc. --
+and will return the 'normal' exit code. To treat read/write failure as errors, you can run canary
+with the `-treatFailureAsError` option. When enabled, read/write failures will result in an
+error exit code.
----
$ ${HBASE_HOME}/bin/hbase canary -treatFailureAsError
----
==== Running Canary in a Kerberos-enabled Cluster
-To run Canary in a Kerberos-enabled cluster, configure the following two properties in _hbase-site.xml_:
+To run the Canary in a Kerberos-enabled cluster, configure the following two properties in _hbase-site.xml_:
* `hbase.client.keytab.file`
* `hbase.client.kerberos.principal`