HBASE-21320 [canary] Cleanup of usage and add commentary

Signed-off-by: Peter Somogyi <psomogyi@cloudera.com>
Michael Stack 2018-10-15 22:09:17 -07:00
parent 1a0e1039a2
commit c67f7f14e2
4 changed files with 380 additions and 328 deletions

hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java

@@ -1367,6 +1367,7 @@ public final class HConstants {
       "hbase.regionserver.region.split.threads.max";
 
   /** Canary config keys */
+  // TODO: Move these defines to Canary Class
   public static final String HBASE_CANARY_WRITE_DATA_TTL_KEY = "hbase.canary.write.data.ttl";
 
   public static final String HBASE_CANARY_WRITE_PERSERVER_REGIONS_LOWERLIMIT_KEY =

hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java

@@ -90,6 +90,7 @@ import org.apache.hadoop.hbase.zookeeper.ZKConfig;
 import org.apache.hadoop.util.GenericOptionsParser;
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
+import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;
 import org.apache.yetus.audience.InterfaceAudience;
 import org.apache.zookeeper.KeeperException;
 import org.apache.zookeeper.ZooKeeper;
@@ -101,39 +102,45 @@ import org.slf4j.LoggerFactory;
 import org.apache.hbase.thirdparty.com.google.common.collect.Lists;
 
 /**
- * HBase Canary Tool, that that can be used to do
- * "canary monitoring" of a running HBase cluster.
+ * HBase Canary Tool for "canary monitoring" of a running HBase cluster.
  *
- * Here are three modes
- * 1. region mode - Foreach region tries to get one row per column family
- * and outputs some information about failure or latency.
+ * There are three modes:
+ * <ol>
+ * <li>region mode (Default): For each region, try to get one row per column family outputting
+ * information on failure (ERROR) or else the latency.
+ * </li>
  *
- * 2. regionserver mode - Foreach regionserver tries to get one row from one table
- * selected randomly and outputs some information about failure or latency.
+ * <li>regionserver mode: For each regionserver try to get one row from one table selected
+ * randomly outputting information on failure (ERROR) or else the latency.
+ * </li>
  *
- * 3. zookeeper mode - for each zookeeper instance, selects a zNode and
- * outputs some information about failure or latency.
+ * <li>zookeeper mode: for each zookeeper instance, selects a znode outputting information on
+ * failure (ERROR) or else the latency.
+ * </li>
+ * </ol>
  */
 @InterfaceAudience.Private
 public final class Canary implements Tool {
-  // Sink interface used by the canary to outputs information
+  /**
+   * Sink interface used by the canary to output information
+   */
   public interface Sink {
-    public long getReadFailureCount();
-    public long incReadFailureCount();
-    public Map<String,String> getReadFailures();
-    public void updateReadFailures(String regionName, String serverName);
-    public long getWriteFailureCount();
-    public long incWriteFailureCount();
-    public Map<String,String> getWriteFailures();
-    public void updateWriteFailures(String regionName, String serverName);
+    long getReadFailureCount();
+    long incReadFailureCount();
+    Map<String,String> getReadFailures();
+    void updateReadFailures(String regionName, String serverName);
+    long getWriteFailureCount();
+    long incWriteFailureCount();
+    Map<String,String> getWriteFailures();
+    void updateWriteFailures(String regionName, String serverName);
   }
 
-  // Simple implementation of canary sink that allows to plot on
-  // file or standard output timings or failures.
+  /**
+   * Simple implementation of canary sink that allows plotting to a file or standard output.
+   */
   public static class StdOutSink implements Sink {
     private AtomicLong readFailureCount = new AtomicLong(0),
         writeFailureCount = new AtomicLong(0);
     private Map<String, String> readFailures = new ConcurrentHashMap<>();
     private Map<String, String> writeFailures = new ConcurrentHashMap<>();
@ -178,67 +185,75 @@ public final class Canary implements Tool {
} }
} }
/**
* By RegionServer, for 'regionserver' mode.
*/
public static class RegionServerStdOutSink extends StdOutSink { public static class RegionServerStdOutSink extends StdOutSink {
public void publishReadFailure(String table, String server) { public void publishReadFailure(String table, String server) {
incReadFailureCount(); incReadFailureCount();
LOG.error(String.format("Read from table:%s on region server:%s", table, server)); LOG.error("Read from {} on {}", table, server);
} }
public void publishReadTiming(String table, String server, long msTime) { public void publishReadTiming(String table, String server, long msTime) {
LOG.info(String.format("Read from table:%s on region server:%s in %dms", LOG.info("Read from {} on {} in {}ms", table, server, msTime);
table, server, msTime));
} }
} }
/**
* Output for 'zookeeper' mode.
*/
public static class ZookeeperStdOutSink extends StdOutSink { public static class ZookeeperStdOutSink extends StdOutSink {
public void publishReadFailure(String znode, String server) {
public void publishReadFailure(String zNode, String server) {
incReadFailureCount(); incReadFailureCount();
LOG.error(String.format("Read from zNode:%s on zookeeper instance:%s", zNode, server)); LOG.error("Read from {} on {}", znode, server);
} }
public void publishReadTiming(String znode, String server, long msTime) { public void publishReadTiming(String znode, String server, long msTime) {
LOG.info(String.format("Read from zNode:%s on zookeeper instance:%s in %dms", LOG.info("Read from {} on {} in {}ms", znode, server, msTime);
znode, server, msTime));
} }
} }
/**
* By Region, for 'region' mode.
*/
public static class RegionStdOutSink extends StdOutSink { public static class RegionStdOutSink extends StdOutSink {
private Map<String, LongAdder> perTableReadLatency = new HashMap<>(); private Map<String, LongAdder> perTableReadLatency = new HashMap<>();
private LongAdder writeLatency = new LongAdder(); private LongAdder writeLatency = new LongAdder();
public void publishReadFailure(ServerName serverName, RegionInfo region, Exception e) { public void publishReadFailure(ServerName serverName, RegionInfo region, Exception e) {
incReadFailureCount(); incReadFailureCount();
LOG.error(String.format("read from region %s on regionserver %s failed", region.getRegionNameAsString(), serverName), e); LOG.error("Read from {} on {} failed", region.getRegionNameAsString(), serverName, e);
} }
public void publishReadFailure(ServerName serverName, RegionInfo region, ColumnFamilyDescriptor column, Exception e) { public void publishReadFailure(ServerName serverName, RegionInfo region,
ColumnFamilyDescriptor column, Exception e) {
incReadFailureCount(); incReadFailureCount();
LOG.error(String.format("read from region %s on regionserver %s column family %s failed", LOG.error("Read from {} on {} {} failed", region.getRegionNameAsString(), serverName,
region.getRegionNameAsString(), serverName, column.getNameAsString()), e); column.getNameAsString(), e);
} }
public void publishReadTiming(ServerName serverName, RegionInfo region, ColumnFamilyDescriptor column, long msTime) { public void publishReadTiming(ServerName serverName, RegionInfo region,
LOG.info(String.format("read from region %s on regionserver %s column family %s in %dms", ColumnFamilyDescriptor column, long msTime) {
region.getRegionNameAsString(), serverName, column.getNameAsString(), msTime)); LOG.info("Read from {} on {} {} in {}ms", region.getRegionNameAsString(), serverName,
column.getNameAsString(), msTime);
} }
public void publishWriteFailure(ServerName serverName, RegionInfo region, Exception e) { public void publishWriteFailure(ServerName serverName, RegionInfo region, Exception e) {
incWriteFailureCount(); incWriteFailureCount();
LOG.error(String.format("write to region %s on regionserver %s failed", region.getRegionNameAsString(), serverName), e); LOG.error("Write to {} on {} failed", region.getRegionNameAsString(), serverName, e);
} }
public void publishWriteFailure(ServerName serverName, RegionInfo region, ColumnFamilyDescriptor column, Exception e) { public void publishWriteFailure(ServerName serverName, RegionInfo region,
ColumnFamilyDescriptor column, Exception e) {
incWriteFailureCount(); incWriteFailureCount();
LOG.error(String.format("write to region %s on regionserver %s column family %s failed", LOG.error("Write to {} on {} {} failed", region.getRegionNameAsString(), serverName,
region.getRegionNameAsString(), serverName, column.getNameAsString()), e); column.getNameAsString(), e);
} }
public void publishWriteTiming(ServerName serverName, RegionInfo region, ColumnFamilyDescriptor column, long msTime) { public void publishWriteTiming(ServerName serverName, RegionInfo region,
LOG.info(String.format("write to region %s on regionserver %s column family %s in %dms", ColumnFamilyDescriptor column, long msTime) {
region.getRegionNameAsString(), serverName, column.getNameAsString(), msTime)); LOG.info("Write to {} on {} {} in {}ms",
region.getRegionNameAsString(), serverName, column.getNameAsString(), msTime);
} }
public Map<String, LongAdder> getReadLatencyMap() { public Map<String, LongAdder> getReadLatencyMap() {
@ -260,6 +275,9 @@ public final class Canary implements Tool {
} }
} }
/**
* Run a single zookeeper Task and then exit.
*/
static class ZookeeperTask implements Callable<Void> { static class ZookeeperTask implements Callable<Void> {
private final Connection connection; private final Connection connection;
private final String host; private final String host;
@ -298,8 +316,8 @@ public final class Canary implements Tool {
} }
/** /**
* For each column family of the region tries to get one row and outputs the latency, or the * Run a single Region Task and then exit. For each column family of the Region, get one row and
* failure. * output latency or failure.
*/ */
static class RegionTask implements Callable<Void> { static class RegionTask implements Callable<Void> {
public enum TaskType{ public enum TaskType{
@ -313,8 +331,8 @@ public final class Canary implements Tool {
private ServerName serverName; private ServerName serverName;
private LongAdder readWriteLatency; private LongAdder readWriteLatency;
RegionTask(Connection connection, RegionInfo region, ServerName serverName, RegionStdOutSink sink, RegionTask(Connection connection, RegionInfo region, ServerName serverName,
TaskType taskType, boolean rawScanEnabled, LongAdder rwLatency) { RegionStdOutSink sink, TaskType taskType, boolean rawScanEnabled, LongAdder rwLatency) {
this.connection = connection; this.connection = connection;
this.region = region; this.region = region;
this.serverName = serverName; this.serverName = serverName;
@ -340,14 +358,11 @@ public final class Canary implements Tool {
Table table = null; Table table = null;
TableDescriptor tableDesc = null; TableDescriptor tableDesc = null;
try { try {
if (LOG.isDebugEnabled()) { LOG.debug("Reading table descriptor for table {}", region.getTable());
LOG.debug(String.format("reading table descriptor for table %s",
region.getTable()));
}
table = connection.getTable(region.getTable()); table = connection.getTable(region.getTable());
tableDesc = table.getDescriptor(); tableDesc = table.getDescriptor();
} catch (IOException e) { } catch (IOException e) {
LOG.debug("sniffRegion failed", e); LOG.debug("sniffRegion {} of {} failed", region.getEncodedName(), e);
sink.publishReadFailure(serverName, region, e); sink.publishReadFailure(serverName, region, e);
if (table != null) { if (table != null) {
try { try {
@ -375,10 +390,7 @@ public final class Canary implements Tool {
get.addFamily(column.getName()); get.addFamily(column.getName());
} else { } else {
scan = new Scan(); scan = new Scan();
if (LOG.isDebugEnabled()) { LOG.debug("rawScan {} for {}", rawScanEnabled, tableDesc.getTableName());
LOG.debug(String.format("rawScan : %s for table: %s", rawScanEnabled,
tableDesc.getTableName()));
}
scan.setRaw(rawScanEnabled); scan.setRaw(rawScanEnabled);
scan.setCaching(1); scan.setCaching(1);
scan.setCacheBlocks(false); scan.setCacheBlocks(false);
@ -387,12 +399,9 @@ public final class Canary implements Tool {
scan.setMaxResultSize(1L); scan.setMaxResultSize(1L);
scan.setOneRowLimit(); scan.setOneRowLimit();
} }
LOG.debug("Reading from {} {} {} {}", tableDesc.getTableName(),
if (LOG.isDebugEnabled()) { region.getRegionNameAsString(), column.getNameAsString(),
LOG.debug(String.format("reading from table %s region %s column family %s and key %s", Bytes.toStringBinary(startKey));
tableDesc.getTableName(), region.getRegionNameAsString(), column.getNameAsString(),
Bytes.toStringBinary(startKey)));
}
try { try {
stopWatch.start(); stopWatch.start();
if (startKey.length > 0) { if (startKey.length > 0) {
@ -425,7 +434,6 @@ public final class Canary implements Tool {
/** /**
* Check writes for the canary table * Check writes for the canary table
* @return
*/ */
private Void write() { private Void write() {
Table table = null; Table table = null;
@ -445,11 +453,9 @@ public final class Canary implements Tool {
Bytes.random(value); Bytes.random(value);
put.addColumn(column.getName(), HConstants.EMPTY_BYTE_ARRAY, value); put.addColumn(column.getName(), HConstants.EMPTY_BYTE_ARRAY, value);
if (LOG.isDebugEnabled()) { LOG.debug("Writing to {} {} {} {}",
LOG.debug(String.format("writing to table %s region %s column family %s and key %s", tableDesc.getTableName(), region.getRegionNameAsString(), column.getNameAsString(),
tableDesc.getTableName(), region.getRegionNameAsString(), column.getNameAsString(), Bytes.toStringBinary(rowToCheck));
Bytes.toStringBinary(rowToCheck)));
}
try { try {
long startTime = System.currentTimeMillis(); long startTime = System.currentTimeMillis();
table.put(put); table.put(put);
@ -470,7 +476,8 @@ public final class Canary implements Tool {
} }
/** /**
* Get one row from a region on the regionserver and outputs the latency, or the failure. * Run a single RegionServer Task and then exit.
* Get one row from a region on the regionserver and output latency or the failure.
*/ */
static class RegionServerTask implements Callable<Void> { static class RegionServerTask implements Callable<Void> {
private Connection connection; private Connection connection;
@ -503,11 +510,9 @@ public final class Canary implements Tool {
table = connection.getTable(tableName); table = connection.getTable(tableName);
startKey = region.getStartKey(); startKey = region.getStartKey();
// Can't do a get on empty start row so do a Scan of first element if any instead. // Can't do a get on empty start row so do a Scan of first element if any instead.
if (LOG.isDebugEnabled()) { LOG.debug("Reading from {} {} {} {}",
LOG.debug(String.format("reading from region server %s table %s region %s and key %s", serverName, region.getTable(), region.getRegionNameAsString(),
serverName, region.getTable(), region.getRegionNameAsString(), Bytes.toStringBinary(startKey));
Bytes.toStringBinary(startKey)));
}
if (startKey.length > 0) { if (startKey.length > 0) {
get = new Get(startKey); get = new Get(startKey);
get.setCacheBlocks(false); get.setCacheBlocks(false);
@ -584,23 +589,43 @@ public final class Canary implements Tool {
private boolean useRegExp; private boolean useRegExp;
private long timeout = DEFAULT_TIMEOUT; private long timeout = DEFAULT_TIMEOUT;
private boolean failOnError = true; private boolean failOnError = true;
/**
* True if we are to run in 'regionServer' mode.
*/
private boolean regionServerMode = false; private boolean regionServerMode = false;
/**
* True if we are to run in zookeeper 'mode'.
*/
private boolean zookeeperMode = false; private boolean zookeeperMode = false;
private long permittedFailures = 0; private long permittedFailures = 0;
private boolean regionServerAllRegions = false; private boolean regionServerAllRegions = false;
private boolean writeSniffing = false; private boolean writeSniffing = false;
private long configuredWriteTableTimeout = DEFAULT_TIMEOUT; private long configuredWriteTableTimeout = DEFAULT_TIMEOUT;
private boolean treatFailureAsError = false; private boolean treatFailureAsError = false;
private TableName writeTableName = DEFAULT_WRITE_TABLE_NAME; private TableName writeTableName = DEFAULT_WRITE_TABLE_NAME;
/**
* This is a Map of table to timeout. The timeout is for reading all regions in the table; i.e.
* we aggregate time to fetch each region and it needs to be less than this value else we
* log an ERROR.
*/
private HashMap<String, Long> configuredReadTableTimeouts = new HashMap<>(); private HashMap<String, Long> configuredReadTableTimeouts = new HashMap<>();
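For example (hypothetical table names), passing -readTableTimeouts tableA=60000,tableB=120000 on the command line would have the Canary log an ERROR when reading all regions of tableA takes more than 60000ms in aggregate, or all regions of tableB more than 120000ms.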
private ExecutorService executor; // threads to retrieve data from regionservers private ExecutorService executor; // threads to retrieve data from regionservers
public Canary() { public Canary() {
this(new ScheduledThreadPoolExecutor(1), new RegionServerStdOutSink()); this(new ScheduledThreadPoolExecutor(1));
} }
public Canary(ExecutorService executor, Sink sink) { public Canary(ExecutorService executor) {
this(executor, null);
}
@VisibleForTesting
Canary(ExecutorService executor, Sink sink) {
this.executor = executor; this.executor = executor;
this.sink = sink; this.sink = sink;
} }
@ -628,7 +653,7 @@ public final class Canary implements Tool {
printUsageAndExit(); printUsageAndExit();
} }
if (cmd.equals("-help")) { if (cmd.equals("-help") || cmd.equals("-h")) {
// user asked for help, print the help and quit. // user asked for help, print the help and quit.
printUsageAndExit(); printUsageAndExit();
} else if (cmd.equals("-daemon") && interval == 0) { } else if (cmd.equals("-daemon") && interval == 0) {
@ -639,7 +664,7 @@ public final class Canary implements Tool {
i++; i++;
if (i == args.length) { if (i == args.length) {
System.err.println("-interval needs a numeric value argument."); System.err.println("-interval takes a numeric seconds value argument.");
printUsageAndExit(); printUsageAndExit();
} }
@ -657,7 +682,7 @@ public final class Canary implements Tool {
this.regionServerAllRegions = true; this.regionServerAllRegions = true;
} else if(cmd.equals("-writeSniffing")) { } else if(cmd.equals("-writeSniffing")) {
this.writeSniffing = true; this.writeSniffing = true;
} else if(cmd.equals("-treatFailureAsError")) { } else if(cmd.equals("-treatFailureAsError") || cmd.equals("-failureAsError")) {
this.treatFailureAsError = true; this.treatFailureAsError = true;
} else if (cmd.equals("-e")) { } else if (cmd.equals("-e")) {
this.useRegExp = true; this.useRegExp = true;
@ -665,35 +690,35 @@ public final class Canary implements Tool {
i++; i++;
if (i == args.length) { if (i == args.length) {
System.err.println("-t needs a numeric value argument."); System.err.println("-t takes a numeric milliseconds value argument.");
printUsageAndExit(); printUsageAndExit();
} }
try { try {
this.timeout = Long.parseLong(args[i]); this.timeout = Long.parseLong(args[i]);
} catch (NumberFormatException e) { } catch (NumberFormatException e) {
System.err.println("-t needs a numeric value argument."); System.err.println("-t takes a numeric milliseconds value argument.");
printUsageAndExit(); printUsageAndExit();
} }
} else if(cmd.equals("-writeTableTimeout")) { } else if(cmd.equals("-writeTableTimeout")) {
i++; i++;
if (i == args.length) { if (i == args.length) {
System.err.println("-writeTableTimeout needs a numeric value argument."); System.err.println("-writeTableTimeout takes a numeric milliseconds value argument.");
printUsageAndExit(); printUsageAndExit();
} }
try { try {
this.configuredWriteTableTimeout = Long.parseLong(args[i]); this.configuredWriteTableTimeout = Long.parseLong(args[i]);
} catch (NumberFormatException e) { } catch (NumberFormatException e) {
System.err.println("-writeTableTimeout needs a numeric value argument."); System.err.println("-writeTableTimeout takes a numeric milliseconds value argument.");
printUsageAndExit(); printUsageAndExit();
} }
} else if (cmd.equals("-writeTable")) { } else if (cmd.equals("-writeTable")) {
i++; i++;
if (i == args.length) { if (i == args.length) {
System.err.println("-writeTable needs a string value argument."); System.err.println("-writeTable takes a string tablename value argument.");
printUsageAndExit(); printUsageAndExit();
} }
this.writeTableName = TableName.valueOf(args[i]); this.writeTableName = TableName.valueOf(args[i]);
@ -711,14 +736,16 @@ public final class Canary implements Tool {
i++; i++;
if (i == args.length) { if (i == args.length) {
System.err.println("-readTableTimeouts needs a comma-separated list of read timeouts per table (without spaces)."); System.err.println("-readTableTimeouts needs a comma-separated list of read " +
"millisecond timeouts per table (without spaces).");
printUsageAndExit(); printUsageAndExit();
} }
String [] tableTimeouts = args[i].split(","); String [] tableTimeouts = args[i].split(",");
for (String tT: tableTimeouts) { for (String tT: tableTimeouts) {
String [] nameTimeout = tT.split("="); String [] nameTimeout = tT.split("=");
if (nameTimeout.length < 2) { if (nameTimeout.length < 2) {
System.err.println("Each -readTableTimeouts argument must be of the form <tableName>=<read timeout>."); System.err.println("Each -readTableTimeouts argument must be of the form " +
"<tableName>=<read timeout> (without spaces).");
printUsageAndExit(); printUsageAndExit();
} }
long timeoutVal = 0L; long timeoutVal = 0L;
@ -856,41 +883,56 @@ public final class Canary implements Tool {
private void printUsageAndExit() { private void printUsageAndExit() {
System.err.println( System.err.println(
"Usage: hbase canary [opts] [table1 [table2]...] | [regionserver1 [regionserver2]..]"); "Usage: canary [OPTIONS] [<TABLE1> [<TABLE2]...] | [<REGIONSERVER1> [<REGIONSERVER2]..]");
System.err.println(" where [opts] are:"); System.err.println("Where [OPTIONS] are:");
System.err.println(" -help Show this help and exit."); System.err.println(" -h,-help show this help and exit.");
System.err.println(" -regionserver replace the table argument to regionserver,"); System.err.println(" -regionserver set 'regionserver mode'; gets row from random region on " +
System.err.println(" which means to enable regionserver mode"); "server");
System.err.println(" -allRegions Tries all regions on a regionserver,"); System.err.println(" -allRegions get from ALL regions when 'regionserver mode', not just " +
System.err.println(" only works in regionserver mode."); "random one.");
System.err.println(" -zookeeper Tries to grab zookeeper.znode.parent "); System.err.println(" -zookeeper set 'zookeeper mode'; grab zookeeper.znode.parent on " +
System.err.println(" on each zookeeper instance"); "each ensemble member");
System.err.println(" -permittedZookeeperFailures <N> Ignore first N failures when attempting to "); System.err.println(" -daemon continuous check at defined intervals.");
System.err.println(" connect to individual zookeeper nodes in the ensemble"); System.err.println(" -interval <N> interval between checks in seconds");
System.err.println(" -daemon Continuous check at defined intervals."); System.err.println(" -e consider table/regionserver argument as regular " +
System.err.println(" -interval <N> Interval between checks (sec)"); "expression");
System.err.println(" -e Use table/regionserver as regular expression"); System.err.println(" -f <B> exit on first error; default=true");
System.err.println(" which means the table/regionserver is regular expression pattern"); System.err.println(" -failureAsError treat read/write failure as error");
System.err.println(" -f <B> stop whole program if first error occurs," + System.err.println(" -t <N> timeout for canary-test run; default=600000ms");
" default is true"); System.err.println(" -writeSniffing enable write sniffing");
System.err.println(" -t <N> timeout for a check, default is 600000 (millisecs)"); System.err.println(" -writeTable the table used for write sniffing; default=hbase:canary");
System.err.println(" -writeTableTimeout <N> write timeout for the writeTable, default is 600000 (millisecs)"); System.err.println(" -writeTableTimeout <N> timeout for writeTable; default=600000ms");
System.err.println(" -readTableTimeouts <tableName>=<read timeout>,<tableName>=<read timeout>, ... " System.err.println(" -readTableTimeouts <tableName>=<read timeout>," +
+ "comma-separated list of read timeouts per table (no spaces), default is 600000 (millisecs)"); "<tableName>=<read timeout>,...");
System.err.println(" -writeSniffing enable the write sniffing in canary"); System.err.println(" comma-separated list of table read timeouts " +
System.err.println(" -treatFailureAsError treats read / write failure as error"); "(no spaces);");
System.err.println(" -writeTable The table used for write sniffing." System.err.println(" logs 'ERROR' if takes longer. default=600000ms");
+ " Default is hbase:canary"); System.err.println(" -permittedZookeeperFailures <N> Ignore first N failures attempting to ");
System.err.println(" -Dhbase.canary.read.raw.enabled=<true/false> Use this flag to enable or disable raw scan during read canary test" System.err.println(" connect to individual zookeeper nodes in ensemble");
+ " Default is false and raw is not enabled during scan"); System.err.println("");
System.err System.err.println(" -D<configProperty>=<value> to assign or override configuration params");
.println(" -D<configProperty>=<value> assigning or override the configuration params"); System.err.println(" -Dhbase.canary.read.raw.enabled=<true/false> Set to enable/disable " +
"raw scan; default=false");
System.err.println("");
System.err.println("Canary runs in one of three modes: region (default), regionserver, or " +
"zookeeper.");
System.err.println("To sniff/probe all regions, pass no arguments.");
System.err.println("To sniff/probe all regions of a table, pass tablename.");
System.err.println("To sniff/probe regionservers, pass -regionserver, etc.");
System.err.println("See http://hbase.apache.org/book.html#_canary for Canary documentation.");
System.exit(USAGE_EXIT_CODE); System.exit(USAGE_EXIT_CODE);
} }
Sink getSink(Configuration configuration, Class clazz) {
// In test context, this.sink might be set. Use it if non-null. For testing.
return this.sink != null? this.sink:
(Sink)ReflectionUtils.newInstance(configuration.getClass("hbase.canary.sink.class",
clazz, Sink.class));
}
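The hbase.canary.sink.class lookup above means an operator can plug in a custom Sink without touching this class. A minimal sketch (hypothetical class name; assumes the default region mode, so it extends RegionStdOutSink and the region tasks can still treat it as one, and assumes a public no-argument constructor for the reflective instantiation):

import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.tool.Canary;

/** Hypothetical sink: keeps the stdout/log behaviour but also forwards read failures. */
public class AlertingRegionSink extends Canary.RegionStdOutSink {
  @Override
  public void publishReadFailure(ServerName serverName, RegionInfo region, Exception e) {
    // Preserve the ERROR log line and the failure counter maintained by the parent sink.
    super.publishReadFailure(serverName, region, e);
    // Hand (serverName, region) off to an external alerting pipeline here.
  }
}

It would then be selected at runtime with -Dhbase.canary.sink.class=AlertingRegionSink (fully-qualified name if the class is packaged).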
/** /**
* A Factory method for {@link Monitor}. * A Factory method for {@link Monitor}.
* Can be overridden by user. * Makes a RegionServerMonitor, or a ZooKeeperMonitor, or a RegionMonitor.
* @param index a start index for monitor target * @param index a start index for monitor target
* @param args args passed from user * @param args args passed from user
* @return a Monitor instance * @return a Monitor instance
@ -899,37 +941,45 @@ public final class Canary implements Tool {
Monitor monitor = null; Monitor monitor = null;
String[] monitorTargets = null; String[] monitorTargets = null;
if(index >= 0) { if (index >= 0) {
int length = args.length - index; int length = args.length - index;
monitorTargets = new String[length]; monitorTargets = new String[length];
System.arraycopy(args, index, monitorTargets, 0, length); System.arraycopy(args, index, monitorTargets, 0, length);
} }
if (this.sink instanceof RegionServerStdOutSink || this.regionServerMode) { if (this.regionServerMode) {
monitor = monitor =
new RegionServerMonitor(connection, monitorTargets, this.useRegExp, new RegionServerMonitor(connection, monitorTargets, this.useRegExp,
(StdOutSink) this.sink, this.executor, this.regionServerAllRegions, getSink(connection.getConfiguration(), RegionServerStdOutSink.class),
this.executor, this.regionServerAllRegions,
this.treatFailureAsError, this.permittedFailures); this.treatFailureAsError, this.permittedFailures);
} else if (this.sink instanceof ZookeeperStdOutSink || this.zookeeperMode) { } else if (this.zookeeperMode) {
monitor = monitor =
new ZookeeperMonitor(connection, monitorTargets, this.useRegExp, new ZookeeperMonitor(connection, monitorTargets, this.useRegExp,
(StdOutSink) this.sink, this.executor, this.treatFailureAsError, getSink(connection.getConfiguration(), ZookeeperStdOutSink.class),
this.executor, this.treatFailureAsError,
this.permittedFailures); this.permittedFailures);
} else { } else {
monitor = monitor =
new RegionMonitor(connection, monitorTargets, this.useRegExp, new RegionMonitor(connection, monitorTargets, this.useRegExp,
(StdOutSink) this.sink, this.executor, this.writeSniffing, getSink(connection.getConfiguration(), RegionStdOutSink.class),
this.executor, this.writeSniffing,
this.writeTableName, this.treatFailureAsError, this.configuredReadTableTimeouts, this.writeTableName, this.treatFailureAsError, this.configuredReadTableTimeouts,
this.configuredWriteTableTimeout, this.permittedFailures); this.configuredWriteTableTimeout, this.permittedFailures);
} }
return monitor; return monitor;
} }
// a Monitor super-class can be extended by users /**
* A Monitor super-class can be extended by users
*/
public static abstract class Monitor implements Runnable, Closeable { public static abstract class Monitor implements Runnable, Closeable {
protected Connection connection; protected Connection connection;
protected Admin admin; protected Admin admin;
/**
* 'Target' dependent on 'mode'. Could be Tables or RegionServers or ZNodes.
* Passed on the command-line as arguments.
*/
protected String[] targets; protected String[] targets;
protected boolean useRegExp; protected boolean useRegExp;
protected boolean treatFailureAsError; protected boolean treatFailureAsError;
@ -999,7 +1049,9 @@ public final class Canary implements Tool {
} }
} }
// a monitor for region mode /**
* A monitor for region mode.
*/
private static class RegionMonitor extends Monitor { private static class RegionMonitor extends Monitor {
// 10 minutes // 10 minutes
private static final int DEFAULT_WRITE_TABLE_CHECK_PERIOD = 10 * 60 * 1000; private static final int DEFAULT_WRITE_TABLE_CHECK_PERIOD = 10 * 60 * 1000;
@ -1014,14 +1066,22 @@ public final class Canary implements Tool {
private float regionsUpperLimit; private float regionsUpperLimit;
private int checkPeriod; private int checkPeriod;
private boolean rawScanEnabled; private boolean rawScanEnabled;
/**
* This is a timeout per table. If read of each region in the table aggregated takes longer
* than what is configured here, we log an ERROR rather than just an INFO.
*/
private HashMap<String, Long> configuredReadTableTimeouts; private HashMap<String, Long> configuredReadTableTimeouts;
private long configuredWriteTableTimeout; private long configuredWriteTableTimeout;
public RegionMonitor(Connection connection, String[] monitorTargets, boolean useRegExp, public RegionMonitor(Connection connection, String[] monitorTargets, boolean useRegExp,
StdOutSink sink, ExecutorService executor, boolean writeSniffing, TableName writeTableName, Sink sink, ExecutorService executor, boolean writeSniffing, TableName writeTableName,
boolean treatFailureAsError, HashMap<String, Long> configuredReadTableTimeouts, long configuredWriteTableTimeout, boolean treatFailureAsError, HashMap<String, Long> configuredReadTableTimeouts,
long configuredWriteTableTimeout,
long allowedFailures) { long allowedFailures) {
super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError, allowedFailures); super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError,
allowedFailures);
Configuration conf = connection.getConfiguration(); Configuration conf = connection.getConfiguration();
this.writeSniffing = writeSniffing; this.writeSniffing = writeSniffing;
this.writeTableName = writeTableName; this.writeTableName = writeTableName;
@ -1054,9 +1114,12 @@ public final class Canary implements Tool {
RegionStdOutSink regionSink = this.getSink(); RegionStdOutSink regionSink = this.getSink();
if (this.targets != null && this.targets.length > 0) { if (this.targets != null && this.targets.length > 0) {
String[] tables = generateMonitorTables(this.targets); String[] tables = generateMonitorTables(this.targets);
// Check to see that each table name passed in the -readTableTimeouts argument is also passed as a monitor target. // Check to see that each table name passed in the -readTableTimeouts argument is also
if (! new HashSet<>(Arrays.asList(tables)).containsAll(this.configuredReadTableTimeouts.keySet())) { // passed as a monitor target.
LOG.error("-readTableTimeouts can only specify read timeouts for monitor targets passed via command line."); if (!new HashSet<>(Arrays.asList(tables)).
containsAll(this.configuredReadTableTimeouts.keySet())) {
LOG.error("-readTableTimeouts can only specify read timeouts for monitor targets " +
"passed via command line.");
this.errorCode = USAGE_EXIT_CODE; this.errorCode = USAGE_EXIT_CODE;
return; return;
} }
@ -1082,7 +1145,7 @@ public final class Canary implements Tool {
// sniff canary table with write operation // sniff canary table with write operation
regionSink.initializeWriteLatency(); regionSink.initializeWriteLatency();
LongAdder writeTableLatency = regionSink.getWriteLatency(); LongAdder writeTableLatency = regionSink.getWriteLatency();
taskFutures.addAll(Canary.sniff(admin, regionSink, admin.getTableDescriptor(writeTableName), taskFutures.addAll(Canary.sniff(admin, regionSink, admin.getDescriptor(writeTableName),
executor, TaskType.WRITE, this.rawScanEnabled, writeTableLatency)); executor, TaskType.WRITE, this.rawScanEnabled, writeTableLatency));
} }
@ -1099,23 +1162,26 @@ public final class Canary implements Tool {
if (actualReadTableLatency.containsKey(tableName)) { if (actualReadTableLatency.containsKey(tableName)) {
Long actual = actualReadTableLatency.get(tableName).longValue(); Long actual = actualReadTableLatency.get(tableName).longValue();
Long configured = entry.getValue(); Long configured = entry.getValue();
LOG.info("Read operation for " + tableName + " took " + actual +
" ms. The configured read timeout was " + configured + " ms.");
if (actual > configured) { if (actual > configured) {
LOG.error("Read operation for " + tableName + " exceeded the configured read timeout."); LOG.error("Read operation for {} took {}ms (Configured read timeout {}ms.",
tableName, actual, configured);
} else {
LOG.info("Read operation for {} took {}ms (Configured read timeout {}ms.",
tableName, actual, configured);
} }
} else { } else {
LOG.error("Read operation for " + tableName + " failed!"); LOG.error("Read operation for {} failed!", tableName);
} }
} }
if (this.writeSniffing) { if (this.writeSniffing) {
String writeTableStringName = this.writeTableName.getNameAsString(); String writeTableStringName = this.writeTableName.getNameAsString();
long actualWriteLatency = regionSink.getWriteLatency().longValue(); long actualWriteLatency = regionSink.getWriteLatency().longValue();
LOG.info("Write operation for " + writeTableStringName + " took " + actualWriteLatency + " ms. The configured write timeout was " + LOG.info("Write operation for {} took {}ms. Configured write timeout {}ms.",
this.configuredWriteTableTimeout + " ms."); writeTableStringName, actualWriteLatency, this.configuredWriteTableTimeout);
// Check that the writeTable write operation latency does not exceed the configured timeout. // Check that the writeTable write operation latency does not exceed the configured timeout.
if (actualWriteLatency > this.configuredWriteTableTimeout) { if (actualWriteLatency > this.configuredWriteTableTimeout) {
LOG.error("Write operation for " + writeTableStringName + " exceeded the configured write timeout."); LOG.error("Write operation for {} exceeded the configured write timeout.",
writeTableStringName);
} }
} }
} catch (Exception e) { } catch (Exception e) {
@ -1123,31 +1189,32 @@ public final class Canary implements Tool {
this.errorCode = ERROR_EXIT_CODE; this.errorCode = ERROR_EXIT_CODE;
} finally { } finally {
this.done = true; this.done = true;
} }
} }
this.done = true; this.done = true;
} }
/**
* @return List of tables to use in test.
*/
private String[] generateMonitorTables(String[] monitorTargets) throws IOException { private String[] generateMonitorTables(String[] monitorTargets) throws IOException {
String[] returnTables = null; String[] returnTables = null;
if (this.useRegExp) { if (this.useRegExp) {
Pattern pattern = null; Pattern pattern = null;
HTableDescriptor[] tds = null; TableDescriptor[] tds = null;
Set<String> tmpTables = new TreeSet<>(); Set<String> tmpTables = new TreeSet<>();
try { try {
if (LOG.isDebugEnabled()) { LOG.debug(String.format("reading list of tables"));
LOG.debug(String.format("reading list of tables"));
}
tds = this.admin.listTables(pattern); tds = this.admin.listTables(pattern);
if (tds == null) { if (tds == null) {
tds = new HTableDescriptor[0]; tds = new TableDescriptor[0];
} }
for (String monitorTarget : monitorTargets) { for (String monitorTarget : monitorTargets) {
pattern = Pattern.compile(monitorTarget); pattern = Pattern.compile(monitorTarget);
for (HTableDescriptor td : tds) { for (TableDescriptor td : tds) {
if (pattern.matcher(td.getNameAsString()).matches()) { if (pattern.matcher(td.getTableName().getNameAsString()).matches()) {
tmpTables.add(td.getNameAsString()); tmpTables.add(td.getTableName().getNameAsString());
} }
} }
} }
@ -1172,18 +1239,19 @@ public final class Canary implements Tool {
} }
/* /*
* canary entry point to monitor all the tables. * Canary entry point to monitor all the tables.
*/ */
private List<Future<Void>> sniff(TaskType taskType, RegionStdOutSink regionSink) throws Exception { private List<Future<Void>> sniff(TaskType taskType, RegionStdOutSink regionSink)
if (LOG.isDebugEnabled()) { throws Exception {
LOG.debug(String.format("reading list of tables")); LOG.debug("Reading list of tables");
}
List<Future<Void>> taskFutures = new LinkedList<>(); List<Future<Void>> taskFutures = new LinkedList<>();
for (HTableDescriptor table : admin.listTables()) { for (TableDescriptor td: admin.listTableDescriptors()) {
if (admin.isTableEnabled(table.getTableName()) if (admin.isTableEnabled(td.getTableName()) &&
&& (!table.getTableName().equals(writeTableName))) { (!td.getTableName().equals(writeTableName))) {
LongAdder readLatency = regionSink.initializeAndGetReadLatencyForTable(table.getNameAsString()); LongAdder readLatency =
taskFutures.addAll(Canary.sniff(admin, sink, table, executor, taskType, this.rawScanEnabled, readLatency)); regionSink.initializeAndGetReadLatencyForTable(td.getTableName().getNameAsString());
taskFutures.addAll(Canary.sniff(admin, sink, td, executor, taskType, this.rawScanEnabled,
readLatency));
} }
} }
return taskFutures; return taskFutures;
@ -1231,11 +1299,10 @@ public final class Canary implements Tool {
private void createWriteTable(int numberOfServers) throws IOException { private void createWriteTable(int numberOfServers) throws IOException {
int numberOfRegions = (int)(numberOfServers * regionsLowerLimit); int numberOfRegions = (int)(numberOfServers * regionsLowerLimit);
LOG.info("Number of live regionservers: " + numberOfServers + ", " LOG.info("Number of live regionservers {}, pre-splitting the canary table into {} regions " +
+ "pre-splitting the canary table into " + numberOfRegions + " regions " "(current lower limit of regions per server is {} and you can change it with config {}).",
+ "(current lower limit of regions per server is " + regionsLowerLimit numberOfServers, numberOfRegions, regionsLowerLimit,
+ " and you can change it by config: " HConstants.HBASE_CANARY_WRITE_PERSERVER_REGIONS_LOWERLIMIT_KEY);
+ HConstants.HBASE_CANARY_WRITE_PERSERVER_REGIONS_LOWERLIMIT_KEY + " )");
HTableDescriptor desc = new HTableDescriptor(writeTableName); HTableDescriptor desc = new HTableDescriptor(writeTableName);
HColumnDescriptor family = new HColumnDescriptor(CANARY_TABLE_FAMILY_NAME); HColumnDescriptor family = new HColumnDescriptor(CANARY_TABLE_FAMILY_NAME);
family.setMaxVersions(1); family.setMaxVersions(1);
@ -1252,59 +1319,40 @@ public final class Canary implements Tool {
* @throws Exception * @throws Exception
*/ */
private static List<Future<Void>> sniff(final Admin admin, final Sink sink, String tableName, private static List<Future<Void>> sniff(final Admin admin, final Sink sink, String tableName,
ExecutorService executor, TaskType taskType, boolean rawScanEnabled, LongAdder readLatency) throws Exception { ExecutorService executor, TaskType taskType, boolean rawScanEnabled, LongAdder readLatency)
if (LOG.isDebugEnabled()) { throws Exception {
LOG.debug(String.format("checking table is enabled and getting table descriptor for table %s", LOG.debug("Checking table is enabled and getting table descriptor for table {}", tableName);
tableName));
}
if (admin.isTableEnabled(TableName.valueOf(tableName))) { if (admin.isTableEnabled(TableName.valueOf(tableName))) {
return Canary.sniff(admin, sink, admin.getTableDescriptor(TableName.valueOf(tableName)), return Canary.sniff(admin, sink, admin.getDescriptor(TableName.valueOf(tableName)),
executor, taskType, rawScanEnabled, readLatency); executor, taskType, rawScanEnabled, readLatency);
} else { } else {
LOG.warn(String.format("Table %s is not enabled", tableName)); LOG.warn("Table {} is not enabled", tableName);
} }
return new LinkedList<>(); return new LinkedList<>();
} }
/* /*
* Loops over regions that owns this table, and output some information about the state. * Loops over regions of this table, and outputs information about the state.
*/ */
private static List<Future<Void>> sniff(final Admin admin, final Sink sink, private static List<Future<Void>> sniff(final Admin admin, final Sink sink,
HTableDescriptor tableDesc, ExecutorService executor, TaskType taskType, TableDescriptor tableDesc, ExecutorService executor, TaskType taskType,
boolean rawScanEnabled, LongAdder rwLatency) throws Exception { boolean rawScanEnabled, LongAdder rwLatency) throws Exception {
LOG.debug("Reading list of regions for table {}", tableDesc.getTableName());
if (LOG.isDebugEnabled()) { try (Table table = admin.getConnection().getTable(tableDesc.getTableName())) {
LOG.debug(String.format("reading list of regions for table %s", tableDesc.getTableName())); List<RegionTask> tasks = new ArrayList<>();
} try (RegionLocator regionLocator =
admin.getConnection().getRegionLocator(tableDesc.getTableName())) {
Table table = null; for (HRegionLocation location: regionLocator.getAllRegionLocations()) {
try { ServerName rs = location.getServerName();
table = admin.getConnection().getTable(tableDesc.getTableName()); RegionInfo region = location.getRegion();
tasks.add(new RegionTask(admin.getConnection(), region, rs, (RegionStdOutSink)sink,
taskType, rawScanEnabled, rwLatency));
}
return executor.invokeAll(tasks);
}
} catch (TableNotFoundException e) { } catch (TableNotFoundException e) {
return new ArrayList<>(); return Collections.EMPTY_LIST;
} }
finally {
if (table !=null) {
table.close();
}
}
List<RegionTask> tasks = new ArrayList<>();
RegionLocator regionLocator = null;
try {
regionLocator = admin.getConnection().getRegionLocator(tableDesc.getTableName());
for (HRegionLocation location : regionLocator.getAllRegionLocations()) {
ServerName rs = location.getServerName();
RegionInfo region = location.getRegionInfo();
tasks.add(new RegionTask(admin.getConnection(), region, rs, (RegionStdOutSink) sink, taskType, rawScanEnabled,
rwLatency));
}
} finally {
if (regionLocator != null) {
regionLocator.close();
}
}
return executor.invokeAll(tasks);
} }
// monitor for zookeeper mode // monitor for zookeeper mode
@ -1314,8 +1362,9 @@ public final class Canary implements Tool {
private final int timeout; private final int timeout;
protected ZookeeperMonitor(Connection connection, String[] monitorTargets, boolean useRegExp, protected ZookeeperMonitor(Connection connection, String[] monitorTargets, boolean useRegExp,
StdOutSink sink, ExecutorService executor, boolean treatFailureAsError, long allowedFailures) { Sink sink, ExecutorService executor, boolean treatFailureAsError, long allowedFailures) {
super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError, allowedFailures); super(connection, monitorTargets, useRegExp,
sink, executor, treatFailureAsError, allowedFailures);
Configuration configuration = connection.getConfiguration(); Configuration configuration = connection.getConfiguration();
znode = znode =
configuration.get(ZOOKEEPER_ZNODE_PARENT, configuration.get(ZOOKEEPER_ZNODE_PARENT,
@ -1374,15 +1423,17 @@ public final class Canary implements Tool {
} }
// a monitor for regionserver mode /**
* A monitor for regionserver mode
*/
private static class RegionServerMonitor extends Monitor { private static class RegionServerMonitor extends Monitor {
private boolean allRegions; private boolean allRegions;
public RegionServerMonitor(Connection connection, String[] monitorTargets, boolean useRegExp, public RegionServerMonitor(Connection connection, String[] monitorTargets, boolean useRegExp,
StdOutSink sink, ExecutorService executor, boolean allRegions, Sink sink, ExecutorService executor, boolean allRegions,
boolean treatFailureAsError, long allowedFailures) { boolean treatFailureAsError, long allowedFailures) {
super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError, allowedFailures); super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError,
allowedFailures);
this.allRegions = allRegions; this.allRegions = allRegions;
} }
@ -1413,10 +1464,7 @@ public final class Canary implements Tool {
private boolean checkNoTableNames() { private boolean checkNoTableNames() {
List<String> foundTableNames = new ArrayList<>(); List<String> foundTableNames = new ArrayList<>();
TableName[] tableNames = null; TableName[] tableNames = null;
LOG.debug("Reading list of tables");
if (LOG.isDebugEnabled()) {
LOG.debug(String.format("reading list of tables"));
}
try { try {
tableNames = this.admin.listTableNames(); tableNames = this.admin.listTableNames();
} catch (IOException e) { } catch (IOException e) {
@ -1452,7 +1500,7 @@ public final class Canary implements Tool {
AtomicLong successes = new AtomicLong(0); AtomicLong successes = new AtomicLong(0);
successMap.put(serverName, successes); successMap.put(serverName, successes);
if (entry.getValue().isEmpty()) { if (entry.getValue().isEmpty()) {
LOG.error(String.format("Regionserver not serving any regions - %s", serverName)); LOG.error("Regionserver not serving any regions - {}", serverName);
} else if (this.allRegions) { } else if (this.allRegions) {
for (RegionInfo region : entry.getValue()) { for (RegionInfo region : entry.getValue()) {
tasks.add(new RegionServerTask(this.connection, tasks.add(new RegionServerTask(this.connection,
@ -1483,8 +1531,8 @@ public final class Canary implements Tool {
if (this.allRegions) { if (this.allRegions) {
for (Map.Entry<String, List<RegionInfo>> entry : rsAndRMap.entrySet()) { for (Map.Entry<String, List<RegionInfo>> entry : rsAndRMap.entrySet()) {
String serverName = entry.getKey(); String serverName = entry.getKey();
LOG.info("Successfully read " + successMap.get(serverName) + " regions out of " LOG.info("Successfully read {} regions out of {} on regionserver {}",
+ entry.getValue().size() + " on regionserver:" + serverName); successMap.get(serverName), entry.getValue().size(), serverName);
} }
} }
} catch (InterruptedException e) { } catch (InterruptedException e) {
@ -1501,36 +1549,30 @@ public final class Canary implements Tool {
private Map<String, List<RegionInfo>> getAllRegionServerByName() { private Map<String, List<RegionInfo>> getAllRegionServerByName() {
Map<String, List<RegionInfo>> rsAndRMap = new HashMap<>(); Map<String, List<RegionInfo>> rsAndRMap = new HashMap<>();
Table table = null;
RegionLocator regionLocator = null;
try { try {
if (LOG.isDebugEnabled()) { LOG.debug("Reading list of tables and locations");
LOG.debug(String.format("reading list of tables and locations")); List<TableDescriptor> tableDescs = this.admin.listTableDescriptors();
}
HTableDescriptor[] tableDescs = this.admin.listTables();
List<RegionInfo> regions = null; List<RegionInfo> regions = null;
for (HTableDescriptor tableDesc : tableDescs) { for (TableDescriptor tableDesc: tableDescs) {
table = this.admin.getConnection().getTable(tableDesc.getTableName()); try (RegionLocator regionLocator =
regionLocator = this.admin.getConnection().getRegionLocator(tableDesc.getTableName()); this.admin.getConnection().getRegionLocator(tableDesc.getTableName())) {
for (HRegionLocation location : regionLocator.getAllRegionLocations()) {
for (HRegionLocation location : regionLocator.getAllRegionLocations()) { ServerName rs = location.getServerName();
ServerName rs = location.getServerName(); String rsName = rs.getHostname();
String rsName = rs.getHostname(); RegionInfo r = location.getRegion();
RegionInfo r = location.getRegionInfo(); if (rsAndRMap.containsKey(rsName)) {
regions = rsAndRMap.get(rsName);
if (rsAndRMap.containsKey(rsName)) { } else {
regions = rsAndRMap.get(rsName); regions = new ArrayList<>();
} else { rsAndRMap.put(rsName, regions);
regions = new ArrayList<>(); }
rsAndRMap.put(rsName, regions); regions.add(r);
} }
regions.add(r);
} }
table.close();
} }
// get any live regionservers not serving any regions // get any live regionservers not serving any regions
for (ServerName rs : this.admin.getClusterMetrics(EnumSet.of(Option.LIVE_SERVERS)) for (ServerName rs: this.admin.getClusterMetrics(EnumSet.of(Option.LIVE_SERVERS))
.getLiveServerMetrics().keySet()) { .getLiveServerMetrics().keySet()) {
String rsName = rs.getHostname(); String rsName = rs.getHostname();
if (!rsAndRMap.containsKey(rsName)) { if (!rsAndRMap.containsKey(rsName)) {
@ -1538,19 +1580,9 @@ public final class Canary implements Tool {
} }
} }
} catch (IOException e) { } catch (IOException e) {
String msg = "Get HTables info failed"; LOG.error("Get HTables info failed", e);
LOG.error(msg, e);
this.errorCode = INIT_ERROR_EXIT_CODE; this.errorCode = INIT_ERROR_EXIT_CODE;
} finally {
if (table != null) {
try {
table.close();
} catch (IOException e) {
LOG.warn("Close table failed", e);
}
}
} }
return rsAndRMap; return rsAndRMap;
} }
@ -1576,13 +1608,13 @@ public final class Canary implements Tool {
} }
} }
if (!regExpFound) { if (!regExpFound) {
LOG.info("No RegionServerInfo found, regionServerPattern:" + rsName); LOG.info("No RegionServerInfo found, regionServerPattern {}", rsName);
} }
} else { } else {
if (fullRsAndRMap.containsKey(rsName)) { if (fullRsAndRMap.containsKey(rsName)) {
filteredRsAndRMap.put(rsName, fullRsAndRMap.get(rsName)); filteredRsAndRMap.put(rsName, fullRsAndRMap.get(rsName));
} else { } else {
LOG.info("No RegionServerInfo found, regionServerName:" + rsName); LOG.info("No RegionServerInfo found, regionServerName {}", rsName);
} }
} }
} }
@@ -1596,20 +1628,19 @@
   public static void main(String[] args) throws Exception {
     final Configuration conf = HBaseConfiguration.create();
-    // loading the generic options to conf
+    // Loading the generic options to conf
     new GenericOptionsParser(conf, args);
     int numThreads = conf.getInt("hbase.canary.threads.num", MAX_THREADS_NUM);
-    LOG.info("Number of execution threads " + numThreads);
+    LOG.info("Execution thread count={}", numThreads);
+    int exitCode = 0;
     ExecutorService executor = new ScheduledThreadPoolExecutor(numThreads);
-    Class<? extends Sink> sinkClass =
-        conf.getClass("hbase.canary.sink.class", RegionServerStdOutSink.class, Sink.class);
-    Sink sink = ReflectionUtils.newInstance(sinkClass);
-    int exitCode = ToolRunner.run(conf, new Canary(executor, sink), args);
-    executor.shutdown();
+    try {
+      exitCode = ToolRunner.run(conf, new Canary(executor), args);
+    } finally {
+      executor.shutdown();
+    }
     System.exit(exitCode);
   }
 }

hbase-server/src/test/java/org/apache/hadoop/hbase/tool/TestCanaryTool.java

@ -35,6 +35,7 @@ import org.apache.hadoop.hbase.*;
import org.apache.hadoop.hbase.HBaseClassTestRule; import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor; import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor;
import org.apache.hadoop.hbase.client.Put; import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.Table; import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.testclassification.MediumTests; import org.apache.hadoop.hbase.testclassification.MediumTests;
import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.Bytes;
@ -114,11 +115,11 @@ public class TestCanaryTool {
ExecutorService executor = new ScheduledThreadPoolExecutor(1); ExecutorService executor = new ScheduledThreadPoolExecutor(1);
Canary.RegionStdOutSink sink = spy(new Canary.RegionStdOutSink()); Canary.RegionStdOutSink sink = spy(new Canary.RegionStdOutSink());
Canary canary = new Canary(executor, sink); Canary canary = new Canary(executor, sink);
String[] args = { "-writeSniffing", "-t", "10000", name.getMethodName() }; String[] args = { "-writeSniffing", "-t", "10000", tableName.getNameAsString() };
assertEquals(0, ToolRunner.run(testingUtility.getConfiguration(), canary, args)); assertEquals(0, ToolRunner.run(testingUtility.getConfiguration(), canary, args));
assertEquals("verify no read error count", 0, canary.getReadFailures().size()); assertEquals("verify no read error count", 0, canary.getReadFailures().size());
assertEquals("verify no write error count", 0, canary.getWriteFailures().size()); assertEquals("verify no write error count", 0, canary.getWriteFailures().size());
verify(sink, atLeastOnce()).publishReadTiming(isA(ServerName.class), isA(HRegionInfo.class), verify(sink, atLeastOnce()).publishReadTiming(isA(ServerName.class), isA(RegionInfo.class),
isA(ColumnFamilyDescriptor.class), anyLong()); isA(ColumnFamilyDescriptor.class), anyLong());
} }
@ -144,7 +145,8 @@ public class TestCanaryTool {
Canary canary = new Canary(executor, sink); Canary canary = new Canary(executor, sink);
String configuredTimeoutStr = tableNames[0].getNameAsString() + "=" + Long.MAX_VALUE + "," + String configuredTimeoutStr = tableNames[0].getNameAsString() + "=" + Long.MAX_VALUE + "," +
tableNames[1].getNameAsString() + "=0"; tableNames[1].getNameAsString() + "=0";
String[] args = { "-readTableTimeouts", configuredTimeoutStr, name.getMethodName() + "1", name.getMethodName() + "2"}; String[] args = {"-readTableTimeouts", configuredTimeoutStr, name.getMethodName() + "1",
name.getMethodName() + "2"};
assertEquals(0, ToolRunner.run(testingUtility.getConfiguration(), canary, args)); assertEquals(0, ToolRunner.run(testingUtility.getConfiguration(), canary, args));
verify(sink, times(tableNames.length)).initializeAndGetReadLatencyForTable(isA(String.class)); verify(sink, times(tableNames.length)).initializeAndGetReadLatencyForTable(isA(String.class));
for (int i=0; i<2; i++) { for (int i=0; i<2; i++) {
@ -231,7 +233,7 @@ public class TestCanaryTool {
conf.setBoolean(HConstants.HBASE_CANARY_READ_RAW_SCAN_KEY, true); conf.setBoolean(HConstants.HBASE_CANARY_READ_RAW_SCAN_KEY, true);
assertEquals(0, ToolRunner.run(conf, canary, args)); assertEquals(0, ToolRunner.run(conf, canary, args));
verify(sink, atLeastOnce()) verify(sink, atLeastOnce())
.publishReadTiming(isA(ServerName.class), isA(HRegionInfo.class), .publishReadTiming(isA(ServerName.class), isA(RegionInfo.class),
isA(ColumnFamilyDescriptor.class), anyLong()); isA(ColumnFamilyDescriptor.class), anyLong());
assertEquals("verify no read error count", 0, canary.getReadFailures().size()); assertEquals("verify no read error count", 0, canary.getReadFailures().size());
} }

src/main/asciidoc/_chapters/ops_mgt.adoc

@ -82,45 +82,54 @@ Others, such as `hbase shell` (<<shell>>), `hbase upgrade` (<<upgrading>>), and
=== Canary === Canary
There is a Canary class can help users to canary-test the HBase cluster status, with every column-family for every regions or RegionServer's granularity. The Canary tool can help users "canary-test" the HBase cluster status.
To see the usage, use the `-help` parameter. The default "region mode" fetches a row from every column-family of every region.
In "regionserver mode", the Canary tool will fetch a row from a random
region on each of the cluster's RegionServers. In "zookeeper mode", the
Canary will read the root znode on each member of the zookeeper ensemble.
To see usage, pass the `-help` parameter (if you pass no
parameters, the Canary tool starts executing in the default
region "mode" fetching a row from every region in the cluster).
---- ----
$ ${HBASE_HOME}/bin/hbase canary -help 2018-10-16 13:11:27,037 INFO [main] tool.Canary: Execution thread count=16
Usage: canary [OPTIONS] [<TABLE1> [<TABLE2]...] | [<REGIONSERVER1> [<REGIONSERVER2]..]
Where [OPTIONS] are:
-h,-help show this help and exit.
-regionserver set 'regionserver mode'; gets row from random region on server
-allRegions get from ALL regions when 'regionserver mode', not just random one.
-zookeeper set 'zookeeper mode'; grab zookeeper.znode.parent on each ensemble member
-daemon continuous check at defined intervals.
-interval <N> interval between checks in seconds
-e consider table/regionserver argument as regular expression
-f <B> exit on first error; default=true
-failureAsError treat read/write failure as error
-t <N> timeout for canary-test run; default=600000ms
-writeSniffing enable write sniffing
-writeTable the table used for write sniffing; default=hbase:canary
-writeTableTimeout <N> timeout for writeTable; default=600000ms
-readTableTimeouts <tableName>=<read timeout>,<tableName>=<read timeout>,...
comma-separated list of table read timeouts (no spaces);
logs 'ERROR' if takes longer. default=600000ms
-permittedZookeeperFailures <N> Ignore first N failures attempting to
connect to individual zookeeper nodes in ensemble
Usage: hbase canary [opts] [table1 [table2]...] | [regionserver1 [regionserver2]..] -D<configProperty>=<value> to assign or override configuration params
where [opts] are: -Dhbase.canary.read.raw.enabled=<true/false> Set to enable/disable raw scan; default=false
-help Show this help and exit.
-regionserver replace the table argument to regionserver, Canary runs in one of three modes: region (default), regionserver, or zookeeper.
which means to enable regionserver mode To sniff/probe all regions, pass no arguments.
-allRegions Tries all regions on a regionserver, To sniff/probe all regions of a table, pass tablename.
only works in regionserver mode. To sniff/probe regionservers, pass -regionserver, etc.
-zookeeper Tries to grab zookeeper.znode.parent See http://hbase.apache.org/book.html#_canary for Canary documentation.
on each zookeeper instance
-daemon Continuous check at defined intervals.
-interval <N> Interval between checks (sec)
-e Use table/regionserver as regular expression
which means the table/regionserver is regular expression pattern
-f <B> stop whole program if first error occurs, default is true
-t <N> timeout for a check, default is 600000 (millisecs)
-writeTableTimeout <N> write timeout for the writeTable, default is 600000 (millisecs)
-readTableTimeouts <tableName>=<read timeout>,<tableName>=<read timeout>, ... comma-separated list of read timeouts per table (no spaces), default is 600000 (millisecs)
-writeSniffing enable the write sniffing in canary
-treatFailureAsError treats read / write failure as error
-writeTable The table used for write sniffing. Default is hbase:canary
-Dhbase.canary.read.raw.enabled=<true/false> Use this flag to enable or disable raw scan during read canary test Default is false and raw is not enabled during scan
-D<configProperty>=<value> assigning or override the configuration params
---- ----
[NOTE] [NOTE]
The `Sink` class is instantiated using the `hbase.canary.sink.class` configuration property which The `Sink` class is instantiated using the `hbase.canary.sink.class` configuration property.
will also determine the used Monitor class. In the absence of this property RegionServerStdOutSink
will be used. You need to use the Sink according to the passed parameters to the _canary_ command.
As an example you have to set `hbase.canary.sink.class` property to
`org.apache.hadoop.hbase.tool.Canary$RegionStdOutSink` for using table parameters.
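For example, to use the region-oriented sink when passing table arguments, the property can be
overridden on the command line via the generic `-D` option (a sketch; adjust the sink class to
the mode you run in):

----
$ ${HBASE_HOME}/bin/hbase canary -Dhbase.canary.sink.class=org.apache.hadoop.hbase.tool.Canary\$RegionStdOutSink test-01
----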
This tool will return non zero error codes to user for collaborating with other monitoring tools, such as Nagios. This tool returns non-zero error codes so it can be integrated with other monitoring
The error code definitions are: tools such as Nagios. The error code definitions are:
[source,java] [source,java]
---- ----
@ -131,9 +140,9 @@ private static final int ERROR_EXIT_CODE = 4;
private static final int FAILURE_EXIT_CODE = 5; private static final int FAILURE_EXIT_CODE = 5;
---- ----
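As a rough sketch of how these exit codes can be consumed by an external monitor, a wrapper
script (hypothetical, not shipped with HBase) could simply surface the canary's exit status:

----
#!/usr/bin/env bash
# Run a bounded canary check and hand its exit code to the monitoring system.
${HBASE_HOME}/bin/hbase canary -t 60000 -treatFailureAsError
status=$?
echo "hbase canary exited with code ${status}"
exit ${status}
----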
Here are some examples based on the following given case. Here are some examples based on the following case: two tables called test-01
There are two Table objects called test-01 and test-02, they have two column family cf1 and cf2 respectively, and deployed on the 3 RegionServers. and test-02, each with two column families cf1 and cf2, deployed on 3 RegionServers.
see following table. See the following table.
[cols="1,1,1", options="header"] [cols="1,1,1", options="header"]
|=== |===
@ -145,7 +154,7 @@ see following table.
| rs3 | r2 | r1 | rs3 | r2 | r1
|=== |===
Following are some examples based on the previous given case. Following are some example outputs based on the case given above.
==== Canary test for every column family (store) of every region of every table ==== Canary test for every column family (store) of every region of every table
@ -163,12 +172,13 @@ $ ${HBASE_HOME}/bin/hbase canary
13/12/09 03:26:32 INFO tool.Canary: read from region test-02,0004883,1386559511167.cbda32d5e2e276520712d84eaaa29d84. column family cf2 in 8ms 13/12/09 03:26:32 INFO tool.Canary: read from region test-02,0004883,1386559511167.cbda32d5e2e276520712d84eaaa29d84. column family cf2 in 8ms
---- ----
So you can see, table test-01 has two regions and two column families, so the Canary tool will pick 4 small piece of data from 4 (2 region * 2 store) different stores. So you can see, table test-01 has two regions and two column families, so the Canary tool in the
This is a default behavior of the this tool does. default "region mode" will pick 4 small pieces of data from 4 (2 regions * 2 stores) different stores.
This is the default behavior.
==== Canary test for every column family (store) of every region of specific table(s) ==== Canary test for every column family (store) of every region of specific tables
You can also test one or more specific tables. You can also test one or more specific tables by passing table names.
---- ----
$ ${HBASE_HOME}/bin/hbase canary test-01 test-02 $ ${HBASE_HOME}/bin/hbase canary test-01 test-02
@ -176,7 +186,9 @@ $ ${HBASE_HOME}/bin/hbase canary test-01 test-02
==== Canary test with RegionServer granularity ==== Canary test with RegionServer granularity
This will pick one small piece of data from each RegionServer, and can also put your RegionServer name as input options for canary-test specific RegionServer. In "regionserver mode", the Canary tool will pick one small piece of data
from each RegionServer (you can also pass one or more RegionServer names as arguments
to check only those servers).
---- ----
$ ${HBASE_HOME}/bin/hbase canary -regionserver $ ${HBASE_HOME}/bin/hbase canary -regionserver
@ -188,22 +200,25 @@ $ ${HBASE_HOME}/bin/hbase canary -regionserver
==== Canary test with regular expression pattern ==== Canary test with regular expression pattern
This will test both table test-01 and test-02. You can pass regexes for table names when in "region mode" or for server names when
in "regionserver mode". The example below tests both tables test-01 and test-02.
---- ----
$ ${HBASE_HOME}/bin/hbase canary -e test-0[1-2] $ ${HBASE_HOME}/bin/hbase canary -e test-0[1-2]
---- ----
==== Run canary test as daemon mode ==== Run canary test as a "daemon"
Run repeatedly with interval defined in option `-interval` whose default value is 60 seconds. Run repeatedly with an interval defined via the option `-interval` (default value is 60 seconds).
This daemon will stop itself and return non-zero error code if any error occurs, due to the default value of option -f is true. This daemon will stop itself and return a non-zero error code if any error occurs. To have
the daemon keep running across errors, pass the `-f` flag with its value set to false
(see usage above).
---- ----
$ ${HBASE_HOME}/bin/hbase canary -daemon $ ${HBASE_HOME}/bin/hbase canary -daemon
---- ----
Run repeatedly with 5 second intervals and will not stop itself even if errors occur in the test. To run repeatedly at 5-second intervals without stopping on errors, do the following.
---- ----
$ ${HBASE_HOME}/bin/hbase canary -daemon -interval 5 -f false $ ${HBASE_HOME}/bin/hbase canary -daemon -interval 5 -f false
@ -211,9 +226,11 @@ $ ${HBASE_HOME}/bin/hbase canary -daemon -interval 5 -f false
==== Force timeout if canary test stuck ==== Force timeout if canary test stuck
In some cases the request is stuck and no response is sent back to the client. This can happen with dead RegionServers which the master has not yet noticed. In some cases the request is stuck and no response is sent back to the client. This
Because of this we provide a timeout option to kill the canary test and return a non-zero error code. can happen with dead RegionServers which the master has not yet noticed.
This run sets the timeout value to 60 seconds, the default value is 600 seconds. Because of this we provide a timeout option to kill the canary test and return a
non-zero error code. The example below sets the timeout value to 60 seconds (the default value
is 600 seconds).
---- ----
$ ${HBASE_HOME}/bin/hbase canary -t 60000 $ ${HBASE_HOME}/bin/hbase canary -t 60000
@ -221,36 +238,37 @@ $ ${HBASE_HOME}/bin/hbase canary -t 60000
==== Enable write sniffing in canary ==== Enable write sniffing in canary
By default, the canary tool only check the read operations, it's hard to find the problem in the By default, the canary tool only checks read operations. To enable the write sniffing,
write path. To enable the write sniffing, you can run canary with the `-writeSniffing` option. you can run the canary with the `-writeSniffing` option set. When write sniffing is
When the write sniffing is enabled, the canary tool will create an hbase table and make sure the enabled, the canary tool will create an hbase table and make sure the
regions of the table distributed on all region servers. In each sniffing period, the canary will regions of the table are distributed to all region servers. In each sniffing period,
try to put data to these regions to check the write availability of each region server. the canary will try to put data to these regions to check the write availability of
each region server.
---- ----
$ ${HBASE_HOME}/bin/hbase canary -writeSniffing $ ${HBASE_HOME}/bin/hbase canary -writeSniffing
---- ----
The default write table is `hbase:canary` and can be specified by the option `-writeTable`. The default write table is `hbase:canary` and can be specified with the option `-writeTable`.
---- ----
$ ${HBASE_HOME}/bin/hbase canary -writeSniffing -writeTable ns:canary $ ${HBASE_HOME}/bin/hbase canary -writeSniffing -writeTable ns:canary
---- ----
The default value size of each put is 10 bytes and you can set it by the config key: The default value size of each put is 10 bytes. You can set it via the config key:
`hbase.canary.write.value.size`. `hbase.canary.write.value.size`.
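For example, the put value size could be overridden on the command line via the generic `-D`
option (a sketch; the 1024-byte value is only an illustration):

----
$ ${HBASE_HOME}/bin/hbase canary -Dhbase.canary.write.value.size=1024 -writeSniffing
----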
==== Treat read / write failure as error ==== Treat read / write failure as error
By default, the canary tool only logs read failure, due to e.g. RetriesExhaustedException, By default, the canary tool only logs read failures (due to e.g. a RetriesExhaustedException)
while returning normal exit code. To treat read / write failure as error, you can run canary and returns the 'normal' exit code. To treat read/write failures as errors, you can run canary
with the `-treatFailureAsError` option. When enabled, read / write failure would result in error with the `-treatFailureAsError` option. When enabled, read/write failures will result in an
exit code. error exit code.
---- ----
$ ${HBASE_HOME}/bin/hbase canary -treatFailureAsError $ ${HBASE_HOME}/bin/hbase canary -treatFailureAsError
---- ----
==== Running Canary in a Kerberos-enabled Cluster ==== Running Canary in a Kerberos-enabled Cluster
To run Canary in a Kerberos-enabled cluster, configure the following two properties in _hbase-site.xml_: To run the Canary in a Kerberos-enabled cluster, configure the following two properties in _hbase-site.xml_:
* `hbase.client.keytab.file` * `hbase.client.keytab.file`
* `hbase.client.kerberos.principal` * `hbase.client.kerberos.principal`