HBASE-21320 [canary] Cleanup of usage and add commentary
Signed-off-by: Peter Somogyi <psomogyi@cloudera.com>
This commit is contained in:
parent
b0846fb762
commit
999a3c67d4
|
@ -1362,6 +1362,7 @@ public final class HConstants {
|
|||
"hbase.regionserver.region.split.threads.max";
|
||||
|
||||
/** Canary config keys */
|
||||
// TODO: Move these defines to Canary Class
|
||||
public static final String HBASE_CANARY_WRITE_DATA_TTL_KEY = "hbase.canary.write.data.ttl";
|
||||
|
||||
public static final String HBASE_CANARY_WRITE_PERSERVER_REGIONS_LOWERLIMIT_KEY =
|
||||
|
|
|
@ -90,6 +90,7 @@ import org.apache.hadoop.hbase.zookeeper.ZKConfig;
|
|||
import org.apache.hadoop.util.GenericOptionsParser;
|
||||
import org.apache.hadoop.util.Tool;
|
||||
import org.apache.hadoop.util.ToolRunner;
|
||||
import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;
|
||||
import org.apache.yetus.audience.InterfaceAudience;
|
||||
import org.apache.zookeeper.KeeperException;
|
||||
import org.apache.zookeeper.ZooKeeper;
|
||||
|
@ -101,39 +102,45 @@ import org.slf4j.LoggerFactory;
|
|||
import org.apache.hbase.thirdparty.com.google.common.collect.Lists;
|
||||
|
||||
/**
|
||||
* HBase Canary Tool, that that can be used to do
|
||||
* "canary monitoring" of a running HBase cluster.
|
||||
* HBase Canary Tool for "canary monitoring" of a running HBase cluster.
|
||||
*
|
||||
* Here are three modes
|
||||
* 1. region mode - Foreach region tries to get one row per column family
|
||||
* and outputs some information about failure or latency.
|
||||
* There are three modes:
|
||||
* <ol>
|
||||
* <li>region mode (Default): For each region, try to get one row per column family outputting
|
||||
* information on failure (ERROR) or else the latency.
|
||||
* </li>
|
||||
*
|
||||
* 2. regionserver mode - Foreach regionserver tries to get one row from one table
|
||||
* selected randomly and outputs some information about failure or latency.
|
||||
* <li>regionserver mode: For each regionserver try to get one row from one table selected
|
||||
* randomly outputting information on failure (ERROR) or else the latency.
|
||||
* </li>
|
||||
*
|
||||
* 3. zookeeper mode - for each zookeeper instance, selects a zNode and
|
||||
* outputs some information about failure or latency.
|
||||
* <li>zookeeper mode: for each zookeeper instance, selects a znode outputting information on
|
||||
* failure (ERROR) or else the latency.
|
||||
* </li>
|
||||
* </ol>
|
||||
*/
|
||||
@InterfaceAudience.Private
|
||||
public final class Canary implements Tool {
|
||||
// Sink interface used by the canary to outputs information
|
||||
/**
|
||||
* Sink interface used by the canary to output information
|
||||
*/
|
||||
public interface Sink {
|
||||
public long getReadFailureCount();
|
||||
public long incReadFailureCount();
|
||||
public Map<String,String> getReadFailures();
|
||||
public void updateReadFailures(String regionName, String serverName);
|
||||
public long getWriteFailureCount();
|
||||
public long incWriteFailureCount();
|
||||
public Map<String,String> getWriteFailures();
|
||||
public void updateWriteFailures(String regionName, String serverName);
|
||||
long getReadFailureCount();
|
||||
long incReadFailureCount();
|
||||
Map<String,String> getReadFailures();
|
||||
void updateReadFailures(String regionName, String serverName);
|
||||
long getWriteFailureCount();
|
||||
long incWriteFailureCount();
|
||||
Map<String,String> getWriteFailures();
|
||||
void updateWriteFailures(String regionName, String serverName);
|
||||
}
|
||||
|
||||
// Simple implementation of canary sink that allows to plot on
|
||||
// file or standard output timings or failures.
|
||||
/**
|
||||
* Simple implementation of canary sink that allows plotting to a file or standard output.
|
||||
*/
|
||||
public static class StdOutSink implements Sink {
|
||||
private AtomicLong readFailureCount = new AtomicLong(0),
|
||||
writeFailureCount = new AtomicLong(0);
|
||||
|
||||
private Map<String, String> readFailures = new ConcurrentHashMap<>();
|
||||
private Map<String, String> writeFailures = new ConcurrentHashMap<>();
|
||||
|
||||
|
@ -178,67 +185,75 @@ public final class Canary implements Tool {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* By RegionServer, for 'regionserver' mode.
|
||||
*/
|
||||
public static class RegionServerStdOutSink extends StdOutSink {
|
||||
|
||||
public void publishReadFailure(String table, String server) {
|
||||
incReadFailureCount();
|
||||
LOG.error(String.format("Read from table:%s on region server:%s", table, server));
|
||||
LOG.error("Read from {} on {}", table, server);
|
||||
}
|
||||
|
||||
public void publishReadTiming(String table, String server, long msTime) {
|
||||
LOG.info(String.format("Read from table:%s on region server:%s in %dms",
|
||||
table, server, msTime));
|
||||
LOG.info("Read from {} on {} in {}ms", table, server, msTime);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Output for 'zookeeper' mode.
|
||||
*/
|
||||
public static class ZookeeperStdOutSink extends StdOutSink {
|
||||
|
||||
public void publishReadFailure(String zNode, String server) {
|
||||
public void publishReadFailure(String znode, String server) {
|
||||
incReadFailureCount();
|
||||
LOG.error(String.format("Read from zNode:%s on zookeeper instance:%s", zNode, server));
|
||||
LOG.error("Read from {} on {}", znode, server);
|
||||
}
|
||||
|
||||
public void publishReadTiming(String znode, String server, long msTime) {
|
||||
LOG.info(String.format("Read from zNode:%s on zookeeper instance:%s in %dms",
|
||||
znode, server, msTime));
|
||||
LOG.info("Read from {} on {} in {}ms", znode, server, msTime);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* By Region, for 'region' mode.
|
||||
*/
|
||||
public static class RegionStdOutSink extends StdOutSink {
|
||||
|
||||
private Map<String, LongAdder> perTableReadLatency = new HashMap<>();
|
||||
private LongAdder writeLatency = new LongAdder();
|
||||
|
||||
public void publishReadFailure(ServerName serverName, RegionInfo region, Exception e) {
|
||||
incReadFailureCount();
|
||||
LOG.error(String.format("read from region %s on regionserver %s failed", region.getRegionNameAsString(), serverName), e);
|
||||
LOG.error("Read from {} on {} failed", region.getRegionNameAsString(), serverName, e);
|
||||
}
|
||||
|
||||
public void publishReadFailure(ServerName serverName, RegionInfo region, ColumnFamilyDescriptor column, Exception e) {
|
||||
public void publishReadFailure(ServerName serverName, RegionInfo region,
|
||||
ColumnFamilyDescriptor column, Exception e) {
|
||||
incReadFailureCount();
|
||||
LOG.error(String.format("read from region %s on regionserver %s column family %s failed",
|
||||
region.getRegionNameAsString(), serverName, column.getNameAsString()), e);
|
||||
LOG.error("Read from {} on {} {} failed", region.getRegionNameAsString(), serverName,
|
||||
column.getNameAsString(), e);
|
||||
}
|
||||
|
||||
public void publishReadTiming(ServerName serverName, RegionInfo region, ColumnFamilyDescriptor column, long msTime) {
|
||||
LOG.info(String.format("read from region %s on regionserver %s column family %s in %dms",
|
||||
region.getRegionNameAsString(), serverName, column.getNameAsString(), msTime));
|
||||
public void publishReadTiming(ServerName serverName, RegionInfo region,
|
||||
ColumnFamilyDescriptor column, long msTime) {
|
||||
LOG.info("Read from {} on {} {} in {}ms", region.getRegionNameAsString(), serverName,
|
||||
column.getNameAsString(), msTime);
|
||||
}
|
||||
|
||||
public void publishWriteFailure(ServerName serverName, RegionInfo region, Exception e) {
|
||||
incWriteFailureCount();
|
||||
LOG.error(String.format("write to region %s on regionserver %s failed", region.getRegionNameAsString(), serverName), e);
|
||||
LOG.error("Write to {} on {} failed", region.getRegionNameAsString(), serverName, e);
|
||||
}
|
||||
|
||||
public void publishWriteFailure(ServerName serverName, RegionInfo region, ColumnFamilyDescriptor column, Exception e) {
|
||||
public void publishWriteFailure(ServerName serverName, RegionInfo region,
|
||||
ColumnFamilyDescriptor column, Exception e) {
|
||||
incWriteFailureCount();
|
||||
LOG.error(String.format("write to region %s on regionserver %s column family %s failed",
|
||||
region.getRegionNameAsString(), serverName, column.getNameAsString()), e);
|
||||
LOG.error("Write to {} on {} {} failed", region.getRegionNameAsString(), serverName,
|
||||
column.getNameAsString(), e);
|
||||
}
|
||||
|
||||
public void publishWriteTiming(ServerName serverName, RegionInfo region, ColumnFamilyDescriptor column, long msTime) {
|
||||
LOG.info(String.format("write to region %s on regionserver %s column family %s in %dms",
|
||||
region.getRegionNameAsString(), serverName, column.getNameAsString(), msTime));
|
||||
public void publishWriteTiming(ServerName serverName, RegionInfo region,
|
||||
ColumnFamilyDescriptor column, long msTime) {
|
||||
LOG.info("Write to {} on {} {} in {}ms",
|
||||
region.getRegionNameAsString(), serverName, column.getNameAsString(), msTime);
|
||||
}
|
||||
|
||||
public Map<String, LongAdder> getReadLatencyMap() {
|
||||
|
@ -260,6 +275,9 @@ public final class Canary implements Tool {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Run a single zookeeper Task and then exit.
|
||||
*/
|
||||
static class ZookeeperTask implements Callable<Void> {
|
||||
private final Connection connection;
|
||||
private final String host;
|
||||
|
@ -298,8 +316,8 @@ public final class Canary implements Tool {
|
|||
}
|
||||
|
||||
/**
|
||||
* For each column family of the region tries to get one row and outputs the latency, or the
|
||||
* failure.
|
||||
* Run a single Region Task and then exit. For each column family of the Region, get one row and
|
||||
* output latency or failure.
|
||||
*/
|
||||
static class RegionTask implements Callable<Void> {
|
||||
public enum TaskType{
|
||||
|
@ -313,8 +331,8 @@ public final class Canary implements Tool {
|
|||
private ServerName serverName;
|
||||
private LongAdder readWriteLatency;
|
||||
|
||||
RegionTask(Connection connection, RegionInfo region, ServerName serverName, RegionStdOutSink sink,
|
||||
TaskType taskType, boolean rawScanEnabled, LongAdder rwLatency) {
|
||||
RegionTask(Connection connection, RegionInfo region, ServerName serverName,
|
||||
RegionStdOutSink sink, TaskType taskType, boolean rawScanEnabled, LongAdder rwLatency) {
|
||||
this.connection = connection;
|
||||
this.region = region;
|
||||
this.serverName = serverName;
|
||||
|
@ -340,14 +358,11 @@ public final class Canary implements Tool {
|
|||
Table table = null;
|
||||
TableDescriptor tableDesc = null;
|
||||
try {
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.debug(String.format("reading table descriptor for table %s",
|
||||
region.getTable()));
|
||||
}
|
||||
LOG.debug("Reading table descriptor for table {}", region.getTable());
|
||||
table = connection.getTable(region.getTable());
|
||||
tableDesc = table.getDescriptor();
|
||||
} catch (IOException e) {
|
||||
LOG.debug("sniffRegion failed", e);
|
||||
LOG.debug("sniffRegion {} of {} failed", region.getEncodedName(), e);
|
||||
sink.publishReadFailure(serverName, region, e);
|
||||
if (table != null) {
|
||||
try {
|
||||
|
@ -375,10 +390,7 @@ public final class Canary implements Tool {
|
|||
get.addFamily(column.getName());
|
||||
} else {
|
||||
scan = new Scan();
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.debug(String.format("rawScan : %s for table: %s", rawScanEnabled,
|
||||
tableDesc.getTableName()));
|
||||
}
|
||||
LOG.debug("rawScan {} for {}", rawScanEnabled, tableDesc.getTableName());
|
||||
scan.setRaw(rawScanEnabled);
|
||||
scan.setCaching(1);
|
||||
scan.setCacheBlocks(false);
|
||||
|
@ -387,12 +399,9 @@ public final class Canary implements Tool {
|
|||
scan.setMaxResultSize(1L);
|
||||
scan.setOneRowLimit();
|
||||
}
|
||||
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.debug(String.format("reading from table %s region %s column family %s and key %s",
|
||||
tableDesc.getTableName(), region.getRegionNameAsString(), column.getNameAsString(),
|
||||
Bytes.toStringBinary(startKey)));
|
||||
}
|
||||
LOG.debug("Reading from {} {} {} {}", tableDesc.getTableName(),
|
||||
region.getRegionNameAsString(), column.getNameAsString(),
|
||||
Bytes.toStringBinary(startKey));
|
||||
try {
|
||||
stopWatch.start();
|
||||
if (startKey.length > 0) {
|
||||
|
@ -425,7 +434,6 @@ public final class Canary implements Tool {
|
|||
|
||||
/**
|
||||
* Check writes for the canary table
|
||||
* @return
|
||||
*/
|
||||
private Void write() {
|
||||
Table table = null;
|
||||
|
@ -445,11 +453,9 @@ public final class Canary implements Tool {
|
|||
Bytes.random(value);
|
||||
put.addColumn(column.getName(), HConstants.EMPTY_BYTE_ARRAY, value);
|
||||
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.debug(String.format("writing to table %s region %s column family %s and key %s",
|
||||
tableDesc.getTableName(), region.getRegionNameAsString(), column.getNameAsString(),
|
||||
Bytes.toStringBinary(rowToCheck)));
|
||||
}
|
||||
LOG.debug("Writing to {} {} {} {}",
|
||||
tableDesc.getTableName(), region.getRegionNameAsString(), column.getNameAsString(),
|
||||
Bytes.toStringBinary(rowToCheck));
|
||||
try {
|
||||
long startTime = System.currentTimeMillis();
|
||||
table.put(put);
|
||||
|
@ -470,7 +476,8 @@ public final class Canary implements Tool {
|
|||
}
|
||||
|
||||
/**
|
||||
* Get one row from a region on the regionserver and outputs the latency, or the failure.
|
||||
* Run a single RegionServer Task and then exit.
|
||||
* Get one row from a region on the regionserver and output latency or the failure.
|
||||
*/
|
||||
static class RegionServerTask implements Callable<Void> {
|
||||
private Connection connection;
|
||||
|
@ -503,11 +510,9 @@ public final class Canary implements Tool {
|
|||
table = connection.getTable(tableName);
|
||||
startKey = region.getStartKey();
|
||||
// Can't do a get on empty start row so do a Scan of first element if any instead.
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.debug(String.format("reading from region server %s table %s region %s and key %s",
|
||||
serverName, region.getTable(), region.getRegionNameAsString(),
|
||||
Bytes.toStringBinary(startKey)));
|
||||
}
|
||||
LOG.debug("Reading from {} {} {} {}",
|
||||
serverName, region.getTable(), region.getRegionNameAsString(),
|
||||
Bytes.toStringBinary(startKey));
|
||||
if (startKey.length > 0) {
|
||||
get = new Get(startKey);
|
||||
get.setCacheBlocks(false);
|
||||
|
@ -584,23 +589,43 @@ public final class Canary implements Tool {
|
|||
private boolean useRegExp;
|
||||
private long timeout = DEFAULT_TIMEOUT;
|
||||
private boolean failOnError = true;
|
||||
|
||||
/**
|
||||
* True if we are to run in 'regionServer' mode.
|
||||
*/
|
||||
private boolean regionServerMode = false;
|
||||
|
||||
/**
|
||||
* True if we are to run in zookeeper 'mode'.
|
||||
*/
|
||||
private boolean zookeeperMode = false;
|
||||
|
||||
private long permittedFailures = 0;
|
||||
private boolean regionServerAllRegions = false;
|
||||
private boolean writeSniffing = false;
|
||||
private long configuredWriteTableTimeout = DEFAULT_TIMEOUT;
|
||||
private boolean treatFailureAsError = false;
|
||||
private TableName writeTableName = DEFAULT_WRITE_TABLE_NAME;
|
||||
|
||||
/**
|
||||
* This is a Map of table to timeout. The timeout is for reading all regions in the table; i.e.
|
||||
* we aggregate time to fetch each region and it needs to be less than this value else we
|
||||
* log an ERROR.
|
||||
*/
|
||||
private HashMap<String, Long> configuredReadTableTimeouts = new HashMap<>();
|
||||
|
||||
private ExecutorService executor; // threads to retrieve data from regionservers
|
||||
|
||||
public Canary() {
|
||||
this(new ScheduledThreadPoolExecutor(1), new RegionServerStdOutSink());
|
||||
this(new ScheduledThreadPoolExecutor(1));
|
||||
}
|
||||
|
||||
public Canary(ExecutorService executor, Sink sink) {
|
||||
public Canary(ExecutorService executor) {
|
||||
this(executor, null);
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
Canary(ExecutorService executor, Sink sink) {
|
||||
this.executor = executor;
|
||||
this.sink = sink;
|
||||
}
|
||||
|
@ -628,7 +653,7 @@ public final class Canary implements Tool {
|
|||
printUsageAndExit();
|
||||
}
|
||||
|
||||
if (cmd.equals("-help")) {
|
||||
if (cmd.equals("-help") || cmd.equals("-h")) {
|
||||
// user asked for help, print the help and quit.
|
||||
printUsageAndExit();
|
||||
} else if (cmd.equals("-daemon") && interval == 0) {
|
||||
|
@ -639,7 +664,7 @@ public final class Canary implements Tool {
|
|||
i++;
|
||||
|
||||
if (i == args.length) {
|
||||
System.err.println("-interval needs a numeric value argument.");
|
||||
System.err.println("-interval takes a numeric seconds value argument.");
|
||||
printUsageAndExit();
|
||||
}
|
||||
|
||||
|
@ -657,7 +682,7 @@ public final class Canary implements Tool {
|
|||
this.regionServerAllRegions = true;
|
||||
} else if(cmd.equals("-writeSniffing")) {
|
||||
this.writeSniffing = true;
|
||||
} else if(cmd.equals("-treatFailureAsError")) {
|
||||
} else if(cmd.equals("-treatFailureAsError") || cmd.equals("-failureAsError")) {
|
||||
this.treatFailureAsError = true;
|
||||
} else if (cmd.equals("-e")) {
|
||||
this.useRegExp = true;
|
||||
|
@ -665,35 +690,35 @@ public final class Canary implements Tool {
|
|||
i++;
|
||||
|
||||
if (i == args.length) {
|
||||
System.err.println("-t needs a numeric value argument.");
|
||||
System.err.println("-t takes a numeric milliseconds value argument.");
|
||||
printUsageAndExit();
|
||||
}
|
||||
|
||||
try {
|
||||
this.timeout = Long.parseLong(args[i]);
|
||||
} catch (NumberFormatException e) {
|
||||
System.err.println("-t needs a numeric value argument.");
|
||||
System.err.println("-t takes a numeric milliseconds value argument.");
|
||||
printUsageAndExit();
|
||||
}
|
||||
} else if(cmd.equals("-writeTableTimeout")) {
|
||||
i++;
|
||||
|
||||
if (i == args.length) {
|
||||
System.err.println("-writeTableTimeout needs a numeric value argument.");
|
||||
System.err.println("-writeTableTimeout takes a numeric milliseconds value argument.");
|
||||
printUsageAndExit();
|
||||
}
|
||||
|
||||
try {
|
||||
this.configuredWriteTableTimeout = Long.parseLong(args[i]);
|
||||
} catch (NumberFormatException e) {
|
||||
System.err.println("-writeTableTimeout needs a numeric value argument.");
|
||||
System.err.println("-writeTableTimeout takes a numeric milliseconds value argument.");
|
||||
printUsageAndExit();
|
||||
}
|
||||
} else if (cmd.equals("-writeTable")) {
|
||||
i++;
|
||||
|
||||
if (i == args.length) {
|
||||
System.err.println("-writeTable needs a string value argument.");
|
||||
System.err.println("-writeTable takes a string tablename value argument.");
|
||||
printUsageAndExit();
|
||||
}
|
||||
this.writeTableName = TableName.valueOf(args[i]);
|
||||
|
@ -711,14 +736,16 @@ public final class Canary implements Tool {
|
|||
i++;
|
||||
|
||||
if (i == args.length) {
|
||||
System.err.println("-readTableTimeouts needs a comma-separated list of read timeouts per table (without spaces).");
|
||||
System.err.println("-readTableTimeouts needs a comma-separated list of read " +
|
||||
"millisecond timeouts per table (without spaces).");
|
||||
printUsageAndExit();
|
||||
}
|
||||
String [] tableTimeouts = args[i].split(",");
|
||||
for (String tT: tableTimeouts) {
|
||||
String [] nameTimeout = tT.split("=");
|
||||
if (nameTimeout.length < 2) {
|
||||
System.err.println("Each -readTableTimeouts argument must be of the form <tableName>=<read timeout>.");
|
||||
System.err.println("Each -readTableTimeouts argument must be of the form " +
|
||||
"<tableName>=<read timeout> (without spaces).");
|
||||
printUsageAndExit();
|
||||
}
|
||||
long timeoutVal = 0L;
|
||||
|
@ -856,41 +883,56 @@ public final class Canary implements Tool {
|
|||
|
||||
private void printUsageAndExit() {
|
||||
System.err.println(
|
||||
"Usage: hbase canary [opts] [table1 [table2]...] | [regionserver1 [regionserver2]..]");
|
||||
System.err.println(" where [opts] are:");
|
||||
System.err.println(" -help Show this help and exit.");
|
||||
System.err.println(" -regionserver replace the table argument to regionserver,");
|
||||
System.err.println(" which means to enable regionserver mode");
|
||||
System.err.println(" -allRegions Tries all regions on a regionserver,");
|
||||
System.err.println(" only works in regionserver mode.");
|
||||
System.err.println(" -zookeeper Tries to grab zookeeper.znode.parent ");
|
||||
System.err.println(" on each zookeeper instance");
|
||||
System.err.println(" -permittedZookeeperFailures <N> Ignore first N failures when attempting to ");
|
||||
System.err.println(" connect to individual zookeeper nodes in the ensemble");
|
||||
System.err.println(" -daemon Continuous check at defined intervals.");
|
||||
System.err.println(" -interval <N> Interval between checks (sec)");
|
||||
System.err.println(" -e Use table/regionserver as regular expression");
|
||||
System.err.println(" which means the table/regionserver is regular expression pattern");
|
||||
System.err.println(" -f <B> stop whole program if first error occurs," +
|
||||
" default is true");
|
||||
System.err.println(" -t <N> timeout for a check, default is 600000 (millisecs)");
|
||||
System.err.println(" -writeTableTimeout <N> write timeout for the writeTable, default is 600000 (millisecs)");
|
||||
System.err.println(" -readTableTimeouts <tableName>=<read timeout>,<tableName>=<read timeout>, ... "
|
||||
+ "comma-separated list of read timeouts per table (no spaces), default is 600000 (millisecs)");
|
||||
System.err.println(" -writeSniffing enable the write sniffing in canary");
|
||||
System.err.println(" -treatFailureAsError treats read / write failure as error");
|
||||
System.err.println(" -writeTable The table used for write sniffing."
|
||||
+ " Default is hbase:canary");
|
||||
System.err.println(" -Dhbase.canary.read.raw.enabled=<true/false> Use this flag to enable or disable raw scan during read canary test"
|
||||
+ " Default is false and raw is not enabled during scan");
|
||||
System.err
|
||||
.println(" -D<configProperty>=<value> assigning or override the configuration params");
|
||||
"Usage: canary [OPTIONS] [<TABLE1> [<TABLE2]...] | [<REGIONSERVER1> [<REGIONSERVER2]..]");
|
||||
System.err.println("Where [OPTIONS] are:");
|
||||
System.err.println(" -h,-help show this help and exit.");
|
||||
System.err.println(" -regionserver set 'regionserver mode'; gets row from random region on " +
|
||||
"server");
|
||||
System.err.println(" -allRegions get from ALL regions when 'regionserver mode', not just " +
|
||||
"random one.");
|
||||
System.err.println(" -zookeeper set 'zookeeper mode'; grab zookeeper.znode.parent on " +
|
||||
"each ensemble member");
|
||||
System.err.println(" -daemon continuous check at defined intervals.");
|
||||
System.err.println(" -interval <N> interval between checks in seconds");
|
||||
System.err.println(" -e consider table/regionserver argument as regular " +
|
||||
"expression");
|
||||
System.err.println(" -f <B> exit on first error; default=true");
|
||||
System.err.println(" -failureAsError treat read/write failure as error");
|
||||
System.err.println(" -t <N> timeout for canary-test run; default=600000ms");
|
||||
System.err.println(" -writeSniffing enable write sniffing");
|
||||
System.err.println(" -writeTable the table used for write sniffing; default=hbase:canary");
|
||||
System.err.println(" -writeTableTimeout <N> timeout for writeTable; default=600000ms");
|
||||
System.err.println(" -readTableTimeouts <tableName>=<read timeout>," +
|
||||
"<tableName>=<read timeout>,...");
|
||||
System.err.println(" comma-separated list of table read timeouts " +
|
||||
"(no spaces);");
|
||||
System.err.println(" logs 'ERROR' if takes longer. default=600000ms");
|
||||
System.err.println(" -permittedZookeeperFailures <N> Ignore first N failures attempting to ");
|
||||
System.err.println(" connect to individual zookeeper nodes in ensemble");
|
||||
System.err.println("");
|
||||
System.err.println(" -D<configProperty>=<value> to assign or override configuration params");
|
||||
System.err.println(" -Dhbase.canary.read.raw.enabled=<true/false> Set to enable/disable " +
|
||||
"raw scan; default=false");
|
||||
System.err.println("");
|
||||
System.err.println("Canary runs in one of three modes: region (default), regionserver, or " +
|
||||
"zookeeper.");
|
||||
System.err.println("To sniff/probe all regions, pass no arguments.");
|
||||
System.err.println("To sniff/probe all regions of a table, pass tablename.");
|
||||
System.err.println("To sniff/probe regionservers, pass -regionserver, etc.");
|
||||
System.err.println("See http://hbase.apache.org/book.html#_canary for Canary documentation.");
|
||||
System.exit(USAGE_EXIT_CODE);
|
||||
}
|
||||
|
||||
Sink getSink(Configuration configuration, Class clazz) {
|
||||
// In test context, this.sink might be set. Use it if non-null. For testing.
|
||||
return this.sink != null? this.sink:
|
||||
(Sink)ReflectionUtils.newInstance(configuration.getClass("hbase.canary.sink.class",
|
||||
clazz, Sink.class));
|
||||
}
|
||||
|
||||
/**
|
||||
* A Factory method for {@link Monitor}.
|
||||
* Can be overridden by user.
|
||||
* Makes a RegionServerMonitor, or a ZooKeeperMonitor, or a RegionMonitor.
|
||||
* @param index a start index for monitor target
|
||||
* @param args args passed from user
|
||||
* @return a Monitor instance
|
||||
|
@ -899,37 +941,45 @@ public final class Canary implements Tool {
|
|||
Monitor monitor = null;
|
||||
String[] monitorTargets = null;
|
||||
|
||||
if(index >= 0) {
|
||||
if (index >= 0) {
|
||||
int length = args.length - index;
|
||||
monitorTargets = new String[length];
|
||||
System.arraycopy(args, index, monitorTargets, 0, length);
|
||||
}
|
||||
|
||||
if (this.sink instanceof RegionServerStdOutSink || this.regionServerMode) {
|
||||
if (this.regionServerMode) {
|
||||
monitor =
|
||||
new RegionServerMonitor(connection, monitorTargets, this.useRegExp,
|
||||
(StdOutSink) this.sink, this.executor, this.regionServerAllRegions,
|
||||
getSink(connection.getConfiguration(), RegionServerStdOutSink.class),
|
||||
this.executor, this.regionServerAllRegions,
|
||||
this.treatFailureAsError, this.permittedFailures);
|
||||
} else if (this.sink instanceof ZookeeperStdOutSink || this.zookeeperMode) {
|
||||
} else if (this.zookeeperMode) {
|
||||
monitor =
|
||||
new ZookeeperMonitor(connection, monitorTargets, this.useRegExp,
|
||||
(StdOutSink) this.sink, this.executor, this.treatFailureAsError,
|
||||
getSink(connection.getConfiguration(), ZookeeperStdOutSink.class),
|
||||
this.executor, this.treatFailureAsError,
|
||||
this.permittedFailures);
|
||||
} else {
|
||||
monitor =
|
||||
new RegionMonitor(connection, monitorTargets, this.useRegExp,
|
||||
(StdOutSink) this.sink, this.executor, this.writeSniffing,
|
||||
getSink(connection.getConfiguration(), RegionStdOutSink.class),
|
||||
this.executor, this.writeSniffing,
|
||||
this.writeTableName, this.treatFailureAsError, this.configuredReadTableTimeouts,
|
||||
this.configuredWriteTableTimeout, this.permittedFailures);
|
||||
}
|
||||
return monitor;
|
||||
}
|
||||
|
||||
// a Monitor super-class can be extended by users
|
||||
/**
|
||||
* A Monitor super-class can be extended by users
|
||||
*/
|
||||
public static abstract class Monitor implements Runnable, Closeable {
|
||||
|
||||
protected Connection connection;
|
||||
protected Admin admin;
|
||||
/**
|
||||
* 'Target' dependent on 'mode'. Could be Tables or RegionServers or ZNodes.
|
||||
* Passed on the command-line as arguments.
|
||||
*/
|
||||
protected String[] targets;
|
||||
protected boolean useRegExp;
|
||||
protected boolean treatFailureAsError;
|
||||
|
@ -999,7 +1049,9 @@ public final class Canary implements Tool {
|
|||
}
|
||||
}
|
||||
|
||||
// a monitor for region mode
|
||||
/**
|
||||
* A monitor for region mode.
|
||||
*/
|
||||
private static class RegionMonitor extends Monitor {
|
||||
// 10 minutes
|
||||
private static final int DEFAULT_WRITE_TABLE_CHECK_PERIOD = 10 * 60 * 1000;
|
||||
|
@ -1014,14 +1066,22 @@ public final class Canary implements Tool {
|
|||
private float regionsUpperLimit;
|
||||
private int checkPeriod;
|
||||
private boolean rawScanEnabled;
|
||||
|
||||
/**
|
||||
* This is a timeout per table. If read of each region in the table aggregated takes longer
|
||||
* than what is configured here, we log an ERROR rather than just an INFO.
|
||||
*/
|
||||
private HashMap<String, Long> configuredReadTableTimeouts;
|
||||
|
||||
private long configuredWriteTableTimeout;
|
||||
|
||||
public RegionMonitor(Connection connection, String[] monitorTargets, boolean useRegExp,
|
||||
StdOutSink sink, ExecutorService executor, boolean writeSniffing, TableName writeTableName,
|
||||
boolean treatFailureAsError, HashMap<String, Long> configuredReadTableTimeouts, long configuredWriteTableTimeout,
|
||||
Sink sink, ExecutorService executor, boolean writeSniffing, TableName writeTableName,
|
||||
boolean treatFailureAsError, HashMap<String, Long> configuredReadTableTimeouts,
|
||||
long configuredWriteTableTimeout,
|
||||
long allowedFailures) {
|
||||
super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError, allowedFailures);
|
||||
super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError,
|
||||
allowedFailures);
|
||||
Configuration conf = connection.getConfiguration();
|
||||
this.writeSniffing = writeSniffing;
|
||||
this.writeTableName = writeTableName;
|
||||
|
@ -1054,9 +1114,12 @@ public final class Canary implements Tool {
|
|||
RegionStdOutSink regionSink = this.getSink();
|
||||
if (this.targets != null && this.targets.length > 0) {
|
||||
String[] tables = generateMonitorTables(this.targets);
|
||||
// Check to see that each table name passed in the -readTableTimeouts argument is also passed as a monitor target.
|
||||
if (! new HashSet<>(Arrays.asList(tables)).containsAll(this.configuredReadTableTimeouts.keySet())) {
|
||||
LOG.error("-readTableTimeouts can only specify read timeouts for monitor targets passed via command line.");
|
||||
// Check to see that each table name passed in the -readTableTimeouts argument is also
|
||||
// passed as a monitor target.
|
||||
if (!new HashSet<>(Arrays.asList(tables)).
|
||||
containsAll(this.configuredReadTableTimeouts.keySet())) {
|
||||
LOG.error("-readTableTimeouts can only specify read timeouts for monitor targets " +
|
||||
"passed via command line.");
|
||||
this.errorCode = USAGE_EXIT_CODE;
|
||||
return;
|
||||
}
|
||||
|
@ -1082,7 +1145,7 @@ public final class Canary implements Tool {
|
|||
// sniff canary table with write operation
|
||||
regionSink.initializeWriteLatency();
|
||||
LongAdder writeTableLatency = regionSink.getWriteLatency();
|
||||
taskFutures.addAll(Canary.sniff(admin, regionSink, admin.getTableDescriptor(writeTableName),
|
||||
taskFutures.addAll(Canary.sniff(admin, regionSink, admin.getDescriptor(writeTableName),
|
||||
executor, TaskType.WRITE, this.rawScanEnabled, writeTableLatency));
|
||||
}
|
||||
|
||||
|
@ -1099,23 +1162,26 @@ public final class Canary implements Tool {
|
|||
if (actualReadTableLatency.containsKey(tableName)) {
|
||||
Long actual = actualReadTableLatency.get(tableName).longValue();
|
||||
Long configured = entry.getValue();
|
||||
LOG.info("Read operation for " + tableName + " took " + actual +
|
||||
" ms. The configured read timeout was " + configured + " ms.");
|
||||
if (actual > configured) {
|
||||
LOG.error("Read operation for " + tableName + " exceeded the configured read timeout.");
|
||||
LOG.error("Read operation for {} took {}ms (Configured read timeout {}ms.",
|
||||
tableName, actual, configured);
|
||||
} else {
|
||||
LOG.info("Read operation for {} took {}ms (Configured read timeout {}ms.",
|
||||
tableName, actual, configured);
|
||||
}
|
||||
} else {
|
||||
LOG.error("Read operation for " + tableName + " failed!");
|
||||
LOG.error("Read operation for {} failed!", tableName);
|
||||
}
|
||||
}
|
||||
if (this.writeSniffing) {
|
||||
String writeTableStringName = this.writeTableName.getNameAsString();
|
||||
long actualWriteLatency = regionSink.getWriteLatency().longValue();
|
||||
LOG.info("Write operation for " + writeTableStringName + " took " + actualWriteLatency + " ms. The configured write timeout was " +
|
||||
this.configuredWriteTableTimeout + " ms.");
|
||||
LOG.info("Write operation for {} took {}ms. Configured write timeout {}ms.",
|
||||
writeTableStringName, actualWriteLatency, this.configuredWriteTableTimeout);
|
||||
// Check that the writeTable write operation latency does not exceed the configured timeout.
|
||||
if (actualWriteLatency > this.configuredWriteTableTimeout) {
|
||||
LOG.error("Write operation for " + writeTableStringName + " exceeded the configured write timeout.");
|
||||
LOG.error("Write operation for {} exceeded the configured write timeout.",
|
||||
writeTableStringName);
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
|
@ -1123,31 +1189,32 @@ public final class Canary implements Tool {
|
|||
this.errorCode = ERROR_EXIT_CODE;
|
||||
} finally {
|
||||
this.done = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
this.done = true;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return List of tables to use in test.
|
||||
*/
|
||||
private String[] generateMonitorTables(String[] monitorTargets) throws IOException {
|
||||
String[] returnTables = null;
|
||||
|
||||
if (this.useRegExp) {
|
||||
Pattern pattern = null;
|
||||
HTableDescriptor[] tds = null;
|
||||
TableDescriptor[] tds = null;
|
||||
Set<String> tmpTables = new TreeSet<>();
|
||||
try {
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.debug(String.format("reading list of tables"));
|
||||
}
|
||||
LOG.debug(String.format("reading list of tables"));
|
||||
tds = this.admin.listTables(pattern);
|
||||
if (tds == null) {
|
||||
tds = new HTableDescriptor[0];
|
||||
tds = new TableDescriptor[0];
|
||||
}
|
||||
for (String monitorTarget : monitorTargets) {
|
||||
pattern = Pattern.compile(monitorTarget);
|
||||
for (HTableDescriptor td : tds) {
|
||||
if (pattern.matcher(td.getNameAsString()).matches()) {
|
||||
tmpTables.add(td.getNameAsString());
|
||||
for (TableDescriptor td : tds) {
|
||||
if (pattern.matcher(td.getTableName().getNameAsString()).matches()) {
|
||||
tmpTables.add(td.getTableName().getNameAsString());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1172,18 +1239,19 @@ public final class Canary implements Tool {
|
|||
}
|
||||
|
||||
/*
|
||||
* canary entry point to monitor all the tables.
|
||||
* Canary entry point to monitor all the tables.
|
||||
*/
|
||||
private List<Future<Void>> sniff(TaskType taskType, RegionStdOutSink regionSink) throws Exception {
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.debug(String.format("reading list of tables"));
|
||||
}
|
||||
private List<Future<Void>> sniff(TaskType taskType, RegionStdOutSink regionSink)
|
||||
throws Exception {
|
||||
LOG.debug("Reading list of tables");
|
||||
List<Future<Void>> taskFutures = new LinkedList<>();
|
||||
for (HTableDescriptor table : admin.listTables()) {
|
||||
if (admin.isTableEnabled(table.getTableName())
|
||||
&& (!table.getTableName().equals(writeTableName))) {
|
||||
LongAdder readLatency = regionSink.initializeAndGetReadLatencyForTable(table.getNameAsString());
|
||||
taskFutures.addAll(Canary.sniff(admin, sink, table, executor, taskType, this.rawScanEnabled, readLatency));
|
||||
for (TableDescriptor td: admin.listTableDescriptors()) {
|
||||
if (admin.isTableEnabled(td.getTableName()) &&
|
||||
(!td.getTableName().equals(writeTableName))) {
|
||||
LongAdder readLatency =
|
||||
regionSink.initializeAndGetReadLatencyForTable(td.getTableName().getNameAsString());
|
||||
taskFutures.addAll(Canary.sniff(admin, sink, td, executor, taskType, this.rawScanEnabled,
|
||||
readLatency));
|
||||
}
|
||||
}
|
||||
return taskFutures;
|
||||
|
@ -1231,11 +1299,10 @@ public final class Canary implements Tool {
|
|||
|
||||
private void createWriteTable(int numberOfServers) throws IOException {
|
||||
int numberOfRegions = (int)(numberOfServers * regionsLowerLimit);
|
||||
LOG.info("Number of live regionservers: " + numberOfServers + ", "
|
||||
+ "pre-splitting the canary table into " + numberOfRegions + " regions "
|
||||
+ "(current lower limit of regions per server is " + regionsLowerLimit
|
||||
+ " and you can change it by config: "
|
||||
+ HConstants.HBASE_CANARY_WRITE_PERSERVER_REGIONS_LOWERLIMIT_KEY + " )");
|
||||
LOG.info("Number of live regionservers {}, pre-splitting the canary table into {} regions " +
|
||||
"(current lower limit of regions per server is {} and you can change it with config {}).",
|
||||
numberOfServers, numberOfRegions, regionsLowerLimit,
|
||||
HConstants.HBASE_CANARY_WRITE_PERSERVER_REGIONS_LOWERLIMIT_KEY);
|
||||
HTableDescriptor desc = new HTableDescriptor(writeTableName);
|
||||
HColumnDescriptor family = new HColumnDescriptor(CANARY_TABLE_FAMILY_NAME);
|
||||
family.setMaxVersions(1);
|
||||
|
@ -1252,59 +1319,40 @@ public final class Canary implements Tool {
|
|||
* @throws Exception
|
||||
*/
|
||||
private static List<Future<Void>> sniff(final Admin admin, final Sink sink, String tableName,
|
||||
ExecutorService executor, TaskType taskType, boolean rawScanEnabled, LongAdder readLatency) throws Exception {
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.debug(String.format("checking table is enabled and getting table descriptor for table %s",
|
||||
tableName));
|
||||
}
|
||||
ExecutorService executor, TaskType taskType, boolean rawScanEnabled, LongAdder readLatency)
|
||||
throws Exception {
|
||||
LOG.debug("Checking table is enabled and getting table descriptor for table {}", tableName);
|
||||
if (admin.isTableEnabled(TableName.valueOf(tableName))) {
|
||||
return Canary.sniff(admin, sink, admin.getTableDescriptor(TableName.valueOf(tableName)),
|
||||
return Canary.sniff(admin, sink, admin.getDescriptor(TableName.valueOf(tableName)),
|
||||
executor, taskType, rawScanEnabled, readLatency);
|
||||
} else {
|
||||
LOG.warn(String.format("Table %s is not enabled", tableName));
|
||||
LOG.warn("Table {} is not enabled", tableName);
|
||||
}
|
||||
return new LinkedList<>();
|
||||
}
|
||||
|
||||
/*
|
||||
* Loops over regions that owns this table, and output some information about the state.
|
||||
* Loops over regions of this table, and outputs information about the state.
|
||||
*/
|
||||
private static List<Future<Void>> sniff(final Admin admin, final Sink sink,
|
||||
HTableDescriptor tableDesc, ExecutorService executor, TaskType taskType,
|
||||
TableDescriptor tableDesc, ExecutorService executor, TaskType taskType,
|
||||
boolean rawScanEnabled, LongAdder rwLatency) throws Exception {
|
||||
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.debug(String.format("reading list of regions for table %s", tableDesc.getTableName()));
|
||||
}
|
||||
|
||||
Table table = null;
|
||||
try {
|
||||
table = admin.getConnection().getTable(tableDesc.getTableName());
|
||||
LOG.debug("Reading list of regions for table {}", tableDesc.getTableName());
|
||||
try (Table table = admin.getConnection().getTable(tableDesc.getTableName())) {
|
||||
List<RegionTask> tasks = new ArrayList<>();
|
||||
try (RegionLocator regionLocator =
|
||||
admin.getConnection().getRegionLocator(tableDesc.getTableName())) {
|
||||
for (HRegionLocation location: regionLocator.getAllRegionLocations()) {
|
||||
ServerName rs = location.getServerName();
|
||||
RegionInfo region = location.getRegion();
|
||||
tasks.add(new RegionTask(admin.getConnection(), region, rs, (RegionStdOutSink)sink,
|
||||
taskType, rawScanEnabled, rwLatency));
|
||||
}
|
||||
return executor.invokeAll(tasks);
|
||||
}
|
||||
} catch (TableNotFoundException e) {
|
||||
return new ArrayList<>();
|
||||
return Collections.EMPTY_LIST;
|
||||
}
|
||||
finally {
|
||||
if (table !=null) {
|
||||
table.close();
|
||||
}
|
||||
}
|
||||
|
||||
List<RegionTask> tasks = new ArrayList<>();
|
||||
RegionLocator regionLocator = null;
|
||||
try {
|
||||
regionLocator = admin.getConnection().getRegionLocator(tableDesc.getTableName());
|
||||
for (HRegionLocation location : regionLocator.getAllRegionLocations()) {
|
||||
ServerName rs = location.getServerName();
|
||||
RegionInfo region = location.getRegionInfo();
|
||||
tasks.add(new RegionTask(admin.getConnection(), region, rs, (RegionStdOutSink) sink, taskType, rawScanEnabled,
|
||||
rwLatency));
|
||||
}
|
||||
} finally {
|
||||
if (regionLocator != null) {
|
||||
regionLocator.close();
|
||||
}
|
||||
}
|
||||
return executor.invokeAll(tasks);
|
||||
}
|
||||
|
||||
// monitor for zookeeper mode
|
||||
|
@ -1314,8 +1362,9 @@ public final class Canary implements Tool {
|
|||
private final int timeout;
|
||||
|
||||
protected ZookeeperMonitor(Connection connection, String[] monitorTargets, boolean useRegExp,
|
||||
StdOutSink sink, ExecutorService executor, boolean treatFailureAsError, long allowedFailures) {
|
||||
super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError, allowedFailures);
|
||||
Sink sink, ExecutorService executor, boolean treatFailureAsError, long allowedFailures) {
|
||||
super(connection, monitorTargets, useRegExp,
|
||||
sink, executor, treatFailureAsError, allowedFailures);
|
||||
Configuration configuration = connection.getConfiguration();
|
||||
znode =
|
||||
configuration.get(ZOOKEEPER_ZNODE_PARENT,
|
||||
|
@ -1374,15 +1423,17 @@ public final class Canary implements Tool {
|
|||
}
|
||||
|
||||
|
||||
// a monitor for regionserver mode
|
||||
/**
|
||||
* A monitor for regionserver mode
|
||||
*/
|
||||
private static class RegionServerMonitor extends Monitor {
|
||||
|
||||
private boolean allRegions;
|
||||
|
||||
public RegionServerMonitor(Connection connection, String[] monitorTargets, boolean useRegExp,
|
||||
StdOutSink sink, ExecutorService executor, boolean allRegions,
|
||||
Sink sink, ExecutorService executor, boolean allRegions,
|
||||
boolean treatFailureAsError, long allowedFailures) {
|
||||
super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError, allowedFailures);
|
||||
super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError,
|
||||
allowedFailures);
|
||||
this.allRegions = allRegions;
|
||||
}
|
||||
|
||||
|
@ -1413,10 +1464,7 @@ public final class Canary implements Tool {
|
|||
private boolean checkNoTableNames() {
|
||||
List<String> foundTableNames = new ArrayList<>();
|
||||
TableName[] tableNames = null;
|
||||
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.debug(String.format("reading list of tables"));
|
||||
}
|
||||
LOG.debug("Reading list of tables");
|
||||
try {
|
||||
tableNames = this.admin.listTableNames();
|
||||
} catch (IOException e) {
|
||||
|
@ -1452,7 +1500,7 @@ public final class Canary implements Tool {
|
|||
AtomicLong successes = new AtomicLong(0);
|
||||
successMap.put(serverName, successes);
|
||||
if (entry.getValue().isEmpty()) {
|
||||
LOG.error(String.format("Regionserver not serving any regions - %s", serverName));
|
||||
LOG.error("Regionserver not serving any regions - {}", serverName);
|
||||
} else if (this.allRegions) {
|
||||
for (RegionInfo region : entry.getValue()) {
|
||||
tasks.add(new RegionServerTask(this.connection,
|
||||
|
@ -1483,8 +1531,8 @@ public final class Canary implements Tool {
|
|||
if (this.allRegions) {
|
||||
for (Map.Entry<String, List<RegionInfo>> entry : rsAndRMap.entrySet()) {
|
||||
String serverName = entry.getKey();
|
||||
LOG.info("Successfully read " + successMap.get(serverName) + " regions out of "
|
||||
+ entry.getValue().size() + " on regionserver:" + serverName);
|
||||
LOG.info("Successfully read {} regions out of {} on regionserver {}",
|
||||
successMap.get(serverName), entry.getValue().size(), serverName);
|
||||
}
|
||||
}
|
||||
} catch (InterruptedException e) {
|
||||
|
@ -1501,36 +1549,30 @@ public final class Canary implements Tool {
|
|||
|
||||
private Map<String, List<RegionInfo>> getAllRegionServerByName() {
|
||||
Map<String, List<RegionInfo>> rsAndRMap = new HashMap<>();
|
||||
Table table = null;
|
||||
RegionLocator regionLocator = null;
|
||||
try {
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.debug(String.format("reading list of tables and locations"));
|
||||
}
|
||||
HTableDescriptor[] tableDescs = this.admin.listTables();
|
||||
LOG.debug("Reading list of tables and locations");
|
||||
List<TableDescriptor> tableDescs = this.admin.listTableDescriptors();
|
||||
List<RegionInfo> regions = null;
|
||||
for (HTableDescriptor tableDesc : tableDescs) {
|
||||
table = this.admin.getConnection().getTable(tableDesc.getTableName());
|
||||
regionLocator = this.admin.getConnection().getRegionLocator(tableDesc.getTableName());
|
||||
|
||||
for (HRegionLocation location : regionLocator.getAllRegionLocations()) {
|
||||
ServerName rs = location.getServerName();
|
||||
String rsName = rs.getHostname();
|
||||
RegionInfo r = location.getRegionInfo();
|
||||
|
||||
if (rsAndRMap.containsKey(rsName)) {
|
||||
regions = rsAndRMap.get(rsName);
|
||||
} else {
|
||||
regions = new ArrayList<>();
|
||||
rsAndRMap.put(rsName, regions);
|
||||
for (TableDescriptor tableDesc: tableDescs) {
|
||||
try (RegionLocator regionLocator =
|
||||
this.admin.getConnection().getRegionLocator(tableDesc.getTableName())) {
|
||||
for (HRegionLocation location : regionLocator.getAllRegionLocations()) {
|
||||
ServerName rs = location.getServerName();
|
||||
String rsName = rs.getHostname();
|
||||
RegionInfo r = location.getRegion();
|
||||
if (rsAndRMap.containsKey(rsName)) {
|
||||
regions = rsAndRMap.get(rsName);
|
||||
} else {
|
||||
regions = new ArrayList<>();
|
||||
rsAndRMap.put(rsName, regions);
|
||||
}
|
||||
regions.add(r);
|
||||
}
|
||||
regions.add(r);
|
||||
}
|
||||
table.close();
|
||||
}
|
||||
|
||||
// get any live regionservers not serving any regions
|
||||
for (ServerName rs : this.admin.getClusterMetrics(EnumSet.of(Option.LIVE_SERVERS))
|
||||
for (ServerName rs: this.admin.getClusterMetrics(EnumSet.of(Option.LIVE_SERVERS))
|
||||
.getLiveServerMetrics().keySet()) {
|
||||
String rsName = rs.getHostname();
|
||||
if (!rsAndRMap.containsKey(rsName)) {
|
||||
|
@ -1538,19 +1580,9 @@ public final class Canary implements Tool {
|
|||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
String msg = "Get HTables info failed";
|
||||
LOG.error(msg, e);
|
||||
LOG.error("Get HTables info failed", e);
|
||||
this.errorCode = INIT_ERROR_EXIT_CODE;
|
||||
} finally {
|
||||
if (table != null) {
|
||||
try {
|
||||
table.close();
|
||||
} catch (IOException e) {
|
||||
LOG.warn("Close table failed", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return rsAndRMap;
|
||||
}
|
||||
|
||||
|
@ -1576,13 +1608,13 @@ public final class Canary implements Tool {
|
|||
}
|
||||
}
|
||||
if (!regExpFound) {
|
||||
LOG.info("No RegionServerInfo found, regionServerPattern:" + rsName);
|
||||
LOG.info("No RegionServerInfo found, regionServerPattern {}", rsName);
|
||||
}
|
||||
} else {
|
||||
if (fullRsAndRMap.containsKey(rsName)) {
|
||||
filteredRsAndRMap.put(rsName, fullRsAndRMap.get(rsName));
|
||||
} else {
|
||||
LOG.info("No RegionServerInfo found, regionServerName:" + rsName);
|
||||
LOG.info("No RegionServerInfo found, regionServerName {}", rsName);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1596,20 +1628,19 @@ public final class Canary implements Tool {
|
|||
public static void main(String[] args) throws Exception {
|
||||
final Configuration conf = HBaseConfiguration.create();
|
||||
|
||||
// loading the generic options to conf
|
||||
// Loading the generic options to conf
|
||||
new GenericOptionsParser(conf, args);
|
||||
|
||||
int numThreads = conf.getInt("hbase.canary.threads.num", MAX_THREADS_NUM);
|
||||
LOG.info("Number of execution threads " + numThreads);
|
||||
LOG.info("Execution thread count={}", numThreads);
|
||||
|
||||
int exitCode = 0;
|
||||
ExecutorService executor = new ScheduledThreadPoolExecutor(numThreads);
|
||||
|
||||
Class<? extends Sink> sinkClass =
|
||||
conf.getClass("hbase.canary.sink.class", RegionServerStdOutSink.class, Sink.class);
|
||||
Sink sink = ReflectionUtils.newInstance(sinkClass);
|
||||
|
||||
int exitCode = ToolRunner.run(conf, new Canary(executor, sink), args);
|
||||
executor.shutdown();
|
||||
try {
|
||||
exitCode = ToolRunner.run(conf, new Canary(executor), args);
|
||||
} finally {
|
||||
executor.shutdown();
|
||||
}
|
||||
System.exit(exitCode);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -35,6 +35,7 @@ import org.apache.hadoop.hbase.*;
|
|||
import org.apache.hadoop.hbase.HBaseClassTestRule;
|
||||
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor;
|
||||
import org.apache.hadoop.hbase.client.Put;
|
||||
import org.apache.hadoop.hbase.client.RegionInfo;
|
||||
import org.apache.hadoop.hbase.client.Table;
|
||||
import org.apache.hadoop.hbase.testclassification.MediumTests;
|
||||
import org.apache.hadoop.hbase.util.Bytes;
|
||||
|
@ -114,11 +115,11 @@ public class TestCanaryTool {
|
|||
ExecutorService executor = new ScheduledThreadPoolExecutor(1);
|
||||
Canary.RegionStdOutSink sink = spy(new Canary.RegionStdOutSink());
|
||||
Canary canary = new Canary(executor, sink);
|
||||
String[] args = { "-writeSniffing", "-t", "10000", name.getMethodName() };
|
||||
String[] args = { "-writeSniffing", "-t", "10000", tableName.getNameAsString() };
|
||||
assertEquals(0, ToolRunner.run(testingUtility.getConfiguration(), canary, args));
|
||||
assertEquals("verify no read error count", 0, canary.getReadFailures().size());
|
||||
assertEquals("verify no write error count", 0, canary.getWriteFailures().size());
|
||||
verify(sink, atLeastOnce()).publishReadTiming(isA(ServerName.class), isA(HRegionInfo.class),
|
||||
verify(sink, atLeastOnce()).publishReadTiming(isA(ServerName.class), isA(RegionInfo.class),
|
||||
isA(ColumnFamilyDescriptor.class), anyLong());
|
||||
}
|
||||
|
||||
|
@ -144,7 +145,8 @@ public class TestCanaryTool {
|
|||
Canary canary = new Canary(executor, sink);
|
||||
String configuredTimeoutStr = tableNames[0].getNameAsString() + "=" + Long.MAX_VALUE + "," +
|
||||
tableNames[1].getNameAsString() + "=0";
|
||||
String[] args = { "-readTableTimeouts", configuredTimeoutStr, name.getMethodName() + "1", name.getMethodName() + "2"};
|
||||
String[] args = {"-readTableTimeouts", configuredTimeoutStr, name.getMethodName() + "1",
|
||||
name.getMethodName() + "2"};
|
||||
assertEquals(0, ToolRunner.run(testingUtility.getConfiguration(), canary, args));
|
||||
verify(sink, times(tableNames.length)).initializeAndGetReadLatencyForTable(isA(String.class));
|
||||
for (int i=0; i<2; i++) {
|
||||
|
@ -231,7 +233,7 @@ public class TestCanaryTool {
|
|||
conf.setBoolean(HConstants.HBASE_CANARY_READ_RAW_SCAN_KEY, true);
|
||||
assertEquals(0, ToolRunner.run(conf, canary, args));
|
||||
verify(sink, atLeastOnce())
|
||||
.publishReadTiming(isA(ServerName.class), isA(HRegionInfo.class),
|
||||
.publishReadTiming(isA(ServerName.class), isA(RegionInfo.class),
|
||||
isA(ColumnFamilyDescriptor.class), anyLong());
|
||||
assertEquals("verify no read error count", 0, canary.getReadFailures().size());
|
||||
}
|
||||
|
|
|
@ -82,45 +82,54 @@ Others, such as `hbase shell` (<<shell>>), `hbase upgrade` (<<upgrading>>), and
|
|||
|
||||
=== Canary
|
||||
|
||||
There is a Canary class can help users to canary-test the HBase cluster status, with every column-family for every regions or RegionServer's granularity.
|
||||
To see the usage, use the `-help` parameter.
|
||||
The Canary tool can help users "canary-test" the HBase cluster status.
|
||||
The default "region mode" fetches a row from every column-family of every regions.
|
||||
In "regionserver mode", the Canary tool will fetch a row from a random
|
||||
region on each of the cluster's RegionServers. In "zookeeper mode", the
|
||||
Canary will read the root znode on each member of the zookeeper ensemble.
|
||||
|
||||
To see usage, pass the `-help` parameter (if you pass no
|
||||
parameters, the Canary tool starts executing in the default
|
||||
region "mode" fetching a row from every region in the cluster).
|
||||
|
||||
----
|
||||
$ ${HBASE_HOME}/bin/hbase canary -help
|
||||
2018-10-16 13:11:27,037 INFO [main] tool.Canary: Execution thread count=16
|
||||
Usage: canary [OPTIONS] [<TABLE1> [<TABLE2]...] | [<REGIONSERVER1> [<REGIONSERVER2]..]
|
||||
Where [OPTIONS] are:
|
||||
-h,-help show this help and exit.
|
||||
-regionserver set 'regionserver mode'; gets row from random region on server
|
||||
-allRegions get from ALL regions when 'regionserver mode', not just random one.
|
||||
-zookeeper set 'zookeeper mode'; grab zookeeper.znode.parent on each ensemble member
|
||||
-daemon continuous check at defined intervals.
|
||||
-interval <N> interval between checks in seconds
|
||||
-e consider table/regionserver argument as regular expression
|
||||
-f <B> exit on first error; default=true
|
||||
-failureAsError treat read/write failure as error
|
||||
-t <N> timeout for canary-test run; default=600000ms
|
||||
-writeSniffing enable write sniffing
|
||||
-writeTable the table used for write sniffing; default=hbase:canary
|
||||
-writeTableTimeout <N> timeout for writeTable; default=600000ms
|
||||
-readTableTimeouts <tableName>=<read timeout>,<tableName>=<read timeout>,...
|
||||
comma-separated list of table read timeouts (no spaces);
|
||||
logs 'ERROR' if takes longer. default=600000ms
|
||||
-permittedZookeeperFailures <N> Ignore first N failures attempting to
|
||||
connect to individual zookeeper nodes in ensemble
|
||||
|
||||
Usage: hbase canary [opts] [table1 [table2]...] | [regionserver1 [regionserver2]..]
|
||||
where [opts] are:
|
||||
-help Show this help and exit.
|
||||
-regionserver replace the table argument to regionserver,
|
||||
which means to enable regionserver mode
|
||||
-allRegions Tries all regions on a regionserver,
|
||||
only works in regionserver mode.
|
||||
-zookeeper Tries to grab zookeeper.znode.parent
|
||||
on each zookeeper instance
|
||||
-daemon Continuous check at defined intervals.
|
||||
-interval <N> Interval between checks (sec)
|
||||
-e Use table/regionserver as regular expression
|
||||
which means the table/regionserver is regular expression pattern
|
||||
-f <B> stop whole program if first error occurs, default is true
|
||||
-t <N> timeout for a check, default is 600000 (millisecs)
|
||||
-writeTableTimeout <N> write timeout for the writeTable, default is 600000 (millisecs)
|
||||
-readTableTimeouts <tableName>=<read timeout>,<tableName>=<read timeout>, ... comma-separated list of read timeouts per table (no spaces), default is 600000 (millisecs)
|
||||
-writeSniffing enable the write sniffing in canary
|
||||
-treatFailureAsError treats read / write failure as error
|
||||
-writeTable The table used for write sniffing. Default is hbase:canary
|
||||
-Dhbase.canary.read.raw.enabled=<true/false> Use this flag to enable or disable raw scan during read canary test Default is false and raw is not enabled during scan
|
||||
-D<configProperty>=<value> assigning or override the configuration params
|
||||
-D<configProperty>=<value> to assign or override configuration params
|
||||
-Dhbase.canary.read.raw.enabled=<true/false> Set to enable/disable raw scan; default=false
|
||||
|
||||
Canary runs in one of three modes: region (default), regionserver, or zookeeper.
|
||||
To sniff/probe all regions, pass no arguments.
|
||||
To sniff/probe all regions of a table, pass tablename.
|
||||
To sniff/probe regionservers, pass -regionserver, etc.
|
||||
See http://hbase.apache.org/book.html#_canary for Canary documentation.
|
||||
----
|
||||
|
||||
[NOTE]
|
||||
The `Sink` class is instantiated using the `hbase.canary.sink.class` configuration property which
|
||||
will also determine the used Monitor class. In the absence of this property RegionServerStdOutSink
|
||||
will be used. You need to use the Sink according to the passed parameters to the _canary_ command.
|
||||
As an example you have to set `hbase.canary.sink.class` property to
|
||||
`org.apache.hadoop.hbase.tool.Canary$RegionStdOutSink` for using table parameters.
|
||||
The `Sink` class is instantiated using the `hbase.canary.sink.class` configuration property.
|
||||
|
||||
This tool will return non zero error codes to user for collaborating with other monitoring tools, such as Nagios.
|
||||
The error code definitions are:
|
||||
This tool will return non zero error codes to user for collaborating with other monitoring tools,
|
||||
such as Nagios. The error code definitions are:
|
||||
|
||||
[source,java]
|
||||
----
|
||||
|
@ -131,9 +140,9 @@ private static final int ERROR_EXIT_CODE = 4;
|
|||
private static final int FAILURE_EXIT_CODE = 5;
|
||||
----
|
||||
|
||||
Here are some examples based on the following given case.
|
||||
There are two Table objects called test-01 and test-02, they have two column family cf1 and cf2 respectively, and deployed on the 3 RegionServers.
|
||||
see following table.
|
||||
Here are some examples based on the following given case: given two Table objects called test-01
|
||||
and test-02 each with two column family cf1 and cf2 respectively, deployed on 3 RegionServers.
|
||||
See the following table.
|
||||
|
||||
[cols="1,1,1", options="header"]
|
||||
|===
|
||||
|
@ -145,7 +154,7 @@ see following table.
|
|||
| rs3 | r2 | r1
|
||||
|===
|
||||
|
||||
Following are some examples based on the previous given case.
|
||||
Following are some example outputs based on the previous given case.
|
||||
|
||||
==== Canary test for every column family (store) of every region of every table
|
||||
|
||||
|
@ -163,12 +172,13 @@ $ ${HBASE_HOME}/bin/hbase canary
|
|||
13/12/09 03:26:32 INFO tool.Canary: read from region test-02,0004883,1386559511167.cbda32d5e2e276520712d84eaaa29d84. column family cf2 in 8ms
|
||||
----
|
||||
|
||||
So you can see, table test-01 has two regions and two column families, so the Canary tool will pick 4 small piece of data from 4 (2 region * 2 store) different stores.
|
||||
This is a default behavior of the this tool does.
|
||||
So you can see, table test-01 has two regions and two column families, so the Canary tool in the
|
||||
default "region mode" will pick 4 small piece of data from 4 (2 region * 2 store) different stores.
|
||||
This is a default behavior.
|
||||
|
||||
==== Canary test for every column family (store) of every region of specific table(s)
|
||||
==== Canary test for every column family (store) of every region of a specific table(s)
|
||||
|
||||
You can also test one or more specific tables.
|
||||
You can also test one or more specific tables by passing table names.
|
||||
|
||||
----
|
||||
$ ${HBASE_HOME}/bin/hbase canary test-01 test-02
|
||||
|
@ -176,7 +186,9 @@ $ ${HBASE_HOME}/bin/hbase canary test-01 test-02
|
|||
|
||||
==== Canary test with RegionServer granularity
|
||||
|
||||
This will pick one small piece of data from each RegionServer, and can also put your RegionServer name as input options for canary-test specific RegionServer.
|
||||
In "regionserver mode", the Canary tool will pick one small piece of data
|
||||
from each RegionServer (You can also pass one or more RegionServer names as arguments
|
||||
to the canary-test when in "regionserver mode").
|
||||
|
||||
----
|
||||
$ ${HBASE_HOME}/bin/hbase canary -regionserver
|
||||
|
@ -188,22 +200,25 @@ $ ${HBASE_HOME}/bin/hbase canary -regionserver
|
|||
|
||||
==== Canary test with regular expression pattern
|
||||
|
||||
This will test both table test-01 and test-02.
|
||||
You can pass regexes for table names when in "region mode" or for servernames when
|
||||
in "regionserver mode". The below will test both table test-01 and test-02.
|
||||
|
||||
----
|
||||
$ ${HBASE_HOME}/bin/hbase canary -e test-0[1-2]
|
||||
----
|
||||
|
||||
==== Run canary test as daemon mode
|
||||
==== Run canary test as a "daemon"
|
||||
|
||||
Run repeatedly with interval defined in option `-interval` whose default value is 60 seconds.
|
||||
This daemon will stop itself and return non-zero error code if any error occurs, due to the default value of option -f is true.
|
||||
Run repeatedly with an interval defined via the option `-interval` (default value is 60 seconds).
|
||||
This daemon will stop itself and return non-zero error code if any error occur. To have
|
||||
the daemon keep running across errors, pass the -f flag with its value set to false
|
||||
(see usage above).
|
||||
|
||||
----
|
||||
$ ${HBASE_HOME}/bin/hbase canary -daemon
|
||||
----
|
||||
|
||||
Run repeatedly with 5 second intervals and will not stop itself even if errors occur in the test.
|
||||
To run repeatedly with 5 second intervals and not stop on errors, do the following.
|
||||
|
||||
----
|
||||
$ ${HBASE_HOME}/bin/hbase canary -daemon -interval 5 -f false
|
||||
|
@ -211,9 +226,11 @@ $ ${HBASE_HOME}/bin/hbase canary -daemon -interval 5 -f false
|
|||
|
||||
==== Force timeout if canary test stuck
|
||||
|
||||
In some cases the request is stuck and no response is sent back to the client. This can happen with dead RegionServers which the master has not yet noticed.
|
||||
Because of this we provide a timeout option to kill the canary test and return a non-zero error code.
|
||||
This run sets the timeout value to 60 seconds, the default value is 600 seconds.
|
||||
In some cases the request is stuck and no response is sent back to the client. This
|
||||
can happen with dead RegionServers which the master has not yet noticed.
|
||||
Because of this we provide a timeout option to kill the canary test and return a
|
||||
non-zero error code. The below sets the timeout value to 60 seconds (the default value
|
||||
is 600 seconds).
|
||||
|
||||
----
|
||||
$ ${HBASE_HOME}/bin/hbase canary -t 60000
|
||||
|
@ -221,36 +238,37 @@ $ ${HBASE_HOME}/bin/hbase canary -t 60000
|
|||
|
||||
==== Enable write sniffing in canary
|
||||
|
||||
By default, the canary tool only check the read operations, it's hard to find the problem in the
|
||||
write path. To enable the write sniffing, you can run canary with the `-writeSniffing` option.
|
||||
When the write sniffing is enabled, the canary tool will create an hbase table and make sure the
|
||||
regions of the table distributed on all region servers. In each sniffing period, the canary will
|
||||
try to put data to these regions to check the write availability of each region server.
|
||||
By default, the canary tool only checks read operations. To enable the write sniffing,
|
||||
you can run the canary with the `-writeSniffing` option set. When write sniffing is
|
||||
enabled, the canary tool will create an hbase table and make sure the
|
||||
regions of the table are distributed to all region servers. In each sniffing period,
|
||||
the canary will try to put data to these regions to check the write availability of
|
||||
each region server.
|
||||
----
|
||||
$ ${HBASE_HOME}/bin/hbase canary -writeSniffing
|
||||
----
|
||||
|
||||
The default write table is `hbase:canary` and can be specified by the option `-writeTable`.
|
||||
The default write table is `hbase:canary` and can be specified with the option `-writeTable`.
|
||||
----
|
||||
$ ${HBASE_HOME}/bin/hbase canary -writeSniffing -writeTable ns:canary
|
||||
----
|
||||
|
||||
The default value size of each put is 10 bytes and you can set it by the config key:
|
||||
The default value size of each put is 10 bytes. You can set it via the config key:
|
||||
`hbase.canary.write.value.size`.
|
||||
|
||||
==== Treat read / write failure as error
|
||||
|
||||
By default, the canary tool only logs read failure, due to e.g. RetriesExhaustedException,
|
||||
while returning normal exit code. To treat read / write failure as error, you can run canary
|
||||
with the `-treatFailureAsError` option. When enabled, read / write failure would result in error
|
||||
exit code.
|
||||
By default, the canary tool only logs read failures -- due to e.g. RetriesExhaustedException, etc. --
|
||||
and will return the 'normal' exit code. To treat read/write failure as errors, you can run canary
|
||||
with the `-treatFailureAsError` option. When enabled, read/write failures will result in an
|
||||
error exit code.
|
||||
----
|
||||
$ ${HBASE_HOME}/bin/hbase canary -treatFailureAsError
|
||||
----
|
||||
|
||||
==== Running Canary in a Kerberos-enabled Cluster
|
||||
|
||||
To run Canary in a Kerberos-enabled cluster, configure the following two properties in _hbase-site.xml_:
|
||||
To run the Canary in a Kerberos-enabled cluster, configure the following two properties in _hbase-site.xml_:
|
||||
|
||||
* `hbase.client.keytab.file`
|
||||
* `hbase.client.kerberos.principal`
|
||||
|
|
Loading…
Reference in New Issue