From 5fe081eb33caad5f76084591678fd9365949b357 Mon Sep 17 00:00:00 2001 From: tedyu Date: Wed, 10 Feb 2016 02:38:12 -0800 Subject: [PATCH] HBASE-15219 Canary tool does not return non-zero exit code when one of regions is in stuck state --- .../org/apache/hadoop/hbase/tool/Canary.java | 55 ++++++++++++++++--- src/main/asciidoc/_chapters/ops_mgt.adoc | 11 ++++ 2 files changed, 57 insertions(+), 9 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java index 055e97e7fcd..061a14fb684 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java @@ -92,9 +92,11 @@ import org.apache.hadoop.util.ToolRunner; public final class Canary implements Tool { // Sink interface used by the canary to outputs information public interface Sink { + public long getReadFailureCount(); public void publishReadFailure(HRegionInfo region, Exception e); public void publishReadFailure(HRegionInfo region, HColumnDescriptor column, Exception e); public void publishReadTiming(HRegionInfo region, HColumnDescriptor column, long msTime); + public long getWriteFailureCount(); public void publishWriteFailure(HRegionInfo region, Exception e); public void publishWriteFailure(HRegionInfo region, HColumnDescriptor column, Exception e); public void publishWriteTiming(HRegionInfo region, HColumnDescriptor column, long msTime); @@ -109,13 +111,23 @@ public final class Canary implements Tool { // Simple implementation of canary sink that allows to plot on // file or standard output timings or failures. 
public static class StdOutSink implements Sink { + protected AtomicLong readFailureCount = new AtomicLong(0), + writeFailureCount = new AtomicLong(0); + + @Override + public long getReadFailureCount() { + return readFailureCount.get(); + } + @Override public void publishReadFailure(HRegionInfo region, Exception e) { + readFailureCount.incrementAndGet(); LOG.error(String.format("read from region %s failed", region.getRegionNameAsString()), e); } @Override public void publishReadFailure(HRegionInfo region, HColumnDescriptor column, Exception e) { + readFailureCount.incrementAndGet(); LOG.error(String.format("read from region %s column family %s failed", region.getRegionNameAsString(), column.getNameAsString()), e); } @@ -126,13 +138,20 @@ public final class Canary implements Tool { region.getRegionNameAsString(), column.getNameAsString(), msTime)); } + @Override + public long getWriteFailureCount() { + return writeFailureCount.get(); + } + @Override public void publishWriteFailure(HRegionInfo region, Exception e) { + writeFailureCount.incrementAndGet(); LOG.error(String.format("write to region %s failed", region.getRegionNameAsString()), e); } @Override public void publishWriteFailure(HRegionInfo region, HColumnDescriptor column, Exception e) { + writeFailureCount.incrementAndGet(); LOG.error(String.format("write to region %s column family %s failed", region.getRegionNameAsString(), column.getNameAsString()), e); } @@ -148,6 +167,7 @@ public final class Canary implements Tool { @Override public void publishReadFailure(String table, String server) { + readFailureCount.incrementAndGet(); LOG.error(String.format("Read from table:%s on region server:%s", table, server)); } @@ -432,6 +452,7 @@ public final class Canary implements Tool { private boolean regionServerMode = false; private boolean regionServerAllRegions = false; private boolean writeSniffing = false; + private boolean treatFailureAsError = false; private TableName writeTableName = DEFAULT_WRITE_TABLE_NAME; 
private ExecutorService executor; // threads to retrieve data from regionservers @@ -495,6 +516,8 @@ public final class Canary implements Tool { this.regionServerAllRegions = true; } else if(cmd.equals("-writeSniffing")) { this.writeSniffing = true; + } else if(cmd.equals("-treatFailureAsError")) { + this.treatFailureAsError = true; } else if (cmd.equals("-e")) { this.useRegExp = true; } else if (cmd.equals("-t")) { @@ -600,7 +623,7 @@ public final class Canary implements Tool { } } - if (this.failOnError && monitor.hasError()) { + if (this.failOnError && monitor.finalCheckForErrors()) { monitorThread.interrupt(); System.exit(monitor.errorCode); } @@ -636,6 +659,7 @@ public final class Canary implements Tool { " default is true"); System.err.println(" -t timeout for a check, default is 600000 (milisecs)"); System.err.println(" -writeSniffing enable the write sniffing in canary"); + System.err.println(" -treatFailureAsError treats read / write failure as error"); System.err.println(" -writeTable The table used for write sniffing." 
+ " Default is hbase:canary"); System.err @@ -663,11 +687,12 @@ public final class Canary implements Tool { if (this.regionServerMode) { monitor = new RegionServerMonitor(connection, monitorTargets, this.useRegExp, - (ExtendedSink) this.sink, this.executor, this.regionServerAllRegions); + (ExtendedSink) this.sink, this.executor, this.regionServerAllRegions, + this.treatFailureAsError); } else { monitor = new RegionMonitor(connection, monitorTargets, this.useRegExp, this.sink, this.executor, - this.writeSniffing, this.writeTableName); + this.writeSniffing, this.writeTableName, this.treatFailureAsError); } return monitor; } @@ -679,6 +704,7 @@ public final class Canary implements Tool { protected Admin admin; protected String[] targets; protected boolean useRegExp; + protected boolean treatFailureAsError; protected boolean initialized = false; protected boolean done = false; @@ -694,18 +720,27 @@ public final class Canary implements Tool { return errorCode != 0; } + public boolean finalCheckForErrors() { + if (errorCode != 0) { + return true; + } + return treatFailureAsError && + (sink.getReadFailureCount() > 0 || sink.getWriteFailureCount() > 0); + } + @Override public void close() throws IOException { if (this.admin != null) this.admin.close(); } protected Monitor(Connection connection, String[] monitorTargets, boolean useRegExp, Sink sink, - ExecutorService executor) { + ExecutorService executor, boolean treatFailureAsError) { if (null == connection) throw new IllegalArgumentException("connection shall not be null"); this.connection = connection; this.targets = monitorTargets; this.useRegExp = useRegExp; + this.treatFailureAsError = treatFailureAsError; this.sink = sink; this.executor = executor; } @@ -745,8 +780,9 @@ public final class Canary implements Tool { private int checkPeriod; public RegionMonitor(Connection connection, String[] monitorTargets, boolean useRegExp, - Sink sink, ExecutorService executor, boolean writeSniffing, TableName writeTableName) { - 
super(connection, monitorTargets, useRegExp, sink, executor); + Sink sink, ExecutorService executor, boolean writeSniffing, TableName writeTableName, + boolean treatFailureAsError) { + super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError); Configuration conf = connection.getConfiguration(); this.writeSniffing = writeSniffing; this.writeTableName = writeTableName; @@ -1054,8 +1090,9 @@ public final class Canary implements Tool { private boolean allRegions; public RegionServerMonitor(Connection connection, String[] monitorTargets, boolean useRegExp, - ExtendedSink sink, ExecutorService executor, boolean allRegions) { - super(connection, monitorTargets, useRegExp, sink, executor); + ExtendedSink sink, ExecutorService executor, boolean allRegions, + boolean treatFailureAsError) { + super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError); this.allRegions = allRegions; } @@ -1150,7 +1187,7 @@ public final class Canary implements Tool { } } catch (InterruptedException e) { this.errorCode = ERROR_EXIT_CODE; - LOG.error("Sniff regionserver failed!", e); + LOG.error("Sniff regionserver interrupted!", e); } } diff --git a/src/main/asciidoc/_chapters/ops_mgt.adoc b/src/main/asciidoc/_chapters/ops_mgt.adoc index 3984a0da38e..fc91a18b3c8 100644 --- a/src/main/asciidoc/_chapters/ops_mgt.adoc +++ b/src/main/asciidoc/_chapters/ops_mgt.adoc @@ -93,6 +93,7 @@ Usage: bin/hbase org.apache.hadoop.hbase.tool.Canary [opts] [table1 [table2]...] -f stop whole program if first error occurs, default is true -t timeout for a check, default is 600000 (milliseconds) -writeSniffing enable the write sniffing in canary + -treatFailureAsError treats read / write failure as error -writeTable The table used for write sniffing. 
Default is hbase:canary -D= assigning or override the configuration params ---- @@ -215,6 +216,16 @@ $ ${HBASE_HOME}/bin/hbase canary -writeSniffing -writeTable ns:canary The default value size of each put is 10 bytes and you can set it by the config key: `hbase.canary.write.value.size`. +==== Treat read / write failure as error + +By default, the canary tool only logs read failures, due to e.g. RetriesExhaustedException, +while returning a normal exit code. To treat read / write failures as errors, you can run canary +with the `-treatFailureAsError` option. When enabled, a read / write failure results in an error +exit code. +---- +$ ${HBASE_HOME}/bin/hbase canary -treatFailureAsError +---- + ==== Running Canary in a Kerberos-enabled Cluster To run Canary in a Kerberos-enabled cluster, configure the following two properties in _hbase-site.xml_: