From 5cca61c4d0e7f1cf03fb7a80c7611bde4a49214b Mon Sep 17 00:00:00 2001 From: David Manning Date: Wed, 29 Aug 2018 12:06:59 -0700 Subject: [PATCH] HBASE-21126 Configurable number of allowed failures for ZooKeeper Canary Signed-off-by: Josh Elser --- .../org/apache/hadoop/hbase/tool/Canary.java | 52 +++++++++++++++---- .../hadoop/hbase/tool/TestCanaryTool.java | 35 ++++++++----- 2 files changed, 63 insertions(+), 24 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java index ae6324f71b6..7a549fce22c 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java @@ -586,6 +586,7 @@ public final class Canary implements Tool { private boolean failOnError = true; private boolean regionServerMode = false; private boolean zookeeperMode = false; + private long permittedFailures = 0; private boolean regionServerAllRegions = false; private boolean writeSniffing = false; private long configuredWriteTableTimeout = DEFAULT_TIMEOUT; @@ -729,6 +730,19 @@ public final class Canary implements Tool { } this.configuredReadTableTimeouts.put(nameTimeout[0], timeoutVal); } + } else if (cmd.equals("-permittedZookeeperFailures")) { + i++; + + if (i == args.length) { + System.err.println("-permittedZookeeperFailures needs a numeric value argument."); + printUsageAndExit(); + } + try { + this.permittedFailures = Long.parseLong(args[i]); + } catch (NumberFormatException e) { + System.err.println("-permittedZookeeperFailures needs a numeric value argument."); + printUsageAndExit(); + } } else { // no options match System.err.println(cmd + " options is invalid."); @@ -750,6 +764,10 @@ public final class Canary implements Tool { printUsageAndExit(); } } + if (this.permittedFailures != 0 && !this.zookeeperMode) { + System.err.println("-permittedZookeeperFailures requires -zookeeper mode."); + printUsageAndExit(); + } if (!this.configuredReadTableTimeouts.isEmpty() && (this.regionServerMode || this.zookeeperMode)) { System.err.println("-readTableTimeouts can only be configured in region mode."); printUsageAndExit(); @@ -847,6 +865,8 @@ public final class Canary implements Tool { System.err.println(" only works in regionserver mode."); System.err.println(" -zookeeper Tries to grab zookeeper.znode.parent "); System.err.println(" on each zookeeper instance"); + System.err.println(" -permittedZookeeperFailures Ignore first N failures when attempting to "); + System.err.println(" connect to individual zookeeper nodes in the ensemble"); System.err.println(" -daemon Continuous check at defined intervals."); System.err.println(" -interval Interval between checks (sec)"); System.err.println(" -e Use table/regionserver as regular expression"); @@ -889,17 +909,18 @@ public final class Canary implements Tool { monitor = new RegionServerMonitor(connection, monitorTargets, this.useRegExp, (StdOutSink) this.sink, this.executor, this.regionServerAllRegions, - this.treatFailureAsError); + this.treatFailureAsError, this.permittedFailures); } else if (this.sink instanceof ZookeeperStdOutSink || this.zookeeperMode) { monitor = new ZookeeperMonitor(connection, monitorTargets, this.useRegExp, - (StdOutSink) this.sink, this.executor, this.treatFailureAsError); + (StdOutSink) this.sink, this.executor, this.treatFailureAsError, + this.permittedFailures); } else { monitor = new RegionMonitor(connection, monitorTargets, this.useRegExp, (StdOutSink) this.sink, this.executor, this.writeSniffing, this.writeTableName, this.treatFailureAsError, this.configuredReadTableTimeouts, - this.configuredWriteTableTimeout); + this.configuredWriteTableTimeout, this.permittedFailures); } return monitor; } @@ -916,6 +937,7 @@ public final class Canary implements Tool { protected boolean done = false; protected int errorCode = 0; + protected long allowedFailures = 0; protected Sink sink; protected ExecutorService executor; @@ -932,7 +954,8 @@ public final class Canary implements Tool { return true; } if (treatFailureAsError && - (sink.getReadFailureCount() > 0 || sink.getWriteFailureCount() > 0)) { + (sink.getReadFailureCount() > allowedFailures || sink.getWriteFailureCount() > allowedFailures)) { + LOG.error("Too many failures detected, treating failure as error, failing the Canary."); errorCode = FAILURE_EXIT_CODE; return true; } @@ -945,7 +968,7 @@ public final class Canary implements Tool { } protected Monitor(Connection connection, String[] monitorTargets, boolean useRegExp, Sink sink, - ExecutorService executor, boolean treatFailureAsError) { + ExecutorService executor, boolean treatFailureAsError, long allowedFailures) { if (null == connection) throw new IllegalArgumentException("connection shall not be null"); this.connection = connection; @@ -954,6 +977,7 @@ public final class Canary implements Tool { this.treatFailureAsError = treatFailureAsError; this.sink = sink; this.executor = executor; + this.allowedFailures = allowedFailures; } @Override @@ -995,8 +1019,9 @@ public final class Canary implements Tool { public RegionMonitor(Connection connection, String[] monitorTargets, boolean useRegExp, StdOutSink sink, ExecutorService executor, boolean writeSniffing, TableName writeTableName, - boolean treatFailureAsError, HashMap configuredReadTableTimeouts, long configuredWriteTableTimeout) { - super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError); + boolean treatFailureAsError, HashMap configuredReadTableTimeouts, long configuredWriteTableTimeout, + long allowedFailures) { + super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError, allowedFailures); Configuration conf = connection.getConfiguration(); this.writeSniffing = writeSniffing; this.writeTableName = writeTableName; @@ -1289,8 +1314,8 @@ public final class Canary implements Tool { private final int timeout; protected ZookeeperMonitor(Connection connection, String[] monitorTargets, boolean useRegExp, - StdOutSink sink, ExecutorService executor, boolean treatFailureAsError) { - super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError); + StdOutSink sink, ExecutorService executor, boolean treatFailureAsError, long allowedFailures) { + super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError, allowedFailures); Configuration configuration = connection.getConfiguration(); znode = configuration.get(ZOOKEEPER_ZNODE_PARENT, @@ -1303,6 +1328,11 @@ public final class Canary implements Tool { for (InetSocketAddress server : parser.getServerAddresses()) { hosts.add(server.toString()); } + if (allowedFailures > (hosts.size() - 1) / 2) { + LOG.warn("Confirm allowable number of failed ZooKeeper nodes, as quorum will " + + "already be lost. Setting of {} failures is unexpected for {} ensemble size.", + allowedFailures, hosts.size()); + } } @Override public void run() { @@ -1351,8 +1381,8 @@ public final class Canary implements Tool { public RegionServerMonitor(Connection connection, String[] monitorTargets, boolean useRegExp, StdOutSink sink, ExecutorService executor, boolean allRegions, - boolean treatFailureAsError) { - super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError); + boolean treatFailureAsError, long allowedFailures) { + super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError, allowedFailures); this.allRegions = allRegions; } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/tool/TestCanaryTool.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/tool/TestCanaryTool.java index e713a5af81e..cdbf42623c7 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/tool/TestCanaryTool.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/tool/TestCanaryTool.java @@ -90,20 +90,14 @@ public class TestCanaryTool { @Test public void testBasicZookeeperCanaryWorks() throws Exception { - Integer port = - Iterables.getOnlyElement(testingUtility.getZkCluster().getClientPortList(), null); - testingUtility.getConfiguration().set(HConstants.ZOOKEEPER_QUORUM, - "localhost:" + port + "/hbase"); - ExecutorService executor = new ScheduledThreadPoolExecutor(2); - Canary.ZookeeperStdOutSink sink = spy(new Canary.ZookeeperStdOutSink()); - Canary canary = new Canary(executor, sink); - String[] args = { "-t", "10000", "-zookeeper" }; - assertEquals(0, ToolRunner.run(testingUtility.getConfiguration(), canary, args)); + final String[] args = { "-t", "10000", "-zookeeper" }; + testZookeeperCanaryWithArgs(args); + } - String baseZnode = testingUtility.getConfiguration() - .get(HConstants.ZOOKEEPER_ZNODE_PARENT, HConstants.DEFAULT_ZOOKEEPER_ZNODE_PARENT); - verify(sink, atLeastOnce()) - .publishReadTiming(eq(baseZnode), eq("localhost:" + port), anyLong()); + @Test + public void testZookeeperCanaryPermittedFailuresArgumentWorks() throws Exception { + final String[] args = { "-t", "10000", "-zookeeper", "-treatFailureAsError", "-permittedZookeeperFailures", "1" }; + testZookeeperCanaryWithArgs(args); } @Test @@ -250,4 +244,19 @@ public class TestCanaryTool { assertEquals("verify no read error count", 0, canary.getReadFailures().size()); } + private void testZookeeperCanaryWithArgs(String[] args) throws Exception { + Integer port = + Iterables.getOnlyElement(testingUtility.getZkCluster().getClientPortList(), null); + testingUtility.getConfiguration().set(HConstants.ZOOKEEPER_QUORUM, + "localhost:" + port + "/hbase"); + ExecutorService executor = new ScheduledThreadPoolExecutor(2); + Canary.ZookeeperStdOutSink sink = spy(new Canary.ZookeeperStdOutSink()); + Canary canary = new Canary(executor, sink); + assertEquals(0, ToolRunner.run(testingUtility.getConfiguration(), canary, args)); + + String baseZnode = testingUtility.getConfiguration() + .get(HConstants.ZOOKEEPER_ZNODE_PARENT, HConstants.DEFAULT_ZOOKEEPER_ZNODE_PARENT); + verify(sink, atLeastOnce()) + .publishReadTiming(eq(baseZnode), eq("localhost:" + port), anyLong()); + } }