From a782531633a70394ec69f40a06eb4b94659386a1 Mon Sep 17 00:00:00 2001
From: Viraj Jasani
Date: Wed, 13 May 2020 02:38:47 +0530
Subject: [PATCH] =?UTF-8?q?HBASE-24327=20:=20Flaky=20connection=20in=20Tes?=
 =?UTF-8?q?tMasterShutdown#testMasterShutdo=E2=80=A6=20(#1690)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Bharath Vissapragada
---
 .../hbase/master/TestMasterShutdown.java      | 74 ++++++++++++-------
 1 file changed, 46 insertions(+), 28 deletions(-)

diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterShutdown.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterShutdown.java
index 7b3921e3ecb..e99e93204d1 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterShutdown.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterShutdown.java
@@ -22,6 +22,8 @@ import static org.junit.Assert.assertNotEquals;
 import static org.junit.Assert.assertNotNull;
 import java.io.IOException;
 import java.util.List;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.CompletionException;
 import java.util.concurrent.TimeUnit;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hbase.ClusterMetrics;
@@ -31,7 +33,6 @@ import org.apache.hadoop.hbase.HBaseTestingUtility;
 import org.apache.hadoop.hbase.LocalHBaseCluster;
 import org.apache.hadoop.hbase.MiniHBaseCluster;
 import org.apache.hadoop.hbase.StartMiniClusterOption;
-import org.apache.hadoop.hbase.Waiter;
 import org.apache.hadoop.hbase.testclassification.LargeTests;
 import org.apache.hadoop.hbase.testclassification.MasterTests;
 import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
@@ -50,7 +51,7 @@ public class TestMasterShutdown {
 
   @ClassRule
   public static final HBaseClassTestRule CLASS_RULE =
-      HBaseClassTestRule.forClass(TestMasterShutdown.class);
+      HBaseClassTestRule.forClass(TestMasterShutdown.class);
 
   private HBaseTestingUtility htu;
 
@@ -127,7 +128,7 @@ public class TestMasterShutdown {
   public void testMasterShutdownBeforeStartingAnyRegionServer() throws Exception {
     LocalHBaseCluster hbaseCluster = null;
     try {
-      htu = new HBaseTestingUtility(
+      htu = new HBaseTestingUtility(
         createMasterShutdownBeforeStartingAnyRegionServerConfiguration());
 
       // configure a cluster with
@@ -151,19 +152,46 @@ public class TestMasterShutdown {
       hbaseCluster = new LocalHBaseCluster(htu.getConfiguration(), options.getNumMasters(),
         options.getNumRegionServers(), options.getMasterClass(), options.getRsClass());
       final MasterThread masterThread = hbaseCluster.getMasters().get(0);
+
       masterThread.start();
-      // Switching to master registry exacerbated a race in the master bootstrap that can result
-      // in a lost shutdown command (HBASE-8422, HBASE-23836). The race is essentially because
-      // the server manager in HMaster is not initialized by the time shutdown() RPC (below) is
-      // made to the master. The suspected reason as to why it was uncommon before HBASE-18095
-      // is because the connection creation with ZK registry is so slow that by then the server
-      // manager is usually init'ed in time for the RPC to be made. For now, adding an explicit
-      // wait() in the test, waiting for the server manager to become available.
-      final long timeout = TimeUnit.MINUTES.toMillis(10);
-      assertNotEquals("Timeout waiting for server manager to become available.",
-        -1, Waiter.waitFor(htu.getConfiguration(), timeout,
-          () -> masterThread.getMaster().getServerManager() != null));
-      htu.getConnection().getAdmin().shutdown();
+      final CompletableFuture<Void> shutdownFuture = CompletableFuture.runAsync(() -> {
+        // Switching to master registry exacerbated a race in the master bootstrap that can result
+        // in a lost shutdown command (HBASE-8422, HBASE-23836). The race is essentially because
+        // the server manager in HMaster is not initialized by the time shutdown() RPC (below) is
+        // made to the master. The suspected reason as to why it was uncommon before HBASE-18095
+        // is because the connection creation with ZK registry is so slow that by then the server
+        // manager is usually init'ed in time for the RPC to be made. For now, adding an explicit
+        // wait() in the test, waiting for the server manager to become available.
+        final long timeout = TimeUnit.MINUTES.toMillis(10);
+        assertNotEquals("timeout waiting for server manager to become available.", -1,
+          htu.waitFor(timeout, () -> masterThread.getMaster().getServerManager() != null));
+
+        // Master has come up far enough that we can terminate it without creating a zombie.
+        LOG.debug("Attempting to establish connection.");
+        try {
+          // HBASE-24327 : (Resolve Flaky connection issues)
+          // shutdown() RPC can have flaky ZK connection issues.
+          // e.g
+          // ERROR [RpcServer.priority.RWQ.Fifo.read.handler=1,queue=1,port=53033]
+          // master.HMaster(2878): ZooKeeper exception trying to set cluster as down in ZK
+          // org.apache.zookeeper.KeeperException$SystemErrorException:
+          // KeeperErrorCode = SystemError
+          //
+          // However, even when above flakes happen, shutdown call does get completed even if
+          // RPC call has failure. Hence, subsequent retries will never succeed as HMaster is
+          // already shutdown. Hence, it can fail. To resolve it, after making one shutdown()
+          // call, we are ignoring IOException.
+          htu.getConnection().getAdmin().shutdown();
+          LOG.info("Shutdown RPC sent.");
+        } catch (IOException | CompletionException e) {
+          LOG.warn("Failed to establish connection.", e);
+        } catch (Throwable e) {
+          LOG.warn("Something unexpected happened.", e);
+          throw new RuntimeException(e);
+        }
+      });
+
+      shutdownFuture.join();
       masterThread.join();
     } finally {
       if (hbaseCluster != null) {
@@ -186,19 +214,9 @@
     conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 1);
     // don't need a long write pipeline for this test.
     conf.setInt("dfs.replication", 1);
-    return conf;
-  }
-
-  /**
-   * Create a new {@link Configuration} based on {@code baseConf} that has ZooKeeper connection
-   * settings tuned very aggressively. The resulting client is used within a retry loop, so there's
-   * no value in having the client itself do the retries. We want to iterate on the base
-   * configuration because we're waiting for the mini-cluster to start and set it's ZK client port.
-   *
-   * @return a new, configured {@link Configuration} instance.
-   */
-  private static Configuration createResponsiveZkConfig(final Configuration baseConf) {
-    final Configuration conf = HBaseConfiguration.create(baseConf);
+    // reduce client retries
+    conf.setInt("hbase.client.retries.number", 3);
+    // Recoverable ZK configs are tuned more aggressively
     conf.setInt(ReadOnlyZKClient.RECOVERY_RETRY, 3);
     conf.setInt(ReadOnlyZKClient.RECOVERY_RETRY_INTERVAL_MILLIS, 100);
     return conf;
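
The core pattern the patch introduces, issue the shutdown() RPC once from a CompletableFuture and tolerate an IOException because the master acts on the request even when the RPC response is lost to a flaky ZooKeeper/RPC connection, can be sketched outside HBase as a minimal, standalone illustration. The RemoteService interface and class names below are hypothetical placeholders, not HBase APIs.

import java.io.IOException;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.CompletionException;

public class FireAndTolerateShutdown {

  /** Hypothetical stand-in for an Admin#shutdown()-style call over a possibly flaky connection. */
  interface RemoteService {
    void shutdown() throws IOException;
  }

  /** Sends the shutdown request on another thread; a flaky-connection error is tolerated. */
  static CompletableFuture<Void> requestShutdown(RemoteService service) {
    return CompletableFuture.runAsync(() -> {
      try {
        // One attempt only: the request takes effect on the server even if this client
        // sees a connection error, so retrying would just fail against a dying server.
        service.shutdown();
      } catch (IOException | CompletionException e) {
        // Tolerated: assume the shutdown was delivered despite the error.
        System.err.println("Ignoring flaky connection during shutdown: " + e);
      }
    });
  }

  public static void main(String[] args) {
    // Simulate a service whose RPC response is lost after the shutdown has been applied.
    RemoteService flaky = () -> {
      throw new IOException("connection reset after shutdown was applied");
    };
    requestShutdown(flaky).join(); // completes normally despite the IOException
  }
}

CompletionException is caught alongside IOException for the same reason the patch catches it: an asynchronously established connection may surface its failure wrapped that way rather than as a plain IOException.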