HBASE-24327 : Flaky connection in TestMasterShutdown#testMasterShutdo… (#1690)

Signed-off-by: Bharath Vissapragada <bharathv@apache.org>
This commit is contained in:
Viraj Jasani 2020-05-13 02:38:47 +05:30
parent ceb55c654d
commit 8f245aa8f8
No known key found for this signature in database
GPG Key ID: 3AE697641452FC5D
1 changed files with 46 additions and 28 deletions

View File

@ -22,6 +22,8 @@ import static org.junit.Assert.assertNotEquals;
import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNotNull;
import java.io.IOException; import java.io.IOException;
import java.util.List; import java.util.List;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.CompletionException;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.ClusterMetrics; import org.apache.hadoop.hbase.ClusterMetrics;
@ -31,7 +33,6 @@ import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.LocalHBaseCluster; import org.apache.hadoop.hbase.LocalHBaseCluster;
import org.apache.hadoop.hbase.MiniHBaseCluster; import org.apache.hadoop.hbase.MiniHBaseCluster;
import org.apache.hadoop.hbase.StartMiniClusterOption; import org.apache.hadoop.hbase.StartMiniClusterOption;
import org.apache.hadoop.hbase.Waiter;
import org.apache.hadoop.hbase.testclassification.LargeTests; import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.testclassification.MasterTests; import org.apache.hadoop.hbase.testclassification.MasterTests;
import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread; import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
@ -50,7 +51,7 @@ public class TestMasterShutdown {
@ClassRule @ClassRule
public static final HBaseClassTestRule CLASS_RULE = public static final HBaseClassTestRule CLASS_RULE =
HBaseClassTestRule.forClass(TestMasterShutdown.class); HBaseClassTestRule.forClass(TestMasterShutdown.class);
private HBaseTestingUtility htu; private HBaseTestingUtility htu;
@ -127,7 +128,7 @@ public class TestMasterShutdown {
public void testMasterShutdownBeforeStartingAnyRegionServer() throws Exception { public void testMasterShutdownBeforeStartingAnyRegionServer() throws Exception {
LocalHBaseCluster hbaseCluster = null; LocalHBaseCluster hbaseCluster = null;
try { try {
htu = new HBaseTestingUtility( htu = new HBaseTestingUtility(
createMasterShutdownBeforeStartingAnyRegionServerConfiguration()); createMasterShutdownBeforeStartingAnyRegionServerConfiguration());
// configure a cluster with // configure a cluster with
@ -151,19 +152,46 @@ public class TestMasterShutdown {
hbaseCluster = new LocalHBaseCluster(htu.getConfiguration(), options.getNumMasters(), hbaseCluster = new LocalHBaseCluster(htu.getConfiguration(), options.getNumMasters(),
options.getNumRegionServers(), options.getMasterClass(), options.getRsClass()); options.getNumRegionServers(), options.getMasterClass(), options.getRsClass());
final MasterThread masterThread = hbaseCluster.getMasters().get(0); final MasterThread masterThread = hbaseCluster.getMasters().get(0);
masterThread.start(); masterThread.start();
// Switching to master registry exacerbated a race in the master bootstrap that can result final CompletableFuture<Void> shutdownFuture = CompletableFuture.runAsync(() -> {
// in a lost shutdown command (HBASE-8422, HBASE-23836). The race is essentially because // Switching to master registry exacerbated a race in the master bootstrap that can result
// the server manager in HMaster is not initialized by the time shutdown() RPC (below) is // in a lost shutdown command (HBASE-8422, HBASE-23836). The race is essentially because
// made to the master. The suspected reason as to why it was uncommon before HBASE-18095 // the server manager in HMaster is not initialized by the time shutdown() RPC (below) is
// is because the connection creation with ZK registry is so slow that by then the server // made to the master. The suspected reason as to why it was uncommon before HBASE-18095
// manager is usually init'ed in time for the RPC to be made. For now, adding an explicit // is because the connection creation with ZK registry is so slow that by then the server
// wait() in the test, waiting for the server manager to become available. // manager is usually init'ed in time for the RPC to be made. For now, adding an explicit
final long timeout = TimeUnit.MINUTES.toMillis(10); // wait() in the test, waiting for the server manager to become available.
assertNotEquals("Timeout waiting for server manager to become available.", final long timeout = TimeUnit.MINUTES.toMillis(10);
-1, Waiter.waitFor(htu.getConfiguration(), timeout, assertNotEquals("timeout waiting for server manager to become available.", -1,
() -> masterThread.getMaster().getServerManager() != null)); htu.waitFor(timeout, () -> masterThread.getMaster().getServerManager() != null));
htu.getConnection().getAdmin().shutdown();
// Master has come up far enough that we can terminate it without creating a zombie.
LOG.debug("Attempting to establish connection.");
try {
// HBASE-24327 : (Resolve Flaky connection issues)
// shutdown() RPC can have flaky ZK connection issues.
// e.g
// ERROR [RpcServer.priority.RWQ.Fifo.read.handler=1,queue=1,port=53033]
// master.HMaster(2878): ZooKeeper exception trying to set cluster as down in ZK
// org.apache.zookeeper.KeeperException$SystemErrorException:
// KeeperErrorCode = SystemError
//
// However, even when above flakes happen, shutdown call does get completed even if
// RPC call has failure. Hence, subsequent retries will never succeed as HMaster is
// already shutdown. Hence, it can fail. To resolve it, after making one shutdown()
// call, we are ignoring IOException.
htu.getConnection().getAdmin().shutdown();
LOG.info("Shutdown RPC sent.");
} catch (IOException | CompletionException e) {
LOG.warn("Failed to establish connection.", e);
} catch (Throwable e) {
LOG.warn("Something unexpected happened.", e);
throw new RuntimeException(e);
}
});
shutdownFuture.join();
masterThread.join(); masterThread.join();
} finally { } finally {
if (hbaseCluster != null) { if (hbaseCluster != null) {
@ -186,19 +214,9 @@ public class TestMasterShutdown {
conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 1); conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 1);
// don't need a long write pipeline for this test. // don't need a long write pipeline for this test.
conf.setInt("dfs.replication", 1); conf.setInt("dfs.replication", 1);
return conf; // reduce client retries
} conf.setInt("hbase.client.retries.number", 3);
// Recoverable ZK configs are tuned more aggressively
/**
* Create a new {@link Configuration} based on {@code baseConf} that has ZooKeeper connection
* settings tuned very aggressively. The resulting client is used within a retry loop, so there's
* no value in having the client itself do the retries. We want to iterate on the base
* configuration because we're waiting for the mini-cluster to start and set it's ZK client port.
*
* @return a new, configured {@link Configuration} instance.
*/
private static Configuration createResponsiveZkConfig(final Configuration baseConf) {
final Configuration conf = HBaseConfiguration.create(baseConf);
conf.setInt(ReadOnlyZKClient.RECOVERY_RETRY, 3); conf.setInt(ReadOnlyZKClient.RECOVERY_RETRY, 3);
conf.setInt(ReadOnlyZKClient.RECOVERY_RETRY_INTERVAL_MILLIS, 100); conf.setInt(ReadOnlyZKClient.RECOVERY_RETRY_INTERVAL_MILLIS, 100);
return conf; return conf;