HBASE-24052 Add debug+fix to TestMasterShutdown

Add check for stopped server at a few more points in Master startup.
Defend against NPE in RSProcedureDispatcher; log and return instead.
This commit is contained in:
stack 2020-03-27 09:36:05 -07:00
parent d318ca1741
commit 030e833dc9
4 changed files with 53 additions and 66 deletions

View File

@ -2793,6 +2793,7 @@ public class HMaster extends HRegionServer implements MasterServices {
@Override @Override
public void abort(String reason, Throwable cause) { public void abort(String reason, Throwable cause) {
if (isAborted() || isStopped()) { if (isAborted() || isStopped()) {
LOG.debug("Abort called but aborted={}, stopped={}", isAborted(), isStopped());
return; return;
} }
setAbortRequested(); setAbortRequested();

View File

@ -31,6 +31,7 @@ import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException;
import org.apache.hadoop.hbase.master.MasterServices; import org.apache.hadoop.hbase.master.MasterServices;
import org.apache.hadoop.hbase.master.ServerListener; import org.apache.hadoop.hbase.master.ServerListener;
import org.apache.hadoop.hbase.master.ServerManager; import org.apache.hadoop.hbase.master.ServerManager;
import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher; import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher;
import org.apache.hadoop.hbase.regionserver.RegionServerAbortedException; import org.apache.hadoop.hbase.regionserver.RegionServerAbortedException;
import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException; import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException;
@ -95,10 +96,24 @@ public class RSProcedureDispatcher
return false; return false;
} }
// Around startup, if failed, some of the below may be set back to null so NPE is possible. // Around startup, if failed, some of the below may be set back to null so NPE is possible.
ServerManager sm = master.getServerManager();
if (sm == null) {
LOG.debug("ServerManager is null; stopping={}", master.isStopping());
return false;
}
sm.registerListener(this);
ProcedureExecutor<MasterProcedureEnv> pe = master.getMasterProcedureExecutor();
if (pe == null) {
LOG.debug("ProcedureExecutor is null; stopping={}", master.isStopping());
return false;
}
procedureEnv = pe.getEnvironment();
if (this.procedureEnv == null) {
LOG.debug("ProcedureEnv is null; stopping={}", master.isStopping());
return false;
}
try { try {
master.getServerManager().registerListener(this); for (ServerName serverName : sm.getOnlineServersList()) {
procedureEnv = master.getMasterProcedureExecutor().getEnvironment();
for (ServerName serverName : master.getServerManager().getOnlineServersList()) {
addNode(serverName); addNode(serverName);
} }
} catch (Exception e) { } catch (Exception e) {

View File

@ -942,6 +942,10 @@ public class HRegionServer extends HasThread implements
*/ */
@Override @Override
public void run() { public void run() {
if (isStopped()) {
LOG.info("Skipping run; stopped");
return;
}
try { try {
// Do pre-registration initializations; zookeeper, lease threads, etc. // Do pre-registration initializations; zookeeper, lease threads, etc.
preRegistrationInitialization(); preRegistrationInitialization();
@ -955,24 +959,25 @@ public class HRegionServer extends HasThread implements
// Initialize the RegionServerCoprocessorHost now that our ephemeral // Initialize the RegionServerCoprocessorHost now that our ephemeral
// node was created, in case any coprocessors want to use ZooKeeper // node was created, in case any coprocessors want to use ZooKeeper
this.rsHost = new RegionServerCoprocessorHost(this, this.conf); this.rsHost = new RegionServerCoprocessorHost(this, this.conf);
}
// Try and register with the Master; tell it we are here. Break if server is stopped or the // Try and register with the Master; tell it we are here. Break if server is stopped or
// clusterup flag is down or hdfs went wacky. Once registered successfully, go ahead and start // the clusterup flag is down or hdfs went wacky. Once registered successfully, go ahead and
// up all Services. Use RetryCounter to get backoff in case Master is struggling to come up. // start up all Services. Use RetryCounter to get backoff in case Master is struggling to
LOG.debug("About to register with Master."); // come up.
RetryCounterFactory rcf = new RetryCounterFactory(Integer.MAX_VALUE, LOG.debug("About to register with Master.");
this.sleeper.getPeriod(), 1000 * 60 * 5); RetryCounterFactory rcf =
RetryCounter rc = rcf.create(); new RetryCounterFactory(Integer.MAX_VALUE, this.sleeper.getPeriod(), 1000 * 60 * 5);
while (keepLooping()) { RetryCounter rc = rcf.create();
RegionServerStartupResponse w = reportForDuty(); while (keepLooping()) {
if (w == null) { RegionServerStartupResponse w = reportForDuty();
long sleepTime = rc.getBackoffTimeAndIncrementAttempts(); if (w == null) {
LOG.warn("reportForDuty failed; sleeping {} ms and then retrying.", sleepTime); long sleepTime = rc.getBackoffTimeAndIncrementAttempts();
this.sleeper.sleep(sleepTime); LOG.warn("reportForDuty failed; sleeping {} ms and then retrying.", sleepTime);
} else { this.sleeper.sleep(sleepTime);
handleReportForDutyResponse(w); } else {
break; handleReportForDutyResponse(w);
break;
}
} }
} }

View File

@ -21,10 +21,7 @@ import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotEquals; import static org.junit.Assert.assertNotEquals;
import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNotNull;
import java.io.IOException; import java.io.IOException;
import java.time.Duration;
import java.util.List; import java.util.List;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.CompletionException;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.ClusterMetrics; import org.apache.hadoop.hbase.ClusterMetrics;
@ -35,8 +32,6 @@ import org.apache.hadoop.hbase.LocalHBaseCluster;
import org.apache.hadoop.hbase.MiniHBaseCluster; import org.apache.hadoop.hbase.MiniHBaseCluster;
import org.apache.hadoop.hbase.StartMiniClusterOption; import org.apache.hadoop.hbase.StartMiniClusterOption;
import org.apache.hadoop.hbase.Waiter; import org.apache.hadoop.hbase.Waiter;
import org.apache.hadoop.hbase.client.AsyncConnection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.testclassification.LargeTests; import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.testclassification.MasterTests; import org.apache.hadoop.hbase.testclassification.MasterTests;
import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread; import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
@ -156,48 +151,19 @@ public class TestMasterShutdown {
hbaseCluster = new LocalHBaseCluster(htu.getConfiguration(), options.getNumMasters(), hbaseCluster = new LocalHBaseCluster(htu.getConfiguration(), options.getNumMasters(),
options.getNumRegionServers(), options.getMasterClass(), options.getRsClass()); options.getNumRegionServers(), options.getMasterClass(), options.getRsClass());
final MasterThread masterThread = hbaseCluster.getMasters().get(0); final MasterThread masterThread = hbaseCluster.getMasters().get(0);
final CompletableFuture<Void> shutdownFuture = CompletableFuture.runAsync(() -> {
// Switching to master registry exacerbated a race in the master bootstrap that can result
// in a lost shutdown command (HBASE-8422, HBASE-23836). The race is essentially because
// the server manager in HMaster is not initialized by the time shutdown() RPC (below) is
// made to the master. The suspected reason as to why it was uncommon before HBASE-18095
// is because the connection creation with ZK registry is so slow that by then the server
// manager is usually init'ed in time for the RPC to be made. For now, adding an explicit
// wait() in the test, waiting for the server manager to become available.
final long timeout = TimeUnit.MINUTES.toMillis(10);
assertNotEquals("timeout waiting for server manager to become available.",
-1, Waiter.waitFor(htu.getConfiguration(), timeout,
() -> masterThread.getMaster().getServerManager() != null));
// Master has come up far enough that we can terminate it without creating a zombie.
final long result = Waiter.waitFor(htu.getConfiguration(), timeout, 500, () -> {
final Configuration conf = createResponsiveZkConfig(htu.getConfiguration());
LOG.debug("Attempting to establish connection.");
final CompletableFuture<AsyncConnection> connFuture =
ConnectionFactory.createAsyncConnection(conf);
try (final AsyncConnection conn = connFuture.join()) {
LOG.debug("Sending shutdown RPC.");
try {
conn.getAdmin().shutdown().join();
LOG.debug("Shutdown RPC sent.");
return true;
} catch (CompletionException e) {
LOG.debug("Failure sending shutdown RPC.");
}
} catch (IOException|CompletionException e) {
LOG.debug("Failed to establish connection.");
} catch (Throwable e) {
LOG.info("Something unexpected happened.", e);
}
return false;
});
assertNotEquals("Failed to issue shutdown RPC after " + Duration.ofMillis(timeout),
-1, result);
});
masterThread.start(); masterThread.start();
shutdownFuture.join(); // Switching to master registry exacerbated a race in the master bootstrap that can result
// in a lost shutdown command (HBASE-8422, HBASE-23836). The race is essentially because
// the server manager in HMaster is not initialized by the time shutdown() RPC (below) is
// made to the master. The suspected reason as to why it was uncommon before HBASE-18095
// is because the connection creation with ZK registry is so slow that by then the server
// manager is usually init'ed in time for the RPC to be made. For now, adding an explicit
// wait() in the test, waiting for the server manager to become available.
final long timeout = TimeUnit.MINUTES.toMillis(10);
assertNotEquals("Timeout waiting for server manager to become available.",
-1, Waiter.waitFor(htu.getConfiguration(), timeout,
() -> masterThread.getMaster().getServerManager() != null));
htu.getConnection().getAdmin().shutdown();
masterThread.join(); masterThread.join();
} finally { } finally {
if (hbaseCluster != null) { if (hbaseCluster != null) {