HBASE-24052 Add debug+fix to TestMasterShutdown
Add check for stopped server at a few more points in Master startup. Defend against NPE in RSProcedureDispatcher; log and return instead.
parent d318ca1741
commit 030e833dc9
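In outline, the RSProcedureDispatcher change below trades a potential NullPointerException for a debug log and an early return. The following is a minimal, self-contained sketch of that guard shape only; the StartupGuardSketch class and its Master/ServerManager interfaces are stand-ins for illustration, not the HBase types, and it assumes slf4j (the logging API HBase uses) is on the classpath.

// Sketch only: simplified stand-ins, not the HBase classes touched by this commit.
final class StartupGuardSketch {
  private static final org.slf4j.Logger LOG =
    org.slf4j.LoggerFactory.getLogger(StartupGuardSketch.class);

  interface ServerManager { void registerListener(Object listener); }

  interface Master {
    ServerManager getServerManager(); // may still be null if startup failed or was aborted
    boolean isStopping();
  }

  // Instead of dereferencing a component that a failed startup may have left null,
  // log at DEBUG and report "not started" to the caller.
  static boolean start(Master master, Object listener) {
    ServerManager sm = master.getServerManager();
    if (sm == null) {
      LOG.debug("ServerManager is null; stopping={}", master.isStopping());
      return false;
    }
    sm.registerListener(listener);
    return true;
  }
}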
@@ -2793,6 +2793,7 @@ public class HMaster extends HRegionServer implements MasterServices {
   @Override
   public void abort(String reason, Throwable cause) {
     if (isAborted() || isStopped()) {
+      LOG.debug("Abort called but aborted={}, stopped={}", isAborted(), isStopped());
       return;
     }
     setAbortRequested();
@@ -31,6 +31,7 @@ import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException;
 import org.apache.hadoop.hbase.master.MasterServices;
 import org.apache.hadoop.hbase.master.ServerListener;
 import org.apache.hadoop.hbase.master.ServerManager;
+import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
 import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher;
 import org.apache.hadoop.hbase.regionserver.RegionServerAbortedException;
 import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException;
@@ -95,10 +96,24 @@ public class RSProcedureDispatcher
       return false;
     }
     // Around startup, if failed, some of the below may be set back to null so NPE is possible.
+    ServerManager sm = master.getServerManager();
+    if (sm == null) {
+      LOG.debug("ServerManager is null; stopping={}", master.isStopping());
+      return false;
+    }
+    sm.registerListener(this);
+    ProcedureExecutor<MasterProcedureEnv> pe = master.getMasterProcedureExecutor();
+    if (pe == null) {
+      LOG.debug("ProcedureExecutor is null; stopping={}", master.isStopping());
+      return false;
+    }
+    procedureEnv = pe.getEnvironment();
+    if (this.procedureEnv == null) {
+      LOG.debug("ProcedureEnv is null; stopping={}", master.isStopping());
+      return false;
+    }
     try {
-      master.getServerManager().registerListener(this);
-      procedureEnv = master.getMasterProcedureExecutor().getEnvironment();
-      for (ServerName serverName : master.getServerManager().getOnlineServersList()) {
+      for (ServerName serverName : sm.getOnlineServersList()) {
         addNode(serverName);
       }
     } catch (Exception e) {
@@ -942,6 +942,10 @@ public class HRegionServer extends HasThread implements
    */
   @Override
   public void run() {
+    if (isStopped()) {
+      LOG.info("Skipping run; stopped");
+      return;
+    }
     try {
       // Do pre-registration initializations; zookeeper, lease threads, etc.
       preRegistrationInitialization();
@@ -955,24 +959,25 @@ public class HRegionServer extends HasThread implements
         // Initialize the RegionServerCoprocessorHost now that our ephemeral
         // node was created, in case any coprocessors want to use ZooKeeper
         this.rsHost = new RegionServerCoprocessorHost(this, this.conf);
-      }

-      // Try and register with the Master; tell it we are here. Break if server is stopped or the
-      // clusterup flag is down or hdfs went wacky. Once registered successfully, go ahead and start
-      // up all Services. Use RetryCounter to get backoff in case Master is struggling to come up.
-      LOG.debug("About to register with Master.");
-      RetryCounterFactory rcf = new RetryCounterFactory(Integer.MAX_VALUE,
-          this.sleeper.getPeriod(), 1000 * 60 * 5);
-      RetryCounter rc = rcf.create();
-      while (keepLooping()) {
-        RegionServerStartupResponse w = reportForDuty();
-        if (w == null) {
-          long sleepTime = rc.getBackoffTimeAndIncrementAttempts();
-          LOG.warn("reportForDuty failed; sleeping {} ms and then retrying.", sleepTime);
-          this.sleeper.sleep(sleepTime);
-        } else {
-          handleReportForDutyResponse(w);
-          break;
+        // Try and register with the Master; tell it we are here. Break if server is stopped or
+        // the clusterup flag is down or hdfs went wacky. Once registered successfully, go ahead and
+        // start up all Services. Use RetryCounter to get backoff in case Master is struggling to
+        // come up.
+        LOG.debug("About to register with Master.");
+        RetryCounterFactory rcf =
+          new RetryCounterFactory(Integer.MAX_VALUE, this.sleeper.getPeriod(), 1000 * 60 * 5);
+        RetryCounter rc = rcf.create();
+        while (keepLooping()) {
+          RegionServerStartupResponse w = reportForDuty();
+          if (w == null) {
+            long sleepTime = rc.getBackoffTimeAndIncrementAttempts();
+            LOG.warn("reportForDuty failed; sleeping {} ms and then retrying.", sleepTime);
+            this.sleeper.sleep(sleepTime);
+          } else {
+            handleReportForDutyResponse(w);
+            break;
+          }
         }
       }
@@ -21,10 +21,7 @@ import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotEquals;
 import static org.junit.Assert.assertNotNull;
 import java.io.IOException;
-import java.time.Duration;
 import java.util.List;
-import java.util.concurrent.CompletableFuture;
-import java.util.concurrent.CompletionException;
 import java.util.concurrent.TimeUnit;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hbase.ClusterMetrics;
@@ -35,8 +32,6 @@ import org.apache.hadoop.hbase.LocalHBaseCluster;
 import org.apache.hadoop.hbase.MiniHBaseCluster;
 import org.apache.hadoop.hbase.StartMiniClusterOption;
 import org.apache.hadoop.hbase.Waiter;
-import org.apache.hadoop.hbase.client.AsyncConnection;
-import org.apache.hadoop.hbase.client.ConnectionFactory;
 import org.apache.hadoop.hbase.testclassification.LargeTests;
 import org.apache.hadoop.hbase.testclassification.MasterTests;
 import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
@@ -156,48 +151,19 @@ public class TestMasterShutdown {
       hbaseCluster = new LocalHBaseCluster(htu.getConfiguration(), options.getNumMasters(),
         options.getNumRegionServers(), options.getMasterClass(), options.getRsClass());
       final MasterThread masterThread = hbaseCluster.getMasters().get(0);
-
-      final CompletableFuture<Void> shutdownFuture = CompletableFuture.runAsync(() -> {
-        // Switching to master registry exacerbated a race in the master bootstrap that can result
-        // in a lost shutdown command (HBASE-8422, HBASE-23836). The race is essentially because
-        // the server manager in HMaster is not initialized by the time shutdown() RPC (below) is
-        // made to the master. The suspected reason as to why it was uncommon before HBASE-18095
-        // is because the connection creation with ZK registry is so slow that by then the server
-        // manager is usually init'ed in time for the RPC to be made. For now, adding an explicit
-        // wait() in the test, waiting for the server manager to become available.
-        final long timeout = TimeUnit.MINUTES.toMillis(10);
-        assertNotEquals("timeout waiting for server manager to become available.",
-          -1, Waiter.waitFor(htu.getConfiguration(), timeout,
-            () -> masterThread.getMaster().getServerManager() != null));
-
-        // Master has come up far enough that we can terminate it without creating a zombie.
-        final long result = Waiter.waitFor(htu.getConfiguration(), timeout, 500, () -> {
-          final Configuration conf = createResponsiveZkConfig(htu.getConfiguration());
-          LOG.debug("Attempting to establish connection.");
-          final CompletableFuture<AsyncConnection> connFuture =
-            ConnectionFactory.createAsyncConnection(conf);
-          try (final AsyncConnection conn = connFuture.join()) {
-            LOG.debug("Sending shutdown RPC.");
-            try {
-              conn.getAdmin().shutdown().join();
-              LOG.debug("Shutdown RPC sent.");
-              return true;
-            } catch (CompletionException e) {
-              LOG.debug("Failure sending shutdown RPC.");
-            }
-          } catch (IOException|CompletionException e) {
-            LOG.debug("Failed to establish connection.");
-          } catch (Throwable e) {
-            LOG.info("Something unexpected happened.", e);
-          }
-          return false;
-        });
-        assertNotEquals("Failed to issue shutdown RPC after " + Duration.ofMillis(timeout),
-          -1, result);
-      });
-
       masterThread.start();
-      shutdownFuture.join();
+      // Switching to master registry exacerbated a race in the master bootstrap that can result
+      // in a lost shutdown command (HBASE-8422, HBASE-23836). The race is essentially because
+      // the server manager in HMaster is not initialized by the time shutdown() RPC (below) is
+      // made to the master. The suspected reason as to why it was uncommon before HBASE-18095
+      // is because the connection creation with ZK registry is so slow that by then the server
+      // manager is usually init'ed in time for the RPC to be made. For now, adding an explicit
+      // wait() in the test, waiting for the server manager to become available.
+      final long timeout = TimeUnit.MINUTES.toMillis(10);
+      assertNotEquals("Timeout waiting for server manager to become available.",
+        -1, Waiter.waitFor(htu.getConfiguration(), timeout,
+          () -> masterThread.getMaster().getServerManager() != null));
+      htu.getConnection().getAdmin().shutdown();
       masterThread.join();
     } finally {
       if (hbaseCluster != null) {