Master hangs forever if RecoverMeta fails to send the assign meta region request to the target server
This commit is contained in:
parent
ba5f9ac380
commit
d3aeaeffa4
|
@ -572,6 +572,10 @@ public class ServerManager {
|
||||||
if (!master.isServerCrashProcessingEnabled()) {
|
if (!master.isServerCrashProcessingEnabled()) {
|
||||||
LOG.info("Master doesn't enable ServerShutdownHandler during initialization, "
|
LOG.info("Master doesn't enable ServerShutdownHandler during initialization, "
|
||||||
+ "delay expiring server " + serverName);
|
+ "delay expiring server " + serverName);
|
||||||
|
// Even though we delay expiring this server, we still need to handle Meta's RIT
|
||||||
|
// that are against the crashed server; since when we do RecoverMetaProcedure,
|
||||||
|
// the SCP is not enabled yet and Meta's RIT may be suspended forever. See HBASE-19287
|
||||||
|
master.getAssignmentManager().handleMetaRITOnCrashedServer(serverName);
|
||||||
this.queuedDeadServers.add(serverName);
|
this.queuedDeadServers.add(serverName);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
|
@ -48,6 +48,7 @@ import org.apache.hadoop.hbase.TableName;
|
||||||
import org.apache.hadoop.hbase.YouAreDeadException;
|
import org.apache.hadoop.hbase.YouAreDeadException;
|
||||||
import org.apache.hadoop.hbase.client.RegionInfo;
|
import org.apache.hadoop.hbase.client.RegionInfo;
|
||||||
import org.apache.hadoop.hbase.client.RegionInfoBuilder;
|
import org.apache.hadoop.hbase.client.RegionInfoBuilder;
|
||||||
|
import org.apache.hadoop.hbase.client.RegionReplicaUtil;
|
||||||
import org.apache.hadoop.hbase.client.TableState;
|
import org.apache.hadoop.hbase.client.TableState;
|
||||||
import org.apache.hadoop.hbase.exceptions.UnexpectedStateException;
|
import org.apache.hadoop.hbase.exceptions.UnexpectedStateException;
|
||||||
import org.apache.hadoop.hbase.favored.FavoredNodesManager;
|
import org.apache.hadoop.hbase.favored.FavoredNodesManager;
|
||||||
|
@ -70,6 +71,7 @@ import org.apache.hadoop.hbase.master.normalizer.RegionNormalizer;
|
||||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
|
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
|
||||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureScheduler;
|
import org.apache.hadoop.hbase.master.procedure.MasterProcedureScheduler;
|
||||||
import org.apache.hadoop.hbase.master.procedure.ProcedureSyncWait;
|
import org.apache.hadoop.hbase.master.procedure.ProcedureSyncWait;
|
||||||
|
import org.apache.hadoop.hbase.master.procedure.ServerCrashException;
|
||||||
import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
|
import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
|
||||||
import org.apache.hadoop.hbase.procedure2.Procedure;
|
import org.apache.hadoop.hbase.procedure2.Procedure;
|
||||||
import org.apache.hadoop.hbase.procedure2.ProcedureEvent;
|
import org.apache.hadoop.hbase.procedure2.ProcedureEvent;
|
||||||
|
@ -78,6 +80,7 @@ import org.apache.hadoop.hbase.procedure2.ProcedureInMemoryChore;
|
||||||
import org.apache.hadoop.hbase.procedure2.util.StringUtils;
|
import org.apache.hadoop.hbase.procedure2.util.StringUtils;
|
||||||
import org.apache.hadoop.hbase.shaded.com.google.common.annotations.VisibleForTesting;
|
import org.apache.hadoop.hbase.shaded.com.google.common.annotations.VisibleForTesting;
|
||||||
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
|
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
|
||||||
|
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionTransitionState;
|
||||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition;
|
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition;
|
||||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
|
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
|
||||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionRequest;
|
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionRequest;
|
||||||
|
@ -1322,7 +1325,7 @@ public class AssignmentManager implements ServerListener {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void submitServerCrash(final ServerName serverName, final boolean shouldSplitWal) {
|
public void submitServerCrash(final ServerName serverName, final boolean shouldSplitWal) {
|
||||||
boolean carryingMeta = master.getAssignmentManager().isCarryingMeta(serverName);
|
boolean carryingMeta = isCarryingMeta(serverName);
|
||||||
ProcedureExecutor<MasterProcedureEnv> procExec = this.master.getMasterProcedureExecutor();
|
ProcedureExecutor<MasterProcedureEnv> procExec = this.master.getMasterProcedureExecutor();
|
||||||
procExec.submitProcedure(new ServerCrashProcedure(procExec.getEnvironment(), serverName,
|
procExec.submitProcedure(new ServerCrashProcedure(procExec.getEnvironment(), serverName,
|
||||||
shouldSplitWal, carryingMeta));
|
shouldSplitWal, carryingMeta));
|
||||||
|
@ -1853,4 +1856,31 @@ public class AssignmentManager implements ServerListener {
|
||||||
}*/
|
}*/
|
||||||
master.getServerManager().expireServer(serverNode.getServerName());
|
master.getServerManager().expireServer(serverNode.getServerName());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Handle RIT of meta region against crashed server
|
||||||
|
* Only used when ServerCrashProcedure is not enabled.
|
||||||
|
*
|
||||||
|
* @param serverName Server that has already crashed
|
||||||
|
*/
|
||||||
|
public void handleMetaRITOnCrashedServer(ServerName serverName) {
|
||||||
|
RegionInfo hri = RegionReplicaUtil
|
||||||
|
.getRegionInfoForReplica(RegionInfoBuilder.FIRST_META_REGIONINFO,
|
||||||
|
RegionInfo.DEFAULT_REPLICA_ID);
|
||||||
|
RegionState regionStateNode = getRegionStates().getRegionState(hri);
|
||||||
|
if (!regionStateNode.getServerName().equals(serverName)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
// meta has been assigned to crashed server.
|
||||||
|
LOG.info("Meta has been assigned to crashed server: " + serverName + "; will do re-assign");
|
||||||
|
// handle failure and wake event
|
||||||
|
RegionTransitionProcedure rtp = getRegionStates().getRegionTransitionProcedure(hri);
|
||||||
|
// Not need to consider for REGION_TRANSITION_QUEUE step
|
||||||
|
if (rtp != null && rtp.isMeta()
|
||||||
|
&& rtp.getTransitionState() == RegionTransitionState.REGION_TRANSITION_DISPATCH) {
|
||||||
|
LOG.info("Re-do rit procedure: " + rtp.toString());
|
||||||
|
rtp.remoteCallFailed(master.getMasterProcedureExecutor().getEnvironment(), serverName,
|
||||||
|
new ServerCrashException(rtp.getProcId(), serverName));
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -212,9 +212,14 @@ public class MockNoopMasterServices implements MasterServices, Server {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private boolean serverCrashProcessingEnabled = true;
|
||||||
|
|
||||||
|
public void setServerCrashProcessingEnabled(boolean b) {
|
||||||
|
serverCrashProcessingEnabled = b;
|
||||||
|
}
|
||||||
@Override
|
@Override
|
||||||
public boolean isServerCrashProcessingEnabled() {
|
public boolean isServerCrashProcessingEnabled() {
|
||||||
return true;
|
return serverCrashProcessingEnabled;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -21,6 +21,7 @@ import static org.mockito.ArgumentMatchers.any;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.NavigableMap;
|
import java.util.NavigableMap;
|
||||||
import java.util.SortedSet;
|
import java.util.SortedSet;
|
||||||
|
@ -173,6 +174,30 @@ public class MockMasterServices extends MockNoopMasterServices {
|
||||||
this.procedureExecutor.getEnvironment().setEventReady(initialized, true);
|
this.procedureExecutor.getEnvironment().setEventReady(initialized, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Call this restart method only after running MockMasterServices#start()
|
||||||
|
* The RSs can be differentiated by the port number, see
|
||||||
|
* ServerName in MockMasterServices#start() method above.
|
||||||
|
* Restart of region server will have new startcode in server name
|
||||||
|
*
|
||||||
|
* @param serverName Server name to be restarted
|
||||||
|
*/
|
||||||
|
public void restartRegionServer(ServerName serverName) throws IOException {
|
||||||
|
List<ServerName> onlineServers = serverManager.getOnlineServersList();
|
||||||
|
long startCode = -1;
|
||||||
|
for (ServerName s : onlineServers) {
|
||||||
|
if (s.getAddress().equals(serverName.getAddress())) {
|
||||||
|
startCode = s.getStartcode() + 1;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (startCode == -1) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
ServerName sn = ServerName.valueOf(serverName.getAddress().toString(), startCode);
|
||||||
|
serverManager.regionServerReport(sn, ServerLoad.EMPTY_SERVERLOAD);
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void stop(String why) {
|
public void stop(String why) {
|
||||||
stopProcedureExecutor();
|
stopProcedureExecutor();
|
||||||
|
|
|
@ -444,6 +444,34 @@ public class TestAssignmentManager {
|
||||||
assertEquals(unassignFailedCount, unassignProcMetrics.getFailedCounter().getCount());
|
assertEquals(unassignFailedCount, unassignProcMetrics.getFailedCounter().getCount());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* It is possible that when AM send assign meta request to a RS successfully,
|
||||||
|
* but RS can not send back any response, which cause master startup hangs forever
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
public void testAssignMetaAndCrashBeforeResponse() throws Exception {
|
||||||
|
tearDown();
|
||||||
|
// See setUp(), start HBase until set up meta
|
||||||
|
UTIL = new HBaseTestingUtility();
|
||||||
|
this.executor = Executors.newSingleThreadScheduledExecutor();
|
||||||
|
setupConfiguration(UTIL.getConfiguration());
|
||||||
|
master = new MockMasterServices(UTIL.getConfiguration(), this.regionsToRegionServers);
|
||||||
|
rsDispatcher = new MockRSProcedureDispatcher(master);
|
||||||
|
master.start(NSERVERS, rsDispatcher);
|
||||||
|
am = master.getAssignmentManager();
|
||||||
|
|
||||||
|
// Assign meta
|
||||||
|
master.setServerCrashProcessingEnabled(false);
|
||||||
|
rsDispatcher.setMockRsExecutor(new HangThenRSRestartExecutor());
|
||||||
|
am.assign(RegionInfoBuilder.FIRST_META_REGIONINFO);
|
||||||
|
assertEquals(true, am.isMetaInitialized());
|
||||||
|
|
||||||
|
// set it back as default, see setUpMeta()
|
||||||
|
master.setServerCrashProcessingEnabled(true);
|
||||||
|
am.wakeMetaLoadedEvent();
|
||||||
|
am.setFailoverCleanupDone(true);
|
||||||
|
}
|
||||||
|
|
||||||
private Future<byte[]> submitProcedure(final Procedure proc) {
|
private Future<byte[]> submitProcedure(final Procedure proc) {
|
||||||
return ProcedureSyncWait.submitProcedure(master.getMasterProcedureExecutor(), proc);
|
return ProcedureSyncWait.submitProcedure(master.getMasterProcedureExecutor(), proc);
|
||||||
}
|
}
|
||||||
|
@ -527,6 +555,14 @@ public class TestAssignmentManager {
|
||||||
this.am.submitServerCrash(serverName, false/*No WALs here*/);
|
this.am.submitServerCrash(serverName, false/*No WALs here*/);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void doRestart(final ServerName serverName) {
|
||||||
|
try {
|
||||||
|
this.master.restartRegionServer(serverName);
|
||||||
|
} catch (IOException e) {
|
||||||
|
LOG.warn("Can not restart RS with new startcode");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private class NoopRsExecutor implements MockRSExecutor {
|
private class NoopRsExecutor implements MockRSExecutor {
|
||||||
public ExecuteProceduresResponse sendRequest(ServerName server,
|
public ExecuteProceduresResponse sendRequest(ServerName server,
|
||||||
ExecuteProceduresRequest request) throws IOException {
|
ExecuteProceduresRequest request) throws IOException {
|
||||||
|
@ -678,6 +714,37 @@ public class TestAssignmentManager {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Takes open request and then returns nothing so acts like a RS that went zombie.
|
||||||
|
* No response (so proc is stuck/suspended on the Master and won't wake up.).
|
||||||
|
* Different with HangThenRSCrashExecutor, HangThenRSCrashExecutor will create
|
||||||
|
* ServerCrashProcedure to handle the server crash. However, this HangThenRSRestartExecutor
|
||||||
|
* will restart RS directly, situation for RS crashed when SCP is not enabled.
|
||||||
|
*/
|
||||||
|
private class HangThenRSRestartExecutor extends GoodRsExecutor {
|
||||||
|
private int invocations;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected RegionOpeningState execOpenRegion(final ServerName server, RegionOpenInfo openReq)
|
||||||
|
throws IOException {
|
||||||
|
if (this.invocations++ > 0) {
|
||||||
|
// Return w/o problem the second time through here.
|
||||||
|
return super.execOpenRegion(server, openReq);
|
||||||
|
}
|
||||||
|
// The procedure on master will just hang forever because nothing comes back
|
||||||
|
// from the RS in this case.
|
||||||
|
LOG.info("Return null response from serverName=" + server + "; means STUCK...TODO timeout");
|
||||||
|
executor.schedule(new Runnable() {
|
||||||
|
@Override
|
||||||
|
public void run() {
|
||||||
|
LOG.info("Restarting RS of " + server);
|
||||||
|
doRestart(server);
|
||||||
|
}
|
||||||
|
}, 1, TimeUnit.SECONDS);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private class HangOnCloseThenRSCrashExecutor extends GoodRsExecutor {
|
private class HangOnCloseThenRSCrashExecutor extends GoodRsExecutor {
|
||||||
public static final int TYPES_OF_FAILURE = 6;
|
public static final int TYPES_OF_FAILURE = 6;
|
||||||
private int invocations;
|
private int invocations;
|
||||||
|
|
Loading…
Reference in New Issue