Master hangs forever if RecoverMetaProcedure fails to send the assign meta region request to the target server
This commit is contained in:
parent
ba5f9ac380
commit
d3aeaeffa4
|
@ -572,6 +572,10 @@ public class ServerManager {
|
|||
if (!master.isServerCrashProcessingEnabled()) {
|
||||
LOG.info("Master doesn't enable ServerShutdownHandler during initialization, "
|
||||
+ "delay expiring server " + serverName);
|
||||
// Even though we delay expiring this server, we still need to handle meta's RIT
|
||||
// that targets the crashed server: while RecoverMetaProcedure is running,
|
||||
// SCP is not enabled yet and meta's RIT may stay suspended forever. See HBASE-19287
|
||||
master.getAssignmentManager().handleMetaRITOnCrashedServer(serverName);
|
||||
this.queuedDeadServers.add(serverName);
|
||||
return;
|
||||
}
|
||||
|
|
|
@ -48,6 +48,7 @@ import org.apache.hadoop.hbase.TableName;
|
|||
import org.apache.hadoop.hbase.YouAreDeadException;
|
||||
import org.apache.hadoop.hbase.client.RegionInfo;
|
||||
import org.apache.hadoop.hbase.client.RegionInfoBuilder;
|
||||
import org.apache.hadoop.hbase.client.RegionReplicaUtil;
|
||||
import org.apache.hadoop.hbase.client.TableState;
|
||||
import org.apache.hadoop.hbase.exceptions.UnexpectedStateException;
|
||||
import org.apache.hadoop.hbase.favored.FavoredNodesManager;
|
||||
|
@ -70,6 +71,7 @@ import org.apache.hadoop.hbase.master.normalizer.RegionNormalizer;
|
|||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
|
||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureScheduler;
|
||||
import org.apache.hadoop.hbase.master.procedure.ProcedureSyncWait;
|
||||
import org.apache.hadoop.hbase.master.procedure.ServerCrashException;
|
||||
import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
|
||||
import org.apache.hadoop.hbase.procedure2.Procedure;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureEvent;
|
||||
|
@ -78,6 +80,7 @@ import org.apache.hadoop.hbase.procedure2.ProcedureInMemoryChore;
|
|||
import org.apache.hadoop.hbase.procedure2.util.StringUtils;
|
||||
import org.apache.hadoop.hbase.shaded.com.google.common.annotations.VisibleForTesting;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionTransitionState;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionRequest;
|
||||
|
@ -1322,7 +1325,7 @@ public class AssignmentManager implements ServerListener {
|
|||
}
|
||||
|
||||
public void submitServerCrash(final ServerName serverName, final boolean shouldSplitWal) {
|
||||
boolean carryingMeta = master.getAssignmentManager().isCarryingMeta(serverName);
|
||||
boolean carryingMeta = isCarryingMeta(serverName);
|
||||
ProcedureExecutor<MasterProcedureEnv> procExec = this.master.getMasterProcedureExecutor();
|
||||
procExec.submitProcedure(new ServerCrashProcedure(procExec.getEnvironment(), serverName,
|
||||
shouldSplitWal, carryingMeta));
|
||||
|
@ -1853,4 +1856,31 @@ public class AssignmentManager implements ServerListener {
|
|||
}*/
|
||||
master.getServerManager().expireServer(serverNode.getServerName());
|
||||
}
|
||||
|
||||
/**
|
||||
* Handle RIT of meta region against crashed server
|
||||
* Only used when ServerCrashProcedure is not enabled.
|
||||
*
|
||||
* @param serverName Server that has already crashed
|
||||
*/
|
||||
public void handleMetaRITOnCrashedServer(ServerName serverName) {
|
||||
RegionInfo hri = RegionReplicaUtil
|
||||
.getRegionInfoForReplica(RegionInfoBuilder.FIRST_META_REGIONINFO,
|
||||
RegionInfo.DEFAULT_REPLICA_ID);
|
||||
RegionState regionStateNode = getRegionStates().getRegionState(hri);
|
||||
if (!regionStateNode.getServerName().equals(serverName)) {
|
||||
return;
|
||||
}
|
||||
// meta has been assigned to crashed server.
|
||||
LOG.info("Meta has been assigned to crashed server: " + serverName + "; will do re-assign");
|
||||
// handle failure and wake event
|
||||
RegionTransitionProcedure rtp = getRegionStates().getRegionTransitionProcedure(hri);
|
||||
// Not need to consider for REGION_TRANSITION_QUEUE step
|
||||
if (rtp != null && rtp.isMeta()
|
||||
&& rtp.getTransitionState() == RegionTransitionState.REGION_TRANSITION_DISPATCH) {
|
||||
LOG.info("Re-do rit procedure: " + rtp.toString());
|
||||
rtp.remoteCallFailed(master.getMasterProcedureExecutor().getEnvironment(), serverName,
|
||||
new ServerCrashException(rtp.getProcId(), serverName));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -212,9 +212,14 @@ public class MockNoopMasterServices implements MasterServices, Server {
|
|||
return null;
|
||||
}
|
||||
|
||||
private boolean serverCrashProcessingEnabled = true;
|
||||
|
||||
public void setServerCrashProcessingEnabled(boolean b) {
|
||||
serverCrashProcessingEnabled = b;
|
||||
}
|
||||
@Override
|
||||
public boolean isServerCrashProcessingEnabled() {
|
||||
return true;
|
||||
return serverCrashProcessingEnabled;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -21,6 +21,7 @@ import static org.mockito.ArgumentMatchers.any;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.NavigableMap;
|
||||
import java.util.SortedSet;
|
||||
|
@ -173,6 +174,30 @@ public class MockMasterServices extends MockNoopMasterServices {
|
|||
this.procedureExecutor.getEnvironment().setEventReady(initialized, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* Call this restart method only after running MockMasterServices#start()
|
||||
* The RSs can be differentiated by the port number, see
|
||||
* ServerName in MockMasterServices#start() method above.
|
||||
* Restart of region server will have new startcode in server name
|
||||
*
|
||||
* @param serverName Server name to be restarted
|
||||
*/
|
||||
public void restartRegionServer(ServerName serverName) throws IOException {
|
||||
List<ServerName> onlineServers = serverManager.getOnlineServersList();
|
||||
long startCode = -1;
|
||||
for (ServerName s : onlineServers) {
|
||||
if (s.getAddress().equals(serverName.getAddress())) {
|
||||
startCode = s.getStartcode() + 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (startCode == -1) {
|
||||
return;
|
||||
}
|
||||
ServerName sn = ServerName.valueOf(serverName.getAddress().toString(), startCode);
|
||||
serverManager.regionServerReport(sn, ServerLoad.EMPTY_SERVERLOAD);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void stop(String why) {
|
||||
stopProcedureExecutor();
|
||||
|
|
|
@ -444,6 +444,34 @@ public class TestAssignmentManager {
|
|||
assertEquals(unassignFailedCount, unassignProcMetrics.getFailedCounter().getCount());
|
||||
}
|
||||
|
||||
/**
|
||||
* It is possible that when AM send assign meta request to a RS successfully,
|
||||
* but RS can not send back any response, which cause master startup hangs forever
|
||||
*/
|
||||
@Test
|
||||
public void testAssignMetaAndCrashBeforeResponse() throws Exception {
|
||||
tearDown();
|
||||
// See setUp(), start HBase until set up meta
|
||||
UTIL = new HBaseTestingUtility();
|
||||
this.executor = Executors.newSingleThreadScheduledExecutor();
|
||||
setupConfiguration(UTIL.getConfiguration());
|
||||
master = new MockMasterServices(UTIL.getConfiguration(), this.regionsToRegionServers);
|
||||
rsDispatcher = new MockRSProcedureDispatcher(master);
|
||||
master.start(NSERVERS, rsDispatcher);
|
||||
am = master.getAssignmentManager();
|
||||
|
||||
// Assign meta
|
||||
master.setServerCrashProcessingEnabled(false);
|
||||
rsDispatcher.setMockRsExecutor(new HangThenRSRestartExecutor());
|
||||
am.assign(RegionInfoBuilder.FIRST_META_REGIONINFO);
|
||||
assertEquals(true, am.isMetaInitialized());
|
||||
|
||||
// set it back as default, see setUpMeta()
|
||||
master.setServerCrashProcessingEnabled(true);
|
||||
am.wakeMetaLoadedEvent();
|
||||
am.setFailoverCleanupDone(true);
|
||||
}
|
||||
|
||||
private Future<byte[]> submitProcedure(final Procedure proc) {
|
||||
return ProcedureSyncWait.submitProcedure(master.getMasterProcedureExecutor(), proc);
|
||||
}
|
||||
|
@ -527,6 +555,14 @@ public class TestAssignmentManager {
|
|||
this.am.submitServerCrash(serverName, false/*No WALs here*/);
|
||||
}
|
||||
|
||||
private void doRestart(final ServerName serverName) {
|
||||
try {
|
||||
this.master.restartRegionServer(serverName);
|
||||
} catch (IOException e) {
|
||||
LOG.warn("Can not restart RS with new startcode");
|
||||
}
|
||||
}
|
||||
|
||||
private class NoopRsExecutor implements MockRSExecutor {
|
||||
public ExecuteProceduresResponse sendRequest(ServerName server,
|
||||
ExecuteProceduresRequest request) throws IOException {
|
||||
|
@ -678,6 +714,37 @@ public class TestAssignmentManager {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Takes open request and then returns nothing so acts like a RS that went zombie.
|
||||
* No response (so proc is stuck/suspended on the Master and won't wake up.).
|
||||
* Different with HangThenRSCrashExecutor, HangThenRSCrashExecutor will create
|
||||
* ServerCrashProcedure to handle the server crash. However, this HangThenRSRestartExecutor
|
||||
* will restart RS directly, situation for RS crashed when SCP is not enabled.
|
||||
*/
|
||||
private class HangThenRSRestartExecutor extends GoodRsExecutor {
|
||||
private int invocations;
|
||||
|
||||
@Override
|
||||
protected RegionOpeningState execOpenRegion(final ServerName server, RegionOpenInfo openReq)
|
||||
throws IOException {
|
||||
if (this.invocations++ > 0) {
|
||||
// Return w/o problem the second time through here.
|
||||
return super.execOpenRegion(server, openReq);
|
||||
}
|
||||
// The procedure on master will just hang forever because nothing comes back
|
||||
// from the RS in this case.
|
||||
LOG.info("Return null response from serverName=" + server + "; means STUCK...TODO timeout");
|
||||
executor.schedule(new Runnable() {
|
||||
@Override
|
||||
public void run() {
|
||||
LOG.info("Restarting RS of " + server);
|
||||
doRestart(server);
|
||||
}
|
||||
}, 1, TimeUnit.SECONDS);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private class HangOnCloseThenRSCrashExecutor extends GoodRsExecutor {
|
||||
public static final int TYPES_OF_FAILURE = 6;
|
||||
private int invocations;
|
||||
|
|
Loading…
Reference in New Issue