Revert "master hangs forever if RecoverMeta send assign meta region request to target server fail"
Reverting because missing issue JIRA #
This reverts commit d3aeaeffa4
.
This commit is contained in:
parent
d3aeaeffa4
commit
9ef115163b
|
@ -572,10 +572,6 @@ public class ServerManager {
|
||||||
if (!master.isServerCrashProcessingEnabled()) {
|
if (!master.isServerCrashProcessingEnabled()) {
|
||||||
LOG.info("Master doesn't enable ServerShutdownHandler during initialization, "
|
LOG.info("Master doesn't enable ServerShutdownHandler during initialization, "
|
||||||
+ "delay expiring server " + serverName);
|
+ "delay expiring server " + serverName);
|
||||||
// Even we delay expire this server, we still need to handle Meta's RIT
|
|
||||||
// that are against the crashed server; since when we do RecoverMetaProcedure,
|
|
||||||
// the SCP is not enable yet and Meta's RIT may be suspend forever. See HBase-19287
|
|
||||||
master.getAssignmentManager().handleMetaRITOnCrashedServer(serverName);
|
|
||||||
this.queuedDeadServers.add(serverName);
|
this.queuedDeadServers.add(serverName);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
|
@ -48,7 +48,6 @@ import org.apache.hadoop.hbase.TableName;
|
||||||
import org.apache.hadoop.hbase.YouAreDeadException;
|
import org.apache.hadoop.hbase.YouAreDeadException;
|
||||||
import org.apache.hadoop.hbase.client.RegionInfo;
|
import org.apache.hadoop.hbase.client.RegionInfo;
|
||||||
import org.apache.hadoop.hbase.client.RegionInfoBuilder;
|
import org.apache.hadoop.hbase.client.RegionInfoBuilder;
|
||||||
import org.apache.hadoop.hbase.client.RegionReplicaUtil;
|
|
||||||
import org.apache.hadoop.hbase.client.TableState;
|
import org.apache.hadoop.hbase.client.TableState;
|
||||||
import org.apache.hadoop.hbase.exceptions.UnexpectedStateException;
|
import org.apache.hadoop.hbase.exceptions.UnexpectedStateException;
|
||||||
import org.apache.hadoop.hbase.favored.FavoredNodesManager;
|
import org.apache.hadoop.hbase.favored.FavoredNodesManager;
|
||||||
|
@ -71,7 +70,6 @@ import org.apache.hadoop.hbase.master.normalizer.RegionNormalizer;
|
||||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
|
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
|
||||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureScheduler;
|
import org.apache.hadoop.hbase.master.procedure.MasterProcedureScheduler;
|
||||||
import org.apache.hadoop.hbase.master.procedure.ProcedureSyncWait;
|
import org.apache.hadoop.hbase.master.procedure.ProcedureSyncWait;
|
||||||
import org.apache.hadoop.hbase.master.procedure.ServerCrashException;
|
|
||||||
import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
|
import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
|
||||||
import org.apache.hadoop.hbase.procedure2.Procedure;
|
import org.apache.hadoop.hbase.procedure2.Procedure;
|
||||||
import org.apache.hadoop.hbase.procedure2.ProcedureEvent;
|
import org.apache.hadoop.hbase.procedure2.ProcedureEvent;
|
||||||
|
@ -80,7 +78,6 @@ import org.apache.hadoop.hbase.procedure2.ProcedureInMemoryChore;
|
||||||
import org.apache.hadoop.hbase.procedure2.util.StringUtils;
|
import org.apache.hadoop.hbase.procedure2.util.StringUtils;
|
||||||
import org.apache.hadoop.hbase.shaded.com.google.common.annotations.VisibleForTesting;
|
import org.apache.hadoop.hbase.shaded.com.google.common.annotations.VisibleForTesting;
|
||||||
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
|
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
|
||||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionTransitionState;
|
|
||||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition;
|
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition;
|
||||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
|
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
|
||||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionRequest;
|
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionRequest;
|
||||||
|
@ -1325,7 +1322,7 @@ public class AssignmentManager implements ServerListener {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void submitServerCrash(final ServerName serverName, final boolean shouldSplitWal) {
|
public void submitServerCrash(final ServerName serverName, final boolean shouldSplitWal) {
|
||||||
boolean carryingMeta = isCarryingMeta(serverName);
|
boolean carryingMeta = master.getAssignmentManager().isCarryingMeta(serverName);
|
||||||
ProcedureExecutor<MasterProcedureEnv> procExec = this.master.getMasterProcedureExecutor();
|
ProcedureExecutor<MasterProcedureEnv> procExec = this.master.getMasterProcedureExecutor();
|
||||||
procExec.submitProcedure(new ServerCrashProcedure(procExec.getEnvironment(), serverName,
|
procExec.submitProcedure(new ServerCrashProcedure(procExec.getEnvironment(), serverName,
|
||||||
shouldSplitWal, carryingMeta));
|
shouldSplitWal, carryingMeta));
|
||||||
|
@ -1856,31 +1853,4 @@ public class AssignmentManager implements ServerListener {
|
||||||
}*/
|
}*/
|
||||||
master.getServerManager().expireServer(serverNode.getServerName());
|
master.getServerManager().expireServer(serverNode.getServerName());
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Handle RIT of meta region against crashed server
|
|
||||||
* Only used when ServerCrashProcedure is not enabled.
|
|
||||||
*
|
|
||||||
* @param serverName Server that has already crashed
|
|
||||||
*/
|
|
||||||
public void handleMetaRITOnCrashedServer(ServerName serverName) {
|
|
||||||
RegionInfo hri = RegionReplicaUtil
|
|
||||||
.getRegionInfoForReplica(RegionInfoBuilder.FIRST_META_REGIONINFO,
|
|
||||||
RegionInfo.DEFAULT_REPLICA_ID);
|
|
||||||
RegionState regionStateNode = getRegionStates().getRegionState(hri);
|
|
||||||
if (!regionStateNode.getServerName().equals(serverName)) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
// meta has been assigned to crashed server.
|
|
||||||
LOG.info("Meta has been assigned to crashed server: " + serverName + "; will do re-assign");
|
|
||||||
// handle failure and wake event
|
|
||||||
RegionTransitionProcedure rtp = getRegionStates().getRegionTransitionProcedure(hri);
|
|
||||||
// Not need to consider for REGION_TRANSITION_QUEUE step
|
|
||||||
if (rtp != null && rtp.isMeta()
|
|
||||||
&& rtp.getTransitionState() == RegionTransitionState.REGION_TRANSITION_DISPATCH) {
|
|
||||||
LOG.info("Re-do rit procedure: " + rtp.toString());
|
|
||||||
rtp.remoteCallFailed(master.getMasterProcedureExecutor().getEnvironment(), serverName,
|
|
||||||
new ServerCrashException(rtp.getProcId(), serverName));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -212,14 +212,9 @@ public class MockNoopMasterServices implements MasterServices, Server {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean serverCrashProcessingEnabled = true;
|
|
||||||
|
|
||||||
public void setServerCrashProcessingEnabled(boolean b) {
|
|
||||||
serverCrashProcessingEnabled = b;
|
|
||||||
}
|
|
||||||
@Override
|
@Override
|
||||||
public boolean isServerCrashProcessingEnabled() {
|
public boolean isServerCrashProcessingEnabled() {
|
||||||
return serverCrashProcessingEnabled;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -21,7 +21,6 @@ import static org.mockito.ArgumentMatchers.any;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.NavigableMap;
|
import java.util.NavigableMap;
|
||||||
import java.util.SortedSet;
|
import java.util.SortedSet;
|
||||||
|
@ -174,30 +173,6 @@ public class MockMasterServices extends MockNoopMasterServices {
|
||||||
this.procedureExecutor.getEnvironment().setEventReady(initialized, true);
|
this.procedureExecutor.getEnvironment().setEventReady(initialized, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Call this restart method only after running MockMasterServices#start()
|
|
||||||
* The RSs can be differentiated by the port number, see
|
|
||||||
* ServerName in MockMasterServices#start() method above.
|
|
||||||
* Restart of region server will have new startcode in server name
|
|
||||||
*
|
|
||||||
* @param serverName Server name to be restarted
|
|
||||||
*/
|
|
||||||
public void restartRegionServer(ServerName serverName) throws IOException {
|
|
||||||
List<ServerName> onlineServers = serverManager.getOnlineServersList();
|
|
||||||
long startCode = -1;
|
|
||||||
for (ServerName s : onlineServers) {
|
|
||||||
if (s.getAddress().equals(serverName.getAddress())) {
|
|
||||||
startCode = s.getStartcode() + 1;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (startCode == -1) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
ServerName sn = ServerName.valueOf(serverName.getAddress().toString(), startCode);
|
|
||||||
serverManager.regionServerReport(sn, ServerLoad.EMPTY_SERVERLOAD);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void stop(String why) {
|
public void stop(String why) {
|
||||||
stopProcedureExecutor();
|
stopProcedureExecutor();
|
||||||
|
|
|
@ -444,34 +444,6 @@ public class TestAssignmentManager {
|
||||||
assertEquals(unassignFailedCount, unassignProcMetrics.getFailedCounter().getCount());
|
assertEquals(unassignFailedCount, unassignProcMetrics.getFailedCounter().getCount());
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* It is possible that when AM send assign meta request to a RS successfully,
|
|
||||||
* but RS can not send back any response, which cause master startup hangs forever
|
|
||||||
*/
|
|
||||||
@Test
|
|
||||||
public void testAssignMetaAndCrashBeforeResponse() throws Exception {
|
|
||||||
tearDown();
|
|
||||||
// See setUp(), start HBase until set up meta
|
|
||||||
UTIL = new HBaseTestingUtility();
|
|
||||||
this.executor = Executors.newSingleThreadScheduledExecutor();
|
|
||||||
setupConfiguration(UTIL.getConfiguration());
|
|
||||||
master = new MockMasterServices(UTIL.getConfiguration(), this.regionsToRegionServers);
|
|
||||||
rsDispatcher = new MockRSProcedureDispatcher(master);
|
|
||||||
master.start(NSERVERS, rsDispatcher);
|
|
||||||
am = master.getAssignmentManager();
|
|
||||||
|
|
||||||
// Assign meta
|
|
||||||
master.setServerCrashProcessingEnabled(false);
|
|
||||||
rsDispatcher.setMockRsExecutor(new HangThenRSRestartExecutor());
|
|
||||||
am.assign(RegionInfoBuilder.FIRST_META_REGIONINFO);
|
|
||||||
assertEquals(true, am.isMetaInitialized());
|
|
||||||
|
|
||||||
// set it back as default, see setUpMeta()
|
|
||||||
master.setServerCrashProcessingEnabled(true);
|
|
||||||
am.wakeMetaLoadedEvent();
|
|
||||||
am.setFailoverCleanupDone(true);
|
|
||||||
}
|
|
||||||
|
|
||||||
private Future<byte[]> submitProcedure(final Procedure proc) {
|
private Future<byte[]> submitProcedure(final Procedure proc) {
|
||||||
return ProcedureSyncWait.submitProcedure(master.getMasterProcedureExecutor(), proc);
|
return ProcedureSyncWait.submitProcedure(master.getMasterProcedureExecutor(), proc);
|
||||||
}
|
}
|
||||||
|
@ -555,14 +527,6 @@ public class TestAssignmentManager {
|
||||||
this.am.submitServerCrash(serverName, false/*No WALs here*/);
|
this.am.submitServerCrash(serverName, false/*No WALs here*/);
|
||||||
}
|
}
|
||||||
|
|
||||||
private void doRestart(final ServerName serverName) {
|
|
||||||
try {
|
|
||||||
this.master.restartRegionServer(serverName);
|
|
||||||
} catch (IOException e) {
|
|
||||||
LOG.warn("Can not restart RS with new startcode");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private class NoopRsExecutor implements MockRSExecutor {
|
private class NoopRsExecutor implements MockRSExecutor {
|
||||||
public ExecuteProceduresResponse sendRequest(ServerName server,
|
public ExecuteProceduresResponse sendRequest(ServerName server,
|
||||||
ExecuteProceduresRequest request) throws IOException {
|
ExecuteProceduresRequest request) throws IOException {
|
||||||
|
@ -714,37 +678,6 @@ public class TestAssignmentManager {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Takes open request and then returns nothing so acts like a RS that went zombie.
|
|
||||||
* No response (so proc is stuck/suspended on the Master and won't wake up.).
|
|
||||||
* Different with HangThenRSCrashExecutor, HangThenRSCrashExecutor will create
|
|
||||||
* ServerCrashProcedure to handle the server crash. However, this HangThenRSRestartExecutor
|
|
||||||
* will restart RS directly, situation for RS crashed when SCP is not enabled.
|
|
||||||
*/
|
|
||||||
private class HangThenRSRestartExecutor extends GoodRsExecutor {
|
|
||||||
private int invocations;
|
|
||||||
|
|
||||||
@Override
|
|
||||||
protected RegionOpeningState execOpenRegion(final ServerName server, RegionOpenInfo openReq)
|
|
||||||
throws IOException {
|
|
||||||
if (this.invocations++ > 0) {
|
|
||||||
// Return w/o problem the second time through here.
|
|
||||||
return super.execOpenRegion(server, openReq);
|
|
||||||
}
|
|
||||||
// The procedure on master will just hang forever because nothing comes back
|
|
||||||
// from the RS in this case.
|
|
||||||
LOG.info("Return null response from serverName=" + server + "; means STUCK...TODO timeout");
|
|
||||||
executor.schedule(new Runnable() {
|
|
||||||
@Override
|
|
||||||
public void run() {
|
|
||||||
LOG.info("Restarting RS of " + server);
|
|
||||||
doRestart(server);
|
|
||||||
}
|
|
||||||
}, 1, TimeUnit.SECONDS);
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private class HangOnCloseThenRSCrashExecutor extends GoodRsExecutor {
|
private class HangOnCloseThenRSCrashExecutor extends GoodRsExecutor {
|
||||||
public static final int TYPES_OF_FAILURE = 6;
|
public static final int TYPES_OF_FAILURE = 6;
|
||||||
private int invocations;
|
private int invocations;
|
||||||
|
|
Loading…
Reference in New Issue