HBASE-19287 Revert "Revert "master hangs forever if RecoverMeta

send assign meta region request to target server fail""

This is a revert of a revert; i.e., a reapplication with the
log message fixed up and some added javadoc.

This reverts commit 9ef115163b.

Signed-off-by: Yi Liang <yliang@us.ibm.com>
This commit is contained in:
Michael Stack 2017-12-13 23:07:04 -08:00
parent 211f231b4a
commit cb4bbea0f1
5 changed files with 134 additions and 2 deletions

View File

@ -572,6 +572,10 @@ public class ServerManager {
if (!master.isServerCrashProcessingEnabled()) { if (!master.isServerCrashProcessingEnabled()) {
LOG.info("Master doesn't enable ServerShutdownHandler during initialization, " LOG.info("Master doesn't enable ServerShutdownHandler during initialization, "
+ "delay expiring server " + serverName); + "delay expiring server " + serverName);
// Even though we delay expiring this server, we still need to handle Meta's RIT
// that are against the crashed server; since when we do RecoverMetaProcedure,
// the SCP is not enabled yet and Meta's RIT may be suspended forever. See HBASE-19287
master.getAssignmentManager().handleMetaRITOnCrashedServer(serverName);
this.queuedDeadServers.add(serverName); this.queuedDeadServers.add(serverName);
return; return;
} }

View File

@ -48,6 +48,7 @@ import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.YouAreDeadException; import org.apache.hadoop.hbase.YouAreDeadException;
import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.RegionInfoBuilder; import org.apache.hadoop.hbase.client.RegionInfoBuilder;
import org.apache.hadoop.hbase.client.RegionReplicaUtil;
import org.apache.hadoop.hbase.client.TableState; import org.apache.hadoop.hbase.client.TableState;
import org.apache.hadoop.hbase.exceptions.UnexpectedStateException; import org.apache.hadoop.hbase.exceptions.UnexpectedStateException;
import org.apache.hadoop.hbase.favored.FavoredNodesManager; import org.apache.hadoop.hbase.favored.FavoredNodesManager;
@ -70,6 +71,7 @@ import org.apache.hadoop.hbase.master.normalizer.RegionNormalizer;
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
import org.apache.hadoop.hbase.master.procedure.MasterProcedureScheduler; import org.apache.hadoop.hbase.master.procedure.MasterProcedureScheduler;
import org.apache.hadoop.hbase.master.procedure.ProcedureSyncWait; import org.apache.hadoop.hbase.master.procedure.ProcedureSyncWait;
import org.apache.hadoop.hbase.master.procedure.ServerCrashException;
import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure; import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
import org.apache.hadoop.hbase.procedure2.Procedure; import org.apache.hadoop.hbase.procedure2.Procedure;
import org.apache.hadoop.hbase.procedure2.ProcedureEvent; import org.apache.hadoop.hbase.procedure2.ProcedureEvent;
@ -78,6 +80,7 @@ import org.apache.hadoop.hbase.procedure2.ProcedureInMemoryChore;
import org.apache.hadoop.hbase.procedure2.util.StringUtils; import org.apache.hadoop.hbase.procedure2.util.StringUtils;
import org.apache.hadoop.hbase.shaded.com.google.common.annotations.VisibleForTesting; import org.apache.hadoop.hbase.shaded.com.google.common.annotations.VisibleForTesting;
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionTransitionState;
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition; import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition;
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode; import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionRequest; import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionRequest;
@ -1322,7 +1325,7 @@ public class AssignmentManager implements ServerListener {
} }
public void submitServerCrash(final ServerName serverName, final boolean shouldSplitWal) { public void submitServerCrash(final ServerName serverName, final boolean shouldSplitWal) {
boolean carryingMeta = master.getAssignmentManager().isCarryingMeta(serverName); boolean carryingMeta = isCarryingMeta(serverName);
ProcedureExecutor<MasterProcedureEnv> procExec = this.master.getMasterProcedureExecutor(); ProcedureExecutor<MasterProcedureEnv> procExec = this.master.getMasterProcedureExecutor();
procExec.submitProcedure(new ServerCrashProcedure(procExec.getEnvironment(), serverName, procExec.submitProcedure(new ServerCrashProcedure(procExec.getEnvironment(), serverName,
shouldSplitWal, carryingMeta)); shouldSplitWal, carryingMeta));
@ -1853,4 +1856,32 @@ public class AssignmentManager implements ServerListener {
}*/ }*/
master.getServerManager().expireServer(serverNode.getServerName()); master.getServerManager().expireServer(serverNode.getServerName());
} }
/**
 * Handle RIT of meta region against crashed server.
 * Only used when ServerCrashProcedure is not enabled.
 * See handleRIT in ServerCrashProcedure for similar function.
 *
 * <p>If the default-replica meta region has a transition procedure sitting in the
 * REGION_TRANSITION_DISPATCH step, its remote call is failed with a
 * {@link ServerCrashException} naming the crashed server. Nothing is done when the
 * meta region has no recorded state, or when it is recorded against a different server.
 *
 * @param serverName Server that has already crashed
 */
public void handleMetaRITOnCrashedServer(ServerName serverName) {
  RegionInfo hri = RegionReplicaUtil
      .getRegionInfoForReplica(RegionInfoBuilder.FIRST_META_REGIONINFO,
          RegionInfo.DEFAULT_REPLICA_ID);
  RegionState regionStateNode = getRegionStates().getRegionState(hri);
  if (regionStateNode == null) {
    // No state is tracked for meta yet; there is nothing to fail, and dereferencing
    // the missing state would throw a NullPointerException in the server-expiry path.
    LOG.warn("RegionStateNode is null for " + hri);
    return;
  }
  ServerName rsnServerName = regionStateNode.getServerName();
  if (rsnServerName != null && !rsnServerName.equals(serverName)) {
    // Meta is recorded against some other server; the crashed one is not involved.
    return;
  }
  if (rsnServerName == null) {
    // NOTE(review): the location is unknown, so we cannot prove meta is on another server.
    // Fall through so that a dispatched meta procedure is still failed instead of
    // staying suspended forever -- confirm this is the desired policy.
    LOG.warn("Empty ServerName in RegionStateNode for " + hri
        + "; proceeding anyway in case a meta assign is stuck against " + serverName);
  } else {
    // meta has been assigned to crashed server.
    LOG.info("Meta assigned to crashed " + serverName + "; reassigning...");
  }
  // Handle failure and wake event
  RegionTransitionProcedure rtp = getRegionStates().getRegionTransitionProcedure(hri);
  // Do not need to consider for REGION_TRANSITION_QUEUE step
  if (rtp != null && rtp.isMeta() &&
      rtp.getTransitionState() == RegionTransitionState.REGION_TRANSITION_DISPATCH) {
    LOG.debug("Failing " + rtp.toString());
    rtp.remoteCallFailed(master.getMasterProcedureExecutor().getEnvironment(), serverName,
        new ServerCrashException(rtp.getProcId(), serverName));
  }
}
} }

View File

@ -212,9 +212,14 @@ public class MockNoopMasterServices implements MasterServices, Server {
return null; return null;
} }
// Test-controlled switch for server crash processing; starts out enabled.
private boolean serverCrashProcessingEnabled = true;

/**
 * Lets a test turn server crash processing on or off for this mock master.
 *
 * @param b the new value of the server-crash-processing flag
 */
public void setServerCrashProcessingEnabled(boolean b) {
  this.serverCrashProcessingEnabled = b;
}
@Override @Override
public boolean isServerCrashProcessingEnabled() { public boolean isServerCrashProcessingEnabled() {
return true; return serverCrashProcessingEnabled;
} }
@Override @Override

View File

@ -21,6 +21,7 @@ import static org.mockito.ArgumentMatchers.any;
import java.io.IOException; import java.io.IOException;
import java.util.HashSet; import java.util.HashSet;
import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.NavigableMap; import java.util.NavigableMap;
import java.util.SortedSet; import java.util.SortedSet;
@ -173,6 +174,30 @@ public class MockMasterServices extends MockNoopMasterServices {
this.procedureExecutor.getEnvironment().setEventReady(initialized, true); this.procedureExecutor.getEnvironment().setEventReady(initialized, true);
} }
/**
 * Simulates the restart of a region server by reporting it again under a new startcode.
 * Call this only after running MockMasterServices#start(); the mock region servers are
 * told apart by their port number (see the ServerName built in MockMasterServices#start()).
 * The "restarted" server keeps the same address and gets the startcode of the matching
 * online server plus one. If no online server has the given address, nothing happens.
 *
 * @param serverName Server name to be restarted
 * @throws IOException if reporting the restarted server to the server manager fails
 */
public void restartRegionServer(ServerName serverName) throws IOException {
  List<ServerName> online = serverManager.getOnlineServersList();
  // -1 means "no online server with this address was found".
  long nextStartCode = -1;
  for (ServerName candidate : online) {
    if (!candidate.getAddress().equals(serverName.getAddress())) {
      continue;
    }
    nextStartCode = candidate.getStartcode() + 1;
    break;
  }
  if (nextStartCode != -1) {
    ServerName restarted =
        ServerName.valueOf(serverName.getAddress().toString(), nextStartCode);
    serverManager.regionServerReport(restarted, ServerLoad.EMPTY_SERVERLOAD);
  }
}
@Override @Override
public void stop(String why) { public void stop(String why) {
stopProcedureExecutor(); stopProcedureExecutor();

View File

@ -444,6 +444,34 @@ public class TestAssignmentManager {
assertEquals(unassignFailedCount, unassignProcMetrics.getFailedCounter().getCount()); assertEquals(unassignFailedCount, unassignProcMetrics.getFailedCounter().getCount());
} }
/**
* Covers the case where the AM successfully sends the assign-meta request to a RS,
* but the RS never sends back a response, which used to make master startup hang forever.
* The fixture is rebuilt by hand up to (but not including) meta setup, server crash
* processing is switched off, and meta is assigned through HangThenRSRestartExecutor,
* which returns nothing for the first open request and then restarts the RS.
*/
@Test
public void testAssignMetaAndCrashBeforeResponse() throws Exception {
// Discard the fixture built by setUp(); this test needs meta to be unassigned.
tearDown();
// See setUp(), start HBase until set up meta
UTIL = new HBaseTestingUtility();
this.executor = Executors.newSingleThreadScheduledExecutor();
setupConfiguration(UTIL.getConfiguration());
master = new MockMasterServices(UTIL.getConfiguration(), this.regionsToRegionServers);
rsDispatcher = new MockRSProcedureDispatcher(master);
master.start(NSERVERS, rsDispatcher);
am = master.getAssignmentManager();
// Assign meta
// Crash processing must be off before the assign so the scenario is
// "RS goes away while ServerCrashProcedure is not enabled".
master.setServerCrashProcessingEnabled(false);
rsDispatcher.setMockRsExecutor(new HangThenRSRestartExecutor());
am.assign(RegionInfoBuilder.FIRST_META_REGIONINFO);
assertEquals(true, am.isMetaInitialized());
// set it back as default, see setUpMeta()
master.setServerCrashProcessingEnabled(true);
am.wakeMetaLoadedEvent();
am.setFailoverCleanupDone(true);
}
private Future<byte[]> submitProcedure(final Procedure proc) { private Future<byte[]> submitProcedure(final Procedure proc) {
return ProcedureSyncWait.submitProcedure(master.getMasterProcedureExecutor(), proc); return ProcedureSyncWait.submitProcedure(master.getMasterProcedureExecutor(), proc);
} }
@ -527,6 +555,14 @@ public class TestAssignmentManager {
this.am.submitServerCrash(serverName, false/*No WALs here*/); this.am.submitServerCrash(serverName, false/*No WALs here*/);
} }
/**
 * Asks the mock master to restart the given region server under a new startcode.
 * A failure is logged, together with its cause, and otherwise ignored so that the
 * caller (a scheduled task in a mock RS executor) is not aborted by it.
 *
 * @param serverName the region server to restart
 */
private void doRestart(final ServerName serverName) {
  try {
    this.master.restartRegionServer(serverName);
  } catch (IOException e) {
    // Pass the exception along so the reason for the failed restart is not lost.
    LOG.warn("Can not restart RS with new startcode", e);
  }
}
private class NoopRsExecutor implements MockRSExecutor { private class NoopRsExecutor implements MockRSExecutor {
public ExecuteProceduresResponse sendRequest(ServerName server, public ExecuteProceduresResponse sendRequest(ServerName server,
ExecuteProceduresRequest request) throws IOException { ExecuteProceduresRequest request) throws IOException {
@ -678,6 +714,37 @@ public class TestAssignmentManager {
} }
} }
/**
 * Mock RS executor that behaves like a RS that went zombie on its first open request:
 * the request is accepted but nothing is returned, so the procedure on the Master is
 * left stuck/suspended. One second later the RS is restarted (re-reported with a new
 * startcode) instead of being crashed. Every later open request is handled normally.
 * This differs from HangThenRSCrashExecutor, which creates a ServerCrashProcedure to
 * handle the server crash; here the RS is restarted directly, which is the situation
 * of a RS crashing while SCP is not enabled.
 */
private class HangThenRSRestartExecutor extends GoodRsExecutor {
  private int invocations;

  @Override
  protected RegionOpeningState execOpenRegion(final ServerName server, RegionOpenInfo openReq)
      throws IOException {
    final boolean seenBefore = this.invocations++ > 0;
    if (!seenBefore) {
      // First request: give no answer, so the procedure on master just hangs because
      // nothing comes back from the RS, then bring the RS back a second later.
      LOG.info("Return null response from serverName=" + server + "; means STUCK...TODO timeout");
      final Runnable restartTask = new Runnable() {
        @Override
        public void run() {
          LOG.info("Restarting RS of " + server);
          doRestart(server);
        }
      };
      executor.schedule(restartTask, 1, TimeUnit.SECONDS);
      return null;
    }
    // Return w/o problem the second time through here.
    return super.execOpenRegion(server, openReq);
  }
}
private class HangOnCloseThenRSCrashExecutor extends GoodRsExecutor { private class HangOnCloseThenRSCrashExecutor extends GoodRsExecutor {
public static final int TYPES_OF_FAILURE = 6; public static final int TYPES_OF_FAILURE = 6;
private int invocations; private int invocations;