HBASE-21095 The timeout retry logic for several procedures are broken after master restarts
This commit is contained in:
parent
780670ede1
commit
aac1a70147
|
@ -1414,12 +1414,25 @@ public class AssignmentManager implements ServerListener {
|
|||
// Region Status update
|
||||
// Should only be called in TransitRegionStateProcedure
|
||||
// ============================================================================================
|
||||
private void transitStateAndUpdate(RegionStateNode regionNode, RegionState.State newState,
|
||||
RegionState.State... expectedStates) throws IOException {
|
||||
RegionState.State state = regionNode.getState();
|
||||
regionNode.transitionState(newState, expectedStates);
|
||||
boolean succ = false;
|
||||
try {
|
||||
regionStateStore.updateRegionLocation(regionNode);
|
||||
succ = true;
|
||||
} finally {
|
||||
if (!succ) {
|
||||
// revert
|
||||
regionNode.setState(state);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// should be called within the synchronized block of RegionStateNode
|
||||
void regionOpening(RegionStateNode regionNode) throws IOException {
|
||||
regionNode.transitionState(State.OPENING, RegionStates.STATES_EXPECTED_ON_OPEN);
|
||||
regionStateStore.updateRegionLocation(regionNode);
|
||||
|
||||
transitStateAndUpdate(regionNode, State.OPENING, RegionStates.STATES_EXPECTED_ON_OPEN);
|
||||
regionStates.addRegionToServer(regionNode);
|
||||
// update the operation count metrics
|
||||
metrics.incrementOperationCounter();
|
||||
|
@ -1429,23 +1442,33 @@ public class AssignmentManager implements ServerListener {
|
|||
// The parameter 'giveUp' means whether we will try to open the region again, if it is true, then
|
||||
// we will persist the FAILED_OPEN state into hbase:meta.
|
||||
void regionFailedOpen(RegionStateNode regionNode, boolean giveUp) throws IOException {
|
||||
if (regionNode.getRegionLocation() != null) {
|
||||
regionStates.removeRegionFromServer(regionNode.getRegionLocation(), regionNode);
|
||||
}
|
||||
RegionState.State state = regionNode.getState();
|
||||
ServerName regionLocation = regionNode.getRegionLocation();
|
||||
if (giveUp) {
|
||||
regionNode.setState(State.FAILED_OPEN);
|
||||
regionNode.setRegionLocation(null);
|
||||
regionStateStore.updateRegionLocation(regionNode);
|
||||
boolean succ = false;
|
||||
try {
|
||||
regionStateStore.updateRegionLocation(regionNode);
|
||||
succ = true;
|
||||
} finally {
|
||||
if (!succ) {
|
||||
// revert
|
||||
regionNode.setState(state);
|
||||
regionNode.setRegionLocation(regionLocation);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (regionLocation != null) {
|
||||
regionStates.removeRegionFromServer(regionLocation, regionNode);
|
||||
}
|
||||
}
|
||||
|
||||
// should be called within the synchronized block of RegionStateNode
|
||||
void regionOpened(RegionStateNode regionNode) throws IOException {
|
||||
regionNode.transitionState(State.OPEN, RegionStates.STATES_EXPECTED_ON_OPEN);
|
||||
// TODO: OPENING Updates hbase:meta too... we need to do both here and there?
|
||||
// That is a lot of hbase:meta writing.
|
||||
regionStateStore.updateRegionLocation(regionNode);
|
||||
|
||||
transitStateAndUpdate(regionNode, State.OPEN, RegionStates.STATES_EXPECTED_ON_OPEN);
|
||||
RegionInfo hri = regionNode.getRegionInfo();
|
||||
if (isMetaRegion(hri)) {
|
||||
// Usually we'd set a table ENABLED at this stage but hbase:meta is ALWAYs enabled, it
|
||||
|
@ -1460,7 +1483,7 @@ public class AssignmentManager implements ServerListener {
|
|||
|
||||
// should be called within the synchronized block of RegionStateNode
|
||||
void regionClosing(RegionStateNode regionNode) throws IOException {
|
||||
regionNode.transitionState(State.CLOSING, RegionStates.STATES_EXPECTED_ON_CLOSE);
|
||||
transitStateAndUpdate(regionNode, State.CLOSING, RegionStates.STATES_EXPECTED_ON_CLOSE);
|
||||
regionStateStore.updateRegionLocation(regionNode);
|
||||
|
||||
RegionInfo hri = regionNode.getRegionInfo();
|
||||
|
@ -1477,16 +1500,26 @@ public class AssignmentManager implements ServerListener {
|
|||
// The parameter 'normally' means whether we are closed cleanly, if it is true, then it means that
|
||||
// we are closed due to a RS crash.
|
||||
void regionClosed(RegionStateNode regionNode, boolean normally) throws IOException {
|
||||
RegionState.State state = regionNode.getState();
|
||||
ServerName regionLocation = regionNode.getRegionLocation();
|
||||
regionNode.transitionState(normally ? State.CLOSED : State.ABNORMALLY_CLOSED,
|
||||
RegionStates.STATES_EXPECTED_ON_CLOSE);
|
||||
ServerName loc = regionNode.getRegionLocation();
|
||||
if (loc != null) {
|
||||
// could be a retry so add a check here to avoid set the lastHost to null.
|
||||
regionNode.setLastHost(loc);
|
||||
regionNode.setRegionLocation(null);
|
||||
regionStates.removeRegionFromServer(loc, regionNode);
|
||||
boolean succ = false;
|
||||
try {
|
||||
regionStateStore.updateRegionLocation(regionNode);
|
||||
succ = true;
|
||||
} finally {
|
||||
if (!succ) {
|
||||
// revert
|
||||
regionNode.setState(state);
|
||||
regionNode.setRegionLocation(regionLocation);
|
||||
}
|
||||
}
|
||||
if (regionLocation != null) {
|
||||
regionNode.setLastHost(regionLocation);
|
||||
regionNode.setRegionLocation(null);
|
||||
regionStates.removeRegionFromServer(regionLocation, regionNode);
|
||||
}
|
||||
regionStateStore.updateRegionLocation(regionNode);
|
||||
}
|
||||
|
||||
public void markRegionAsSplit(final RegionInfo parent, final ServerName serverName,
|
||||
|
|
|
@ -326,13 +326,9 @@ public class TransitRegionStateProcedure
|
|||
"Failed transition, suspend {}secs {}; {}; waiting on rectified condition fixed " +
|
||||
"by other Procedure or operator intervention",
|
||||
backoff / 1000, this, regionNode.toShortString(), e);
|
||||
regionNode.getProcedureEvent().suspend();
|
||||
if (regionNode.getProcedureEvent().suspendIfNotReady(this)) {
|
||||
setTimeout(Math.toIntExact(backoff));
|
||||
setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT);
|
||||
throw new ProcedureSuspendedException();
|
||||
}
|
||||
return Flow.HAS_MORE_STATE;
|
||||
setTimeout(Math.toIntExact(backoff));
|
||||
setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT);
|
||||
throw new ProcedureSuspendedException();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -342,7 +338,7 @@ public class TransitRegionStateProcedure
|
|||
@Override
|
||||
protected synchronized boolean setTimeoutFailure(MasterProcedureEnv env) {
|
||||
setState(ProcedureProtos.ProcedureState.RUNNABLE);
|
||||
getRegionStateNode(env).getProcedureEvent().wake(env.getProcedureScheduler());
|
||||
env.getProcedureScheduler().addFront(this);
|
||||
return false; // 'false' means that this procedure handled the timeout
|
||||
}
|
||||
|
||||
|
|
|
@ -114,6 +114,17 @@ public class ServerCrashProcedure
|
|||
notifiedDeadServer = true;
|
||||
}
|
||||
|
||||
switch (state) {
|
||||
case SERVER_CRASH_START:
|
||||
case SERVER_CRASH_SPLIT_META_LOGS:
|
||||
case SERVER_CRASH_ASSIGN_META:
|
||||
break;
|
||||
default:
|
||||
// If hbase:meta is not assigned, yield.
|
||||
if (env.getAssignmentManager().waitMetaLoaded(this)) {
|
||||
throw new ProcedureSuspendedException();
|
||||
}
|
||||
}
|
||||
try {
|
||||
switch (state) {
|
||||
case SERVER_CRASH_START:
|
||||
|
@ -134,10 +145,6 @@ public class ServerCrashProcedure
|
|||
setNextState(ServerCrashState.SERVER_CRASH_GET_REGIONS);
|
||||
break;
|
||||
case SERVER_CRASH_GET_REGIONS:
|
||||
// If hbase:meta is not assigned, yield.
|
||||
if (env.getAssignmentManager().waitMetaLoaded(this)) {
|
||||
throw new ProcedureSuspendedException();
|
||||
}
|
||||
this.regionsOnCrashedServer =
|
||||
services.getAssignmentManager().getRegionStates().getServerRegionInfoSet(serverName);
|
||||
// Where to go next? Depends on whether we should split logs at all or
|
||||
|
|
|
@ -27,6 +27,7 @@ import org.apache.hadoop.hbase.TableName;
|
|||
import org.apache.hadoop.hbase.client.Put;
|
||||
import org.apache.hadoop.hbase.client.RegionInfo;
|
||||
import org.apache.hadoop.hbase.client.Table;
|
||||
import org.apache.hadoop.hbase.master.ServerManager;
|
||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
|
||||
import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
|
||||
import org.apache.hadoop.hbase.master.procedure.ServerProcedureInterface;
|
||||
|
@ -149,6 +150,7 @@ public class TestCloseRegionWhileRSCrash {
|
|||
|
||||
@BeforeClass
|
||||
public static void setUp() throws Exception {
|
||||
UTIL.getConfiguration().setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 1);
|
||||
UTIL.startMiniCluster(3);
|
||||
UTIL.createTable(TABLE_NAME, CF);
|
||||
UTIL.getAdmin().balancerSwitch(false, true);
|
||||
|
@ -174,7 +176,7 @@ public class TestCloseRegionWhileRSCrash {
|
|||
HRegionServer dstRs = UTIL.getOtherRegionServer(srcRs);
|
||||
ProcedureExecutor<MasterProcedureEnv> procExec =
|
||||
UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor();
|
||||
procExec.submitProcedure(new DummyServerProcedure(srcRs.getServerName()));
|
||||
long dummyProcId = procExec.submitProcedure(new DummyServerProcedure(srcRs.getServerName()));
|
||||
ARRIVE.await();
|
||||
UTIL.getMiniHBaseCluster().killRegionServer(srcRs.getServerName());
|
||||
UTIL.waitFor(30000,
|
||||
|
@ -206,7 +208,14 @@ public class TestCloseRegionWhileRSCrash {
|
|||
}
|
||||
Thread.sleep(1000);
|
||||
}
|
||||
// let's close the connection to make sure that the SCP can not update meta successfully
|
||||
UTIL.getMiniHBaseCluster().getMaster().getConnection().close();
|
||||
RESUME.countDown();
|
||||
UTIL.waitFor(30000, () -> procExec.isFinished(dummyProcId));
|
||||
Thread.sleep(2000);
|
||||
// here we restart
|
||||
UTIL.getMiniHBaseCluster().stopMaster(0).join();
|
||||
UTIL.getMiniHBaseCluster().startMaster();
|
||||
t.join();
|
||||
// Make sure that the region is online, it may not on the original target server, as we will set
|
||||
// forceNewPlan to true if there is a server crash
|
||||
|
|
Loading…
Reference in New Issue