Merge pull request #133 from infraio/retry-backoff
HBASE-22193 Add backoff when region failed open too many times
This commit is contained in:
commit
249ac58d4f
|
@ -131,6 +131,10 @@ public class AssignmentManager {
|
||||||
"hbase.assignment.maximum.attempts";
|
"hbase.assignment.maximum.attempts";
|
||||||
private static final int DEFAULT_ASSIGN_MAX_ATTEMPTS = Integer.MAX_VALUE;
|
private static final int DEFAULT_ASSIGN_MAX_ATTEMPTS = Integer.MAX_VALUE;
|
||||||
|
|
||||||
|
public static final String ASSIGN_RETRY_IMMEDIATELY_MAX_ATTEMPTS =
|
||||||
|
"hbase.assignment.retry.immediately.maximum.attempts";
|
||||||
|
private static final int DEFAULT_ASSIGN_RETRY_IMMEDIATELY_MAX_ATTEMPTS = 3;
|
||||||
|
|
||||||
/** Region in Transition metrics threshold time */
|
/** Region in Transition metrics threshold time */
|
||||||
public static final String METRICS_RIT_STUCK_WARNING_THRESHOLD =
|
public static final String METRICS_RIT_STUCK_WARNING_THRESHOLD =
|
||||||
"hbase.metrics.rit.stuck.warning.threshold";
|
"hbase.metrics.rit.stuck.warning.threshold";
|
||||||
|
@ -151,6 +155,7 @@ public class AssignmentManager {
|
||||||
private final int assignDispatchWaitQueueMaxSize;
|
private final int assignDispatchWaitQueueMaxSize;
|
||||||
private final int assignDispatchWaitMillis;
|
private final int assignDispatchWaitMillis;
|
||||||
private final int assignMaxAttempts;
|
private final int assignMaxAttempts;
|
||||||
|
private final int assignRetryImmediatelyMaxAttempts;
|
||||||
|
|
||||||
private final Object checkIfShouldMoveSystemRegionLock = new Object();
|
private final Object checkIfShouldMoveSystemRegionLock = new Object();
|
||||||
|
|
||||||
|
@ -179,6 +184,8 @@ public class AssignmentManager {
|
||||||
|
|
||||||
this.assignMaxAttempts = Math.max(1, conf.getInt(ASSIGN_MAX_ATTEMPTS,
|
this.assignMaxAttempts = Math.max(1, conf.getInt(ASSIGN_MAX_ATTEMPTS,
|
||||||
DEFAULT_ASSIGN_MAX_ATTEMPTS));
|
DEFAULT_ASSIGN_MAX_ATTEMPTS));
|
||||||
|
this.assignRetryImmediatelyMaxAttempts = conf.getInt(ASSIGN_RETRY_IMMEDIATELY_MAX_ATTEMPTS,
|
||||||
|
DEFAULT_ASSIGN_RETRY_IMMEDIATELY_MAX_ATTEMPTS);
|
||||||
|
|
||||||
int ritChoreInterval = conf.getInt(RIT_CHORE_INTERVAL_MSEC_CONF_KEY,
|
int ritChoreInterval = conf.getInt(RIT_CHORE_INTERVAL_MSEC_CONF_KEY,
|
||||||
DEFAULT_RIT_CHORE_INTERVAL_MSEC);
|
DEFAULT_RIT_CHORE_INTERVAL_MSEC);
|
||||||
|
@ -308,6 +315,10 @@ public class AssignmentManager {
|
||||||
return assignMaxAttempts;
|
return assignMaxAttempts;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int getAssignRetryImmediatelyMaxAttempts() {
|
||||||
|
return assignRetryImmediatelyMaxAttempts;
|
||||||
|
}
|
||||||
|
|
||||||
public RegionStates getRegionStates() {
|
public RegionStates getRegionStates() {
|
||||||
return regionStates;
|
return regionStates;
|
||||||
}
|
}
|
||||||
|
|
|
@ -226,20 +226,32 @@ public class TransitRegionStateProcedure
|
||||||
return Flow.HAS_MORE_STATE;
|
return Flow.HAS_MORE_STATE;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (incrementAndCheckMaxAttempts(env, regionNode)) {
|
int retries = env.getAssignmentManager().getRegionStates().addToFailedOpen(regionNode)
|
||||||
|
.incrementAndGetRetries();
|
||||||
|
int maxAttempts = env.getAssignmentManager().getAssignMaxAttempts();
|
||||||
|
LOG.info("Retry={} of max={}; {}; {}", retries, maxAttempts, this, regionNode.toShortString());
|
||||||
|
|
||||||
|
if (retries >= maxAttempts) {
|
||||||
env.getAssignmentManager().regionFailedOpen(regionNode, true);
|
env.getAssignmentManager().regionFailedOpen(regionNode, true);
|
||||||
setFailure(getClass().getSimpleName(), new RetriesExhaustedException(
|
setFailure(getClass().getSimpleName(), new RetriesExhaustedException(
|
||||||
"Max attempts " + env.getAssignmentManager().getAssignMaxAttempts() + " exceeded"));
|
"Max attempts " + env.getAssignmentManager().getAssignMaxAttempts() + " exceeded"));
|
||||||
regionNode.unsetProcedure(this);
|
regionNode.unsetProcedure(this);
|
||||||
return Flow.NO_MORE_STATE;
|
return Flow.NO_MORE_STATE;
|
||||||
}
|
}
|
||||||
|
|
||||||
env.getAssignmentManager().regionFailedOpen(regionNode, false);
|
env.getAssignmentManager().regionFailedOpen(regionNode, false);
|
||||||
// we failed to assign the region, force a new plan
|
// we failed to assign the region, force a new plan
|
||||||
forceNewPlan = true;
|
forceNewPlan = true;
|
||||||
regionNode.setRegionLocation(null);
|
regionNode.setRegionLocation(null);
|
||||||
setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE);
|
setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE);
|
||||||
// Here we do not throw an exception because we want the region to be online ASAP
|
|
||||||
return Flow.HAS_MORE_STATE;
|
if (retries > env.getAssignmentManager().getAssignRetryImmediatelyMaxAttempts()) {
|
||||||
|
// Throw an exception to back off and retry when the region has failed to open too many times
|
||||||
|
throw new HBaseIOException("Failed to open region");
|
||||||
|
} else {
|
||||||
|
// Here we do not throw an exception because we want the region to be online ASAP
|
||||||
|
return Flow.HAS_MORE_STATE;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void closeRegion(MasterProcedureEnv env, RegionStateNode regionNode) throws IOException {
|
private void closeRegion(MasterProcedureEnv env, RegionStateNode regionNode) throws IOException {
|
||||||
|
@ -400,14 +412,6 @@ public class TransitRegionStateProcedure
|
||||||
this.remoteProc = null;
|
this.remoteProc = null;
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean incrementAndCheckMaxAttempts(MasterProcedureEnv env, RegionStateNode regionNode) {
|
|
||||||
int retries = env.getAssignmentManager().getRegionStates().addToFailedOpen(regionNode)
|
|
||||||
.incrementAndGetRetries();
|
|
||||||
int max = env.getAssignmentManager().getAssignMaxAttempts();
|
|
||||||
LOG.info("Retry={} of max={}; {}; {}", retries, max, this, regionNode.toShortString());
|
|
||||||
return retries >= max;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected void rollbackState(MasterProcedureEnv env, RegionStateTransitionState state)
|
protected void rollbackState(MasterProcedureEnv env, RegionStateTransitionState state)
|
||||||
throws IOException, InterruptedException {
|
throws IOException, InterruptedException {
|
||||||
|
|
Loading…
Reference in New Issue