HBASE-12464 meta table region assignment stuck in the FAILED_OPEN state due to region server not fully ready to serve (Stephen Jiang)

This commit is contained in:
tedyu 2014-11-20 09:58:42 -08:00
parent e4a68606f5
commit 7eefd0cbed
1 changed files with 48 additions and 12 deletions

View File

@ -129,7 +129,7 @@ public class AssignmentManager {
/**
* The sleep time for which the assignment will wait before retrying in case of hbase:meta assignment
* failure due to lack of availability of region plan
* failure due to lack of availability of region plan or bad region plan
*/
private final long sleepTimeBeforeRetryingMetaAssignment;
@ -976,6 +976,7 @@ public class AssignmentManager {
+ ", the server is stopped/aborted");
return;
}
if (plan == null) { // Get a server for the region at first
try {
plan = getRegionPlan(region, forceNewPlan);
@ -983,18 +984,23 @@ public class AssignmentManager {
LOG.warn("Failed to get region plan", e);
}
}
if (plan == null) {
LOG.warn("Unable to determine a plan to assign " + region);
// For meta region, we have to keep retrying until succeeding
if (region.isMetaRegion()) {
try {
Thread.sleep(this.sleepTimeBeforeRetryingMetaAssignment);
if (i == maximumAttempts) i = 1;
continue;
} catch (InterruptedException e) {
LOG.error("Got exception while waiting for hbase:meta assignment");
Thread.currentThread().interrupt();
if (i == maximumAttempts) {
i = 0; // re-set attempt count to 0 for at least 1 retry
LOG.warn("Unable to determine a plan to assign a hbase:meta region " + region +
" after maximumAttempts (" + this.maximumAttempts +
"). Reset attempts count and continue retrying.");
}
waitForRetryingMetaAssignment();
continue;
}
regionStates.updateRegionState(region, State.FAILED_OPEN);
return;
}
@ -1084,9 +1090,19 @@ public class AssignmentManager {
}
if (i == this.maximumAttempts) {
// Don't reset the region state or get a new plan any more.
// This is the last try.
continue;
// For meta region, we have to keep retrying until succeeding
if (region.isMetaRegion()) {
i = 0; // re-set attempt count to 0 for at least 1 retry
LOG.warn(assignMsg +
", trying to assign a hbase:meta region reached to maximumAttempts (" +
this.maximumAttempts + "). Reset attempt counts and continue retrying.");
waitForRetryingMetaAssignment();
}
else {
// Don't reset the region state or get a new plan any more.
// This is the last try.
continue;
}
}
// If region opened on destination of present plan, reassigning to new
@ -1226,6 +1242,18 @@ public class AssignmentManager {
return existingPlan;
}
/**
 * Pauses the calling thread before another hbase:meta region assignment attempt.
 * <p>
 * Sleeps for {@code sleepTimeBeforeRetryingMetaAssignment}, which is handed directly to
 * {@link Thread#sleep(long)} and is therefore interpreted as milliseconds. If the thread is
 * interrupted while sleeping, the interruption is logged together with its cause, the
 * thread's interrupt status is restored, and the method returns early without sleeping
 * for the remaining time.
 */
private void waitForRetryingMetaAssignment() {
  try {
    Thread.sleep(this.sleepTimeBeforeRetryingMetaAssignment);
  } catch (InterruptedException e) {
    // Hand the exception to the logger so the cause of the interruption is not lost.
    LOG.error("Got exception while waiting for hbase:meta assignment", e);
    // Restore the interrupt status so code further up the stack can still observe it.
    Thread.currentThread().interrupt();
  }
}
/**
* Unassigns the specified region.
* <p>
@ -2077,12 +2105,20 @@ public class AssignmentManager {
// name, and failedOpenTracker is updated only in this block
failedOpenTracker.put(encodedName, failedOpenCount);
}
if (failedOpenCount.incrementAndGet() >= maximumAttempts) {
if (failedOpenCount.incrementAndGet() >= maximumAttempts && !hri.isMetaRegion()) {
regionStates.updateRegionState(hri, State.FAILED_OPEN);
// remove the tracking info to save memory, also reset
// the count for next open initiative
failedOpenTracker.remove(encodedName);
} else {
if (hri.isMetaRegion() && failedOpenCount.get() >= maximumAttempts) {
// Log a warning message if a meta region failedOpenCount exceeds maximumAttempts
// so that we are aware of potential problem if it persists for a long time.
LOG.warn("Failed to open the hbase:meta region " +
hri.getRegionNameAsString() + " after" +
failedOpenCount.get() + " retries. Continue retrying.");
}
// Handle this the same as if it were opened and then closed.
RegionState regionState = regionStates.updateRegionState(hri, State.CLOSED);
if (regionState != null) {