HBASE-12464 meta table region assignment stuck in the FAILED_OPEN state due to region server not fully ready to serve (Stephen Jiang)

This commit is contained in:
tedyu 2014-11-20 12:01:40 -08:00
parent 84ed9f6ba4
commit deacb117f6
1 changed file with 48 additions and 12 deletions

View File

@ -168,7 +168,7 @@ public class AssignmentManager extends ZooKeeperListener {
/** /**
* The sleep time for which the assignment will wait before retrying in case of hbase:meta assignment * The sleep time for which the assignment will wait before retrying in case of hbase:meta assignment
* failure due to lack of availability of region plan * failure due to lack of availability of region plan or bad region plan
*/ */
private final long sleepTimeBeforeRetryingMetaAssignment; private final long sleepTimeBeforeRetryingMetaAssignment;
@ -1996,6 +1996,7 @@ public class AssignmentManager extends ZooKeeperListener {
+ ", the server is stopped/aborted"); + ", the server is stopped/aborted");
return; return;
} }
if (plan == null) { // Get a server for the region at first if (plan == null) { // Get a server for the region at first
try { try {
plan = getRegionPlan(region, forceNewPlan); plan = getRegionPlan(region, forceNewPlan);
@ -2003,18 +2004,23 @@ public class AssignmentManager extends ZooKeeperListener {
LOG.warn("Failed to get region plan", e); LOG.warn("Failed to get region plan", e);
} }
} }
if (plan == null) { if (plan == null) {
LOG.warn("Unable to determine a plan to assign " + region); LOG.warn("Unable to determine a plan to assign " + region);
// For meta region, we have to keep retrying until succeeding
if (region.isMetaRegion()) { if (region.isMetaRegion()) {
try { if (i == maximumAttempts) {
Thread.sleep(this.sleepTimeBeforeRetryingMetaAssignment); i = 0; // re-set attempt count to 0 for at least 1 retry
if (i == maximumAttempts) i = 1;
continue; LOG.warn("Unable to determine a plan to assign a hbase:meta region " + region +
} catch (InterruptedException e) { " after maximumAttempts (" + this.maximumAttempts +
LOG.error("Got exception while waiting for hbase:meta assignment"); "). Reset attempts count and continue retrying.");
Thread.currentThread().interrupt();
} }
waitForRetryingMetaAssignment();
continue;
} }
regionStates.updateRegionState(region, State.FAILED_OPEN); regionStates.updateRegionState(region, State.FAILED_OPEN);
return; return;
} }
@ -2148,9 +2154,19 @@ public class AssignmentManager extends ZooKeeperListener {
} }
if (i == this.maximumAttempts) { if (i == this.maximumAttempts) {
// Don't reset the region state or get a new plan any more. // For meta region, we have to keep retrying until succeeding
// This is the last try. if (region.isMetaRegion()) {
continue; i = 0; // re-set attempt count to 0 for at least 1 retry
LOG.warn(assignMsg +
", trying to assign a hbase:meta region reached to maximumAttempts (" +
this.maximumAttempts + "). Reset attempt counts and continue retrying.");
waitForRetryingMetaAssignment();
}
else {
// Don't reset the region state or get a new plan any more.
// This is the last try.
continue;
}
} }
// If region opened on destination of present plan, reassigning to new // If region opened on destination of present plan, reassigning to new
@ -2341,6 +2357,18 @@ public class AssignmentManager extends ZooKeeperListener {
return existingPlan; return existingPlan;
} }
/**
 * Pauses the calling thread before another attempt at assigning a hbase:meta region.
 * <p>
 * Sleeps for {@code sleepTimeBeforeRetryingMetaAssignment} milliseconds. If the thread is
 * interrupted while sleeping, the interruption is logged together with the exception, the
 * thread's interrupt status is restored, and the method returns early without sleeping for
 * the remaining time.
 */
private void waitForRetryingMetaAssignment() {
  try {
    Thread.sleep(this.sleepTimeBeforeRetryingMetaAssignment);
  } catch (InterruptedException e) {
    // Hand the exception to the logger so the interruption is reported with its cause
    // instead of as a bare message.
    LOG.error("Got exception while waiting for hbase:meta assignment", e);
    // Thread.sleep clears the interrupt status when it throws; set it again so that
    // code further up the stack can still observe the interruption.
    Thread.currentThread().interrupt();
  }
}
/** /**
* Unassigns the specified region. * Unassigns the specified region.
* <p> * <p>
@ -3397,12 +3425,20 @@ public class AssignmentManager extends ZooKeeperListener {
// name, and failedOpenTracker is updated only in this block // name, and failedOpenTracker is updated only in this block
failedOpenTracker.put(encodedName, failedOpenCount); failedOpenTracker.put(encodedName, failedOpenCount);
} }
if (failedOpenCount.incrementAndGet() >= maximumAttempts) { if (failedOpenCount.incrementAndGet() >= maximumAttempts && !hri.isMetaRegion()) {
regionStates.updateRegionState(hri, State.FAILED_OPEN); regionStates.updateRegionState(hri, State.FAILED_OPEN);
// remove the tracking info to save memory, also reset // remove the tracking info to save memory, also reset
// the count for next open initiative // the count for next open initiative
failedOpenTracker.remove(encodedName); failedOpenTracker.remove(encodedName);
} else { } else {
if (hri.isMetaRegion() && failedOpenCount.get() >= maximumAttempts) {
// Log a warning message if a meta region failedOpenCount exceeds maximumAttempts
// so that we are aware of potential problem if it persists for a long time.
LOG.warn("Failed to open the hbase:meta region " +
hri.getRegionNameAsString() + " after" +
failedOpenCount.get() + " retries. Continue retrying.");
}
// Handle this the same as if it were opened and then closed. // Handle this the same as if it were opened and then closed.
RegionState regionState = regionStates.updateRegionState(hri, State.CLOSED); RegionState regionState = regionStates.updateRegionState(hri, State.CLOSED);
if (regionState != null) { if (regionState != null) {