HBASE-6881 All regionservers are marked offline even there is still one up

git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1392467 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
jxiang 2012-10-01 17:45:04 +00:00
parent e9947aad05
commit c7b246edd2
1 changed files with 56 additions and 31 deletions

View File

@ -1399,13 +1399,15 @@ public class AssignmentManager extends ZooKeeperListener {
final boolean setOfflineInZK, final boolean forceNewPlan, final boolean setOfflineInZK, final boolean forceNewPlan,
boolean hijack) { boolean hijack) {
boolean regionAlreadyInTransitionException = false; boolean regionAlreadyInTransitionException = false;
boolean serverNotRunningYet = false;
RegionState currentState = state;
long maxRegionServerStartupWaitTime = -1;
for (int i = 0; i < this.maximumAssignmentAttempts; i++) { for (int i = 0; i < this.maximumAssignmentAttempts; i++) {
int versionOfOfflineNode = -1; int versionOfOfflineNode = -1;
if (setOfflineInZK) { if (setOfflineInZK) {
// get the version of the znode after setting it to OFFLINE. // get the version of the znode after setting it to OFFLINE.
// versionOfOfflineNode will be -1 if the znode was not set to OFFLINE // versionOfOfflineNode will be -1 if the znode was not set to OFFLINE
versionOfOfflineNode = setOfflineInZooKeeper(state, hijack, versionOfOfflineNode = setOfflineInZooKeeper(currentState, hijack);
regionAlreadyInTransitionException);
if (versionOfOfflineNode != -1) { if (versionOfOfflineNode != -1) {
if (isDisabledorDisablingRegionInRIT(region)) { if (isDisabledorDisablingRegionInRIT(region)) {
return; return;
@ -1430,7 +1432,8 @@ public class AssignmentManager extends ZooKeeperListener {
LOG.debug("Server stopped; skipping assign of " + state); LOG.debug("Server stopped; skipping assign of " + state);
return; return;
} }
RegionPlan plan = getRegionPlan(state, !regionAlreadyInTransitionException && forceNewPlan); RegionPlan plan = getRegionPlan(state,
!regionAlreadyInTransitionException && !serverNotRunningYet && forceNewPlan);
if (plan == null) { if (plan == null) {
LOG.debug("Unable to determine a plan to assign " + state); LOG.debug("Unable to determine a plan to assign " + state);
this.timeoutMonitor.setAllRegionServersOffline(true); this.timeoutMonitor.setAllRegionServersOffline(true);
@ -1440,7 +1443,7 @@ public class AssignmentManager extends ZooKeeperListener {
LOG.info("Assigning region " + state.getRegion().getRegionNameAsString() + LOG.info("Assigning region " + state.getRegion().getRegionNameAsString() +
" to " + plan.getDestination().toString()); " to " + plan.getDestination().toString());
// Transition RegionState to PENDING_OPEN // Transition RegionState to PENDING_OPEN
regionStates.updateRegionState(state.getRegion(), currentState = regionStates.updateRegionState(state.getRegion(),
RegionState.State.PENDING_OPEN, System.currentTimeMillis(), RegionState.State.PENDING_OPEN, System.currentTimeMillis(),
plan.getDestination()); plan.getDestination());
// Send OPEN RPC. This can fail if the server on other end is is not up. // Send OPEN RPC. This can fail if the server on other end is is not up.
@ -1457,34 +1460,64 @@ public class AssignmentManager extends ZooKeeperListener {
} catch (Throwable t) { } catch (Throwable t) {
if (t instanceof RemoteException) { if (t instanceof RemoteException) {
t = ((RemoteException) t).unwrapRemoteException(); t = ((RemoteException) t).unwrapRemoteException();
if (t instanceof RegionAlreadyInTransitionException) { }
regionAlreadyInTransitionException = true; regionAlreadyInTransitionException = false;
if (LOG.isDebugEnabled()) { serverNotRunningYet = false;
LOG.debug("Failed assignment in: " + plan.getDestination() + " due to " if (t instanceof RegionAlreadyInTransitionException) {
+ t.getMessage()); regionAlreadyInTransitionException = true;
if (LOG.isDebugEnabled()) {
LOG.debug("Failed assignment in: " + plan.getDestination() + " due to "
+ t.getMessage());
}
} else if (t instanceof ServerNotRunningYetException) {
if (maxRegionServerStartupWaitTime < 0) {
maxRegionServerStartupWaitTime = System.currentTimeMillis() +
this.server.getConfiguration().
getLong("hbase.regionserver.rpc.startup.waittime", 60000);
}
try {
long now = System.currentTimeMillis();
if (now < maxRegionServerStartupWaitTime) {
LOG.debug("Server is not yet up; waiting up to " +
(maxRegionServerStartupWaitTime - now) + "ms", t);
serverNotRunningYet = true;
Thread.sleep(100);
i--; // reset the try count
} else {
LOG.debug("Server is not up for a while; try a new one", t);
} }
} catch (InterruptedException ie) {
LOG.warn("Failed to assign "
+ state.getRegion().getRegionNameAsString() + " since interrupted", ie);
Thread.currentThread().interrupt();
return;
} }
} }
LOG.warn("Failed assignment of " LOG.warn("Failed assignment of "
+ state.getRegion().getRegionNameAsString() + state.getRegion().getRegionNameAsString()
+ " to " + " to "
+ plan.getDestination() + plan.getDestination()
+ ", trying to assign " + ", trying to assign "
+ (regionAlreadyInTransitionException ? "to the same region server" + (regionAlreadyInTransitionException || serverNotRunningYet
+ " because of RegionAlreadyInTransitionException;" : "elsewhere instead; ") ? "to the same region server because of "
+ "retry=" + i, t); + "RegionAlreadyInTransitionException/ServerNotRunningYetException;"
: "elsewhere instead; ")
+ "retry=" + i, t);
// Clean out plan we failed execute and one that doesn't look like it'll // Clean out plan we failed execute and one that doesn't look like it'll
// succeed anyways; we need a new plan! // succeed anyways; we need a new plan!
// Transition back to OFFLINE // Transition back to OFFLINE
regionStates.updateRegionState( currentState = regionStates.updateRegionState(
state.getRegion(), RegionState.State.OFFLINE); state.getRegion(), RegionState.State.OFFLINE);
// If region opened on destination of present plan, reassigning to new // If region opened on destination of present plan, reassigning to new
// RS may cause double assignments. In case of RegionAlreadyInTransitionException // RS may cause double assignments. In case of RegionAlreadyInTransitionException
// reassigning to same RS. // reassigning to same RS.
RegionPlan newPlan = plan; RegionPlan newPlan = plan;
if (!regionAlreadyInTransitionException) { if (!regionAlreadyInTransitionException && !serverNotRunningYet) {
// Force a new plan and reassign. Will return null if no servers. // Force a new plan and reassign. Will return null if no servers.
newPlan = getRegionPlan(state, plan.getDestination(), true); // The new plan could be the same as the existing plan since we don't
// exclude the server of the original plan, which should not be
// excluded since it could be the only server up now.
newPlan = getRegionPlan(state, true);
} }
if (newPlan == null) { if (newPlan == null) {
this.timeoutMonitor.setAllRegionServersOffline(true); this.timeoutMonitor.setAllRegionServersOffline(true);
@ -1537,24 +1570,16 @@ public class AssignmentManager extends ZooKeeperListener {
* @param state * @param state
* @param hijack * @param hijack
* - true if needs to be hijacked and reassigned, false otherwise. * - true if needs to be hijacked and reassigned, false otherwise.
* @param regionAlreadyInTransitionException
* - true if we need to retry assignment because of RegionAlreadyInTransitionException.
* @return the version of the offline node if setting of the OFFLINE node was * @return the version of the offline node if setting of the OFFLINE node was
* successful, -1 otherwise. * successful, -1 otherwise.
*/ */
int setOfflineInZooKeeper(final RegionState state, boolean hijack, int setOfflineInZooKeeper(final RegionState state, boolean hijack) {
boolean regionAlreadyInTransitionException) {
// In case of reassignment the current state in memory need not be // In case of reassignment the current state in memory need not be
// OFFLINE. // OFFLINE.
if (!hijack && !state.isClosed() && !state.isOffline()) { if (!hijack && !state.isClosed() && !state.isOffline()) {
if (!regionAlreadyInTransitionException ) { String msg = "Unexpected state : " + state + " .. Cannot transit it to OFFLINE.";
String msg = "Unexpected state : " + state + " .. Cannot transit it to OFFLINE."; this.server.abort(msg, new IllegalStateException(msg));
this.server.abort(msg, new IllegalStateException(msg)); return -1;
return -1;
} else {
LOG.debug("Unexpected state : " + state
+ " but retrying to assign because RegionAlreadyInTransitionException.");
}
} }
boolean allowZNodeCreation = false; boolean allowZNodeCreation = false;
// Under reassignment if the current state is PENDING_OPEN // Under reassignment if the current state is PENDING_OPEN