From e524b5b3305d8acd950527f2c80f3ef34fde15ce Mon Sep 17 00:00:00 2001 From: jeffreyz Date: Sat, 5 Apr 2014 00:58:55 +0000 Subject: [PATCH] HBASE-10895: unassign a region fails due to the hosting region server is in FailedServerList git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1584947 13f79535-47bb-0310-9956-ffa450edef68 --- .../hbase/master/AssignmentManager.java | 46 ++++++++++++------- .../TestAssignmentManagerOnCluster.java | 6 +-- 2 files changed, 33 insertions(+), 19 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java index c25f712a637..dbd3382ffd3 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java @@ -1698,10 +1698,10 @@ public class AssignmentManager extends ZooKeeperListener { if (t instanceof RemoteException) { t = ((RemoteException)t).unwrapRemoteException(); } + boolean logRetries = true; if (t instanceof NotServingRegionException || t instanceof RegionServerStoppedException - || t instanceof ServerNotRunningYetException - || t instanceof FailedServerException) { + || t instanceof ServerNotRunningYetException) { LOG.debug("Offline " + region.getRegionNameAsString() + ", it's not any more on " + server, t); if (transitionInZK) { @@ -1711,34 +1711,48 @@ public class AssignmentManager extends ZooKeeperListener { regionOffline(region); } return; - } else if (state != null - && t instanceof RegionAlreadyInTransitionException) { - // RS is already processing this region, only need to update the timestamp - LOG.debug("update " + state + " the timestamp."); - state.updateTimestampToNow(); - if (maxWaitTime < 0) { - maxWaitTime = EnvironmentEdgeManager.currentTimeMillis() - + this.server.getConfiguration().getLong(ALREADY_IN_TRANSITION_WAITTIME, - DEFAULT_ALREADY_IN_TRANSITION_WAITTIME); - } - try { + } else if ((t instanceof FailedServerException) || (state != null && + t instanceof RegionAlreadyInTransitionException)) { + long sleepTime = 0; + Configuration conf = this.server.getConfiguration(); + if(t instanceof FailedServerException) { + sleepTime = 1 + conf.getInt(RpcClient.FAILED_SERVER_EXPIRY_KEY, + RpcClient.FAILED_SERVER_EXPIRY_DEFAULT); + } else { + // RS is already processing this region, only need to update the timestamp + LOG.debug("update " + state + " the timestamp."); + state.updateTimestampToNow(); + if (maxWaitTime < 0) { + maxWaitTime = + EnvironmentEdgeManager.currentTimeMillis() + + conf.getLong(ALREADY_IN_TRANSITION_WAITTIME, + DEFAULT_ALREADY_IN_TRANSITION_WAITTIME); + } long now = EnvironmentEdgeManager.currentTimeMillis(); if (now < maxWaitTime) { LOG.debug("Region is already in transition; " + "waiting up to " + (maxWaitTime - now) + "ms", t); - Thread.sleep(100); + sleepTime = 100; i--; // reset the try count + logRetries = false; + } + } + try { + if (sleepTime > 0) { + Thread.sleep(sleepTime); } } catch (InterruptedException ie) { LOG.warn("Failed to unassign " + region.getRegionNameAsString() + " since interrupted", ie); Thread.currentThread().interrupt(); - if (!tomActivated) { + if (!tomActivated && state != null) { regionStates.updateRegionState(region, State.FAILED_CLOSE); } return; } - } else { + } + + if (logRetries) { LOG.info("Server " + server + " returned " + t + " for " + region.getRegionNameAsString() + ", try=" + i + " of " + this.maximumAttempts, t); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java index baa458d85be..8ffc9faa2ae 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java @@ -141,7 +141,7 @@ public class TestAssignmentManagerOnCluster { @Test (timeout=120000) public void testAssignRegionOnRestartedServer() throws Exception { String table = "testAssignRegionOnRestartedServer"; - TEST_UTIL.getMiniHBaseCluster().getConf().setInt("hbase.assignment.maximum.attempts", 40); + TEST_UTIL.getMiniHBaseCluster().getConf().setInt("hbase.assignment.maximum.attempts", 20); TEST_UTIL.getMiniHBaseCluster().stopMaster(0); TEST_UTIL.getMiniHBaseCluster().startMaster(); //restart the master so that conf take into affect @@ -754,11 +754,11 @@ public class TestAssignmentManagerOnCluster { // You can't assign a dead region before SSH am.assign(hri, true, true); RegionState state = regionStates.getRegionState(hri); - assertTrue(state.isOffline()); + assertTrue(state.isFailedClose()); // You can't unassign a dead region before SSH either am.unassign(hri, true); - assertTrue(state.isOffline()); + assertTrue(state.isFailedClose()); // Enable SSH so that log can be split master.enableSSH(true);