From 51575197de78fa8eb43a3eaaa3cd377ac87c7f43 Mon Sep 17 00:00:00 2001 From: Enis Soztutar Date: Wed, 14 Jan 2015 15:45:46 -0800 Subject: [PATCH] HBASE-12844 ServerManager.isServerReacable() should sleep between retries --- .../hadoop/hbase/master/ServerManager.java | 24 +++++++++++++++---- .../TestAssignmentManagerOnCluster.java | 11 +++++---- 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java index 796cc8a2029..c42f3143bd6 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java @@ -63,6 +63,8 @@ import org.apache.hadoop.hbase.regionserver.HRegionServer; import org.apache.hadoop.hbase.regionserver.RegionOpeningState; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.Pair; +import org.apache.hadoop.hbase.util.RetryCounter; +import org.apache.hadoop.hbase.util.RetryCounterFactory; import org.apache.hadoop.hbase.zookeeper.ZKUtil; import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher; import org.apache.zookeeper.KeeperException; @@ -141,6 +143,8 @@ public class ServerManager { private final long maxSkew; private final long warningSkew; + private final RetryCounterFactory pingRetryCounterFactory; + /** * Set of region servers which are dead but not processed immediately. If one * server died before master enables ServerShutdownHandler, the server will be @@ -199,6 +203,11 @@ public class ServerManager { maxSkew = c.getLong("hbase.master.maxclockskew", 30000); warningSkew = c.getLong("hbase.master.warningclockskew", 10000); this.connection = connect ? (ClusterConnection)ConnectionFactory.createConnection(c) : null; + int pingMaxAttempts = Math.max(1, master.getConfiguration().getInt( + "hbase.master.maximum.ping.server.attempts", 10)); + int pingSleepInterval = Math.max(1, master.getConfiguration().getInt( + "hbase.master.ping.server.retry.sleep.interval", 100)); + this.pingRetryCounterFactory = new RetryCounterFactory(pingMaxAttempts, pingSleepInterval); } /** @@ -793,9 +802,9 @@ public class ServerManager { */ public boolean isServerReachable(ServerName server) { if (server == null) throw new NullPointerException("Passed server is null"); - int maximumAttempts = Math.max(1, master.getConfiguration().getInt( - "hbase.master.maximum.ping.server.attempts", 10)); - for (int i = 0; i < maximumAttempts; i++) { + + RetryCounter retryCounter = pingRetryCounterFactory.create(); + while (retryCounter.shouldRetry()) { try { AdminService.BlockingInterface admin = getRsAdmin(server); if (admin != null) { @@ -804,8 +813,13 @@ public class ServerManager { && server.getStartcode() == info.getServerName().getStartCode(); } } catch (IOException ioe) { - LOG.debug("Couldn't reach " + server + ", try=" + i - + " of " + maximumAttempts, ioe); + LOG.debug("Couldn't reach " + server + ", try=" + retryCounter.getAttemptTimes() + + " of " + retryCounter.getMaxAttempts(), ioe); + try { + retryCounter.sleepUntilNextRetry(); + } catch(InterruptedException ie) { + Thread.currentThread().interrupt(); + } } } return false; diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java index d1769e8c240..5a0dffaed67 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java @@ -52,7 +52,6 @@ import org.apache.hadoop.hbase.UnknownRegionException; import org.apache.hadoop.hbase.Waiter; import org.apache.hadoop.hbase.client.Admin; import org.apache.hadoop.hbase.client.HBaseAdmin; -import org.apache.hadoop.hbase.client.HTable; import org.apache.hadoop.hbase.client.Result; import org.apache.hadoop.hbase.client.Table; import org.apache.hadoop.hbase.coordination.ZkCoordinatedStateManager; @@ -98,6 +97,8 @@ public class TestAssignmentManagerOnCluster { MyRegionObserver.class, RegionObserver.class); // Reduce the maximum attempts to speed up the test conf.setInt("hbase.assignment.maximum.attempts", 3); + conf.setInt("hbase.master.maximum.ping.server.attempts", 3); + conf.setInt("hbase.master.ping.server.retry.sleep.interval", 1); TEST_UTIL.startMiniCluster(1, 4, null, MyMaster.class, MyRegionServer.class); admin = TEST_UTIL.getHBaseAdmin(); @@ -205,7 +206,7 @@ public class TestAssignmentManagerOnCluster { TEST_UTIL.deleteTable(Bytes.toBytes(table)); } } - + /** * This tests region assignment on a simulated restarted server */ @@ -1087,7 +1088,7 @@ public class TestAssignmentManagerOnCluster { cluster.startRegionServer(); } } - + /** * Test that region state transition call is idempotent */ @@ -1121,7 +1122,7 @@ public class TestAssignmentManagerOnCluster { TEST_UTIL.deleteTable(Bytes.toBytes(table)); } } - + /** * Test concurrent updates to meta when meta is not on master * @throws Exception @@ -1177,7 +1178,7 @@ public class TestAssignmentManagerOnCluster { assertTrue(count == 100); rss.stop(); } - + static class MyLoadBalancer extends StochasticLoadBalancer { // For this region, if specified, always assign to nowhere static volatile String controledRegion = null;