From 8ecba131afa06f4aed7a0d57f2e0d698d3b294e9 Mon Sep 17 00:00:00 2001 From: Enis Soztutar Date: Wed, 14 Jan 2015 15:45:46 -0800 Subject: [PATCH] HBASE-12844 ServerManager.isServerReachable() should sleep between retries Conflicts: hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java --- .../hadoop/hbase/master/ServerManager.java | 25 +++++++++++++++---- .../TestAssignmentManagerOnCluster.java | 6 +++-- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java index 8be32fef315..04c42021d08 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java @@ -63,6 +63,9 @@ import org.apache.hadoop.hbase.regionserver.HRegionServer; import org.apache.hadoop.hbase.regionserver.RegionOpeningState; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.Triple; +import org.apache.hadoop.hbase.util.Pair; +import org.apache.hadoop.hbase.util.RetryCounter; +import org.apache.hadoop.hbase.util.RetryCounterFactory; import org.apache.hadoop.hbase.zookeeper.ZKUtil; import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher; import org.apache.zookeeper.KeeperException; @@ -141,6 +144,8 @@ public class ServerManager { private final long maxSkew; private final long warningSkew; + private final RetryCounterFactory pingRetryCounterFactory; + /** * Set of region servers which are dead but not processed immediately. 
If one * server died before master enables ServerShutdownHandler, the server will be @@ -199,6 +204,11 @@ public class ServerManager { maxSkew = c.getLong("hbase.master.maxclockskew", 30000); warningSkew = c.getLong("hbase.master.warningclockskew", 10000); this.connection = connect ? (ClusterConnection)ConnectionFactory.createConnection(c) : null; + int pingMaxAttempts = Math.max(1, master.getConfiguration().getInt( + "hbase.master.maximum.ping.server.attempts", 10)); + int pingSleepInterval = Math.max(1, master.getConfiguration().getInt( + "hbase.master.ping.server.retry.sleep.interval", 100)); + this.pingRetryCounterFactory = new RetryCounterFactory(pingMaxAttempts, pingSleepInterval); } /** @@ -801,9 +811,9 @@ public class ServerManager { */ public boolean isServerReachable(ServerName server) { if (server == null) throw new NullPointerException("Passed server is null"); - int maximumAttempts = Math.max(1, master.getConfiguration().getInt( - "hbase.master.maximum.ping.server.attempts", 10)); - for (int i = 0; i < maximumAttempts; i++) { + + RetryCounter retryCounter = pingRetryCounterFactory.create(); + while (retryCounter.shouldRetry()) { try { AdminService.BlockingInterface admin = getRsAdmin(server); if (admin != null) { @@ -812,8 +822,13 @@ public class ServerManager { && server.getStartcode() == info.getServerName().getStartCode(); } } catch (IOException ioe) { - LOG.debug("Couldn't reach " + server + ", try=" + i - + " of " + maximumAttempts, ioe); + LOG.debug("Couldn't reach " + server + ", try=" + retryCounter.getAttemptTimes() + + " of " + retryCounter.getMaxAttempts(), ioe); + try { + retryCounter.sleepUntilNextRetry(); + } catch(InterruptedException ie) { + Thread.currentThread().interrupt(); + } } } return false; diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java index bf44147138d..a4ae2fe36c2 100644 
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java @@ -105,6 +105,8 @@ public class TestAssignmentManagerOnCluster { conf.setInt("hbase.assignment.maximum.attempts", 3); // Put meta on master to avoid meta server shutdown handling conf.set("hbase.balancer.tablesOnMaster", "hbase:meta"); + conf.setInt("hbase.master.maximum.ping.server.attempts", 3); + conf.setInt("hbase.master.ping.server.retry.sleep.interval", 1); TEST_UTIL.startMiniCluster(1, 4, null, MyMaster.class, MyRegionServer.class); admin = TEST_UTIL.getHBaseAdmin(); @@ -1219,7 +1221,7 @@ public class TestAssignmentManagerOnCluster { TEST_UTIL.deleteTable(Bytes.toBytes(table)); } } - + /** * Test concurrent updates to meta when meta is not on master * @throws Exception @@ -1279,7 +1281,7 @@ public class TestAssignmentManagerOnCluster { assertTrue(count == 100); rss.stop(); } - + static class MyLoadBalancer extends StochasticLoadBalancer { // For this region, if specified, always assign to nowhere static volatile String controledRegion = null;