HBASE-12844 ServerManager.isServerReacable() should sleep between retries

This commit is contained in:
Enis Soztutar 2015-01-14 15:45:46 -08:00
parent 7e872a81a9
commit 51575197de
2 changed files with 25 additions and 10 deletions

View File

@ -63,6 +63,8 @@ import org.apache.hadoop.hbase.regionserver.HRegionServer;
import org.apache.hadoop.hbase.regionserver.RegionOpeningState; import org.apache.hadoop.hbase.regionserver.RegionOpeningState;
import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Pair; import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.hbase.util.RetryCounter;
import org.apache.hadoop.hbase.util.RetryCounterFactory;
import org.apache.hadoop.hbase.zookeeper.ZKUtil; import org.apache.hadoop.hbase.zookeeper.ZKUtil;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher; import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
import org.apache.zookeeper.KeeperException; import org.apache.zookeeper.KeeperException;
@ -141,6 +143,8 @@ public class ServerManager {
private final long maxSkew; private final long maxSkew;
private final long warningSkew; private final long warningSkew;
private final RetryCounterFactory pingRetryCounterFactory;
/** /**
* Set of region servers which are dead but not processed immediately. If one * Set of region servers which are dead but not processed immediately. If one
* server died before master enables ServerShutdownHandler, the server will be * server died before master enables ServerShutdownHandler, the server will be
@ -199,6 +203,11 @@ public class ServerManager {
maxSkew = c.getLong("hbase.master.maxclockskew", 30000); maxSkew = c.getLong("hbase.master.maxclockskew", 30000);
warningSkew = c.getLong("hbase.master.warningclockskew", 10000); warningSkew = c.getLong("hbase.master.warningclockskew", 10000);
this.connection = connect ? (ClusterConnection)ConnectionFactory.createConnection(c) : null; this.connection = connect ? (ClusterConnection)ConnectionFactory.createConnection(c) : null;
int pingMaxAttempts = Math.max(1, master.getConfiguration().getInt(
"hbase.master.maximum.ping.server.attempts", 10));
int pingSleepInterval = Math.max(1, master.getConfiguration().getInt(
"hbase.master.ping.server.retry.sleep.interval", 100));
this.pingRetryCounterFactory = new RetryCounterFactory(pingMaxAttempts, pingSleepInterval);
} }
/** /**
@ -793,9 +802,9 @@ public class ServerManager {
*/ */
public boolean isServerReachable(ServerName server) { public boolean isServerReachable(ServerName server) {
if (server == null) throw new NullPointerException("Passed server is null"); if (server == null) throw new NullPointerException("Passed server is null");
int maximumAttempts = Math.max(1, master.getConfiguration().getInt(
"hbase.master.maximum.ping.server.attempts", 10)); RetryCounter retryCounter = pingRetryCounterFactory.create();
for (int i = 0; i < maximumAttempts; i++) { while (retryCounter.shouldRetry()) {
try { try {
AdminService.BlockingInterface admin = getRsAdmin(server); AdminService.BlockingInterface admin = getRsAdmin(server);
if (admin != null) { if (admin != null) {
@ -804,8 +813,13 @@ public class ServerManager {
&& server.getStartcode() == info.getServerName().getStartCode(); && server.getStartcode() == info.getServerName().getStartCode();
} }
} catch (IOException ioe) { } catch (IOException ioe) {
LOG.debug("Couldn't reach " + server + ", try=" + i LOG.debug("Couldn't reach " + server + ", try=" + retryCounter.getAttemptTimes()
+ " of " + maximumAttempts, ioe); + " of " + retryCounter.getMaxAttempts(), ioe);
try {
retryCounter.sleepUntilNextRetry();
} catch(InterruptedException ie) {
Thread.currentThread().interrupt();
}
} }
} }
return false; return false;

View File

@ -52,7 +52,6 @@ import org.apache.hadoop.hbase.UnknownRegionException;
import org.apache.hadoop.hbase.Waiter; import org.apache.hadoop.hbase.Waiter;
import org.apache.hadoop.hbase.client.Admin; import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.HBaseAdmin; import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result; import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Table; import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.coordination.ZkCoordinatedStateManager; import org.apache.hadoop.hbase.coordination.ZkCoordinatedStateManager;
@ -98,6 +97,8 @@ public class TestAssignmentManagerOnCluster {
MyRegionObserver.class, RegionObserver.class); MyRegionObserver.class, RegionObserver.class);
// Reduce the maximum attempts to speed up the test // Reduce the maximum attempts to speed up the test
conf.setInt("hbase.assignment.maximum.attempts", 3); conf.setInt("hbase.assignment.maximum.attempts", 3);
conf.setInt("hbase.master.maximum.ping.server.attempts", 3);
conf.setInt("hbase.master.ping.server.retry.sleep.interval", 1);
TEST_UTIL.startMiniCluster(1, 4, null, MyMaster.class, MyRegionServer.class); TEST_UTIL.startMiniCluster(1, 4, null, MyMaster.class, MyRegionServer.class);
admin = TEST_UTIL.getHBaseAdmin(); admin = TEST_UTIL.getHBaseAdmin();
@ -205,7 +206,7 @@ public class TestAssignmentManagerOnCluster {
TEST_UTIL.deleteTable(Bytes.toBytes(table)); TEST_UTIL.deleteTable(Bytes.toBytes(table));
} }
} }
/** /**
* This tests region assignment on a simulated restarted server * This tests region assignment on a simulated restarted server
*/ */
@ -1087,7 +1088,7 @@ public class TestAssignmentManagerOnCluster {
cluster.startRegionServer(); cluster.startRegionServer();
} }
} }
/** /**
* Test that region state transition call is idempotent * Test that region state transition call is idempotent
*/ */
@ -1121,7 +1122,7 @@ public class TestAssignmentManagerOnCluster {
TEST_UTIL.deleteTable(Bytes.toBytes(table)); TEST_UTIL.deleteTable(Bytes.toBytes(table));
} }
} }
/** /**
* Test concurrent updates to meta when meta is not on master * Test concurrent updates to meta when meta is not on master
* @throws Exception * @throws Exception
@ -1177,7 +1178,7 @@ public class TestAssignmentManagerOnCluster {
assertTrue(count == 100); assertTrue(count == 100);
rss.stop(); rss.stop();
} }
static class MyLoadBalancer extends StochasticLoadBalancer { static class MyLoadBalancer extends StochasticLoadBalancer {
// For this region, if specified, always assign to nowhere // For this region, if specified, always assign to nowhere
static volatile String controledRegion = null; static volatile String controledRegion = null;