HBASE-12844 ServerManager.isServerReacable() should sleep between retries
This commit is contained in:
parent
7e872a81a9
commit
51575197de
|
@ -63,6 +63,8 @@ import org.apache.hadoop.hbase.regionserver.HRegionServer;
|
||||||
import org.apache.hadoop.hbase.regionserver.RegionOpeningState;
|
import org.apache.hadoop.hbase.regionserver.RegionOpeningState;
|
||||||
import org.apache.hadoop.hbase.util.Bytes;
|
import org.apache.hadoop.hbase.util.Bytes;
|
||||||
import org.apache.hadoop.hbase.util.Pair;
|
import org.apache.hadoop.hbase.util.Pair;
|
||||||
|
import org.apache.hadoop.hbase.util.RetryCounter;
|
||||||
|
import org.apache.hadoop.hbase.util.RetryCounterFactory;
|
||||||
import org.apache.hadoop.hbase.zookeeper.ZKUtil;
|
import org.apache.hadoop.hbase.zookeeper.ZKUtil;
|
||||||
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
|
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
|
||||||
import org.apache.zookeeper.KeeperException;
|
import org.apache.zookeeper.KeeperException;
|
||||||
|
@ -141,6 +143,8 @@ public class ServerManager {
|
||||||
private final long maxSkew;
|
private final long maxSkew;
|
||||||
private final long warningSkew;
|
private final long warningSkew;
|
||||||
|
|
||||||
|
private final RetryCounterFactory pingRetryCounterFactory;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Set of region servers which are dead but not processed immediately. If one
|
* Set of region servers which are dead but not processed immediately. If one
|
||||||
* server died before master enables ServerShutdownHandler, the server will be
|
* server died before master enables ServerShutdownHandler, the server will be
|
||||||
|
@ -199,6 +203,11 @@ public class ServerManager {
|
||||||
maxSkew = c.getLong("hbase.master.maxclockskew", 30000);
|
maxSkew = c.getLong("hbase.master.maxclockskew", 30000);
|
||||||
warningSkew = c.getLong("hbase.master.warningclockskew", 10000);
|
warningSkew = c.getLong("hbase.master.warningclockskew", 10000);
|
||||||
this.connection = connect ? (ClusterConnection)ConnectionFactory.createConnection(c) : null;
|
this.connection = connect ? (ClusterConnection)ConnectionFactory.createConnection(c) : null;
|
||||||
|
int pingMaxAttempts = Math.max(1, master.getConfiguration().getInt(
|
||||||
|
"hbase.master.maximum.ping.server.attempts", 10));
|
||||||
|
int pingSleepInterval = Math.max(1, master.getConfiguration().getInt(
|
||||||
|
"hbase.master.ping.server.retry.sleep.interval", 100));
|
||||||
|
this.pingRetryCounterFactory = new RetryCounterFactory(pingMaxAttempts, pingSleepInterval);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -793,9 +802,9 @@ public class ServerManager {
|
||||||
*/
|
*/
|
||||||
public boolean isServerReachable(ServerName server) {
|
public boolean isServerReachable(ServerName server) {
|
||||||
if (server == null) throw new NullPointerException("Passed server is null");
|
if (server == null) throw new NullPointerException("Passed server is null");
|
||||||
int maximumAttempts = Math.max(1, master.getConfiguration().getInt(
|
|
||||||
"hbase.master.maximum.ping.server.attempts", 10));
|
RetryCounter retryCounter = pingRetryCounterFactory.create();
|
||||||
for (int i = 0; i < maximumAttempts; i++) {
|
while (retryCounter.shouldRetry()) {
|
||||||
try {
|
try {
|
||||||
AdminService.BlockingInterface admin = getRsAdmin(server);
|
AdminService.BlockingInterface admin = getRsAdmin(server);
|
||||||
if (admin != null) {
|
if (admin != null) {
|
||||||
|
@ -804,8 +813,13 @@ public class ServerManager {
|
||||||
&& server.getStartcode() == info.getServerName().getStartCode();
|
&& server.getStartcode() == info.getServerName().getStartCode();
|
||||||
}
|
}
|
||||||
} catch (IOException ioe) {
|
} catch (IOException ioe) {
|
||||||
LOG.debug("Couldn't reach " + server + ", try=" + i
|
LOG.debug("Couldn't reach " + server + ", try=" + retryCounter.getAttemptTimes()
|
||||||
+ " of " + maximumAttempts, ioe);
|
+ " of " + retryCounter.getMaxAttempts(), ioe);
|
||||||
|
try {
|
||||||
|
retryCounter.sleepUntilNextRetry();
|
||||||
|
} catch(InterruptedException ie) {
|
||||||
|
Thread.currentThread().interrupt();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
|
|
|
@ -52,7 +52,6 @@ import org.apache.hadoop.hbase.UnknownRegionException;
|
||||||
import org.apache.hadoop.hbase.Waiter;
|
import org.apache.hadoop.hbase.Waiter;
|
||||||
import org.apache.hadoop.hbase.client.Admin;
|
import org.apache.hadoop.hbase.client.Admin;
|
||||||
import org.apache.hadoop.hbase.client.HBaseAdmin;
|
import org.apache.hadoop.hbase.client.HBaseAdmin;
|
||||||
import org.apache.hadoop.hbase.client.HTable;
|
|
||||||
import org.apache.hadoop.hbase.client.Result;
|
import org.apache.hadoop.hbase.client.Result;
|
||||||
import org.apache.hadoop.hbase.client.Table;
|
import org.apache.hadoop.hbase.client.Table;
|
||||||
import org.apache.hadoop.hbase.coordination.ZkCoordinatedStateManager;
|
import org.apache.hadoop.hbase.coordination.ZkCoordinatedStateManager;
|
||||||
|
@ -98,6 +97,8 @@ public class TestAssignmentManagerOnCluster {
|
||||||
MyRegionObserver.class, RegionObserver.class);
|
MyRegionObserver.class, RegionObserver.class);
|
||||||
// Reduce the maximum attempts to speed up the test
|
// Reduce the maximum attempts to speed up the test
|
||||||
conf.setInt("hbase.assignment.maximum.attempts", 3);
|
conf.setInt("hbase.assignment.maximum.attempts", 3);
|
||||||
|
conf.setInt("hbase.master.maximum.ping.server.attempts", 3);
|
||||||
|
conf.setInt("hbase.master.ping.server.retry.sleep.interval", 1);
|
||||||
|
|
||||||
TEST_UTIL.startMiniCluster(1, 4, null, MyMaster.class, MyRegionServer.class);
|
TEST_UTIL.startMiniCluster(1, 4, null, MyMaster.class, MyRegionServer.class);
|
||||||
admin = TEST_UTIL.getHBaseAdmin();
|
admin = TEST_UTIL.getHBaseAdmin();
|
||||||
|
@ -205,7 +206,7 @@ public class TestAssignmentManagerOnCluster {
|
||||||
TEST_UTIL.deleteTable(Bytes.toBytes(table));
|
TEST_UTIL.deleteTable(Bytes.toBytes(table));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This tests region assignment on a simulated restarted server
|
* This tests region assignment on a simulated restarted server
|
||||||
*/
|
*/
|
||||||
|
@ -1087,7 +1088,7 @@ public class TestAssignmentManagerOnCluster {
|
||||||
cluster.startRegionServer();
|
cluster.startRegionServer();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test that region state transition call is idempotent
|
* Test that region state transition call is idempotent
|
||||||
*/
|
*/
|
||||||
|
@ -1121,7 +1122,7 @@ public class TestAssignmentManagerOnCluster {
|
||||||
TEST_UTIL.deleteTable(Bytes.toBytes(table));
|
TEST_UTIL.deleteTable(Bytes.toBytes(table));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test concurrent updates to meta when meta is not on master
|
* Test concurrent updates to meta when meta is not on master
|
||||||
* @throws Exception
|
* @throws Exception
|
||||||
|
@ -1177,7 +1178,7 @@ public class TestAssignmentManagerOnCluster {
|
||||||
assertTrue(count == 100);
|
assertTrue(count == 100);
|
||||||
rss.stop();
|
rss.stop();
|
||||||
}
|
}
|
||||||
|
|
||||||
static class MyLoadBalancer extends StochasticLoadBalancer {
|
static class MyLoadBalancer extends StochasticLoadBalancer {
|
||||||
// For this region, if specified, always assign to nowhere
|
// For this region, if specified, always assign to nowhere
|
||||||
static volatile String controledRegion = null;
|
static volatile String controledRegion = null;
|
||||||
|
|
Loading…
Reference in New Issue