HBASE-12403 IntegrationTestMTTR flaky due to aggressive RS restart timeout
This commit is contained in:
parent
0505072c51
commit
687710eb28
|
@ -43,19 +43,43 @@ import org.apache.hadoop.hbase.util.Bytes;
|
|||
*/
|
||||
public class Action {
|
||||
|
||||
// Configuration keys read in init(ActionContext) to override the timeouts passed to the
// cluster wait calls after killing/starting a master or region server.
public static final String KILL_MASTER_TIMEOUT_KEY =
|
||||
"hbase.chaosmonkey.action.killmastertimeout";
|
||||
public static final String START_MASTER_TIMEOUT_KEY =
|
||||
"hbase.chaosmonkey.action.startmastertimeout";
|
||||
public static final String KILL_RS_TIMEOUT_KEY = "hbase.chaosmonkey.action.killrstimeout";
|
||||
public static final String START_RS_TIMEOUT_KEY = "hbase.chaosmonkey.action.startrstimeout";
|
||||
|
||||
// Logger for Action; protected and non-final, so it is visible to (and reassignable by) subclasses.
protected static Log LOG = LogFactory.getLog(Action.class);
|
||||
|
||||
// Fallback values handed to getLong(...) in init(ActionContext) alongside the matching
// *_TIMEOUT_KEY. All four currently mirror PolicyBasedChaosMonkey.TIMEOUT.
protected static final long KILL_MASTER_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
|
||||
protected static final long START_MASTER_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
|
||||
protected static final long KILL_RS_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
|
||||
protected static final long START_RS_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
|
||||
|
||||
// State populated by init(ActionContext): the context itself, the cluster obtained from it,
// and the cluster status / server list captured at init time.
protected ActionContext context;
|
||||
protected HBaseCluster cluster;
|
||||
protected ClusterStatus initialStatus;
|
||||
protected ServerName[] initialServers;
|
||||
|
||||
// Timeouts read from the cluster configuration in init(ActionContext) and passed to the
// cluster wait calls in killMaster/startMaster/killRs/startRs.
// NOTE(review): units are presumably milliseconds — confirm against the cluster wait methods.
protected long killMasterTimeout;
|
||||
protected long startMasterTimeout;
|
||||
protected long killRsTimeout;
|
||||
protected long startRsTimeout;
|
||||
|
||||
/**
 * Initializes this action from the given context.
 *
 * Stores the context, obtains the cluster from it, records the cluster's initial status and
 * its server list as an array, and reads the four kill/start timeouts from the cluster
 * configuration, passing the matching *_TIMEOUT_DEFAULT constant as the fallback value.
 *
 * @param context context that supplies the HBaseCluster this action operates on
 * @throws IOException declared by the signature; presumably raised by the cluster status
 *         lookup — verify against HBaseCluster
 */
public void init(ActionContext context) throws IOException {
|
||||
this.context = context;
|
||||
cluster = context.getHBaseCluster();
|
||||
initialStatus = cluster.getInitialClusterStatus();
|
||||
Collection<ServerName> regionServers = initialStatus.getServers();
|
||||
initialServers = regionServers.toArray(new ServerName[regionServers.size()]);
|
||||
|
||||
// Per-operation timeouts are configurable so callers can relax them individually.
killMasterTimeout = cluster.getConf().getLong(KILL_MASTER_TIMEOUT_KEY,
|
||||
KILL_MASTER_TIMEOUT_DEFAULT);
|
||||
startMasterTimeout = cluster.getConf().getLong(START_MASTER_TIMEOUT_KEY,
|
||||
START_MASTER_TIMEOUT_DEFAULT);
|
||||
killRsTimeout = cluster.getConf().getLong(KILL_RS_TIMEOUT_KEY, KILL_RS_TIMEOUT_DEFAULT);
|
||||
startRsTimeout = cluster.getConf().getLong(START_RS_TIMEOUT_KEY, START_RS_TIMEOUT_DEFAULT);
|
||||
}
|
||||
|
||||
/**
 * Performs this action. The body here is empty.
 * NOTE(review): presumably concrete actions override this — confirm against subclasses.
 *
 * @throws Exception declared by the signature; this empty body throws nothing itself
 */
public void perform() throws Exception { }
|
||||
|
@ -84,21 +108,21 @@ public class Action {
|
|||
/**
 * Kills the given master via the cluster handle and waits for it to stop, logging before
 * and after.
 *
 * @param server the master to kill
 * @throws IOException declared by the signature; presumably propagated from the cluster
 *         calls — verify
 */
protected void killMaster(ServerName server) throws IOException {
|
||||
LOG.info("Killing master:" + server);
|
||||
cluster.killMaster(server);
|
||||
// NOTE(review): this listing appears to be a rendered diff. The wait using
// PolicyBasedChaosMonkey.TIMEOUT looks like the pre-change line and the wait using
// killMasterTimeout its replacement — confirm against the actual file.
cluster.waitForMasterToStop(server, PolicyBasedChaosMonkey.TIMEOUT);
|
||||
cluster.waitForMasterToStop(server, killMasterTimeout);
|
||||
LOG.info("Killed master server:" + server);
|
||||
}
|
||||
|
||||
/**
 * Starts a master on the hostname of the given server, then calls
 * cluster.waitForActiveAndReadyMaster, logging before and after.
 *
 * @param server server whose hostname is used to start the master
 * @throws IOException declared by the signature; presumably propagated from the cluster
 *         calls — verify
 */
protected void startMaster(ServerName server) throws IOException {
|
||||
LOG.info("Starting master:" + server.getHostname());
|
||||
cluster.startMaster(server.getHostname());
|
||||
// NOTE(review): this listing appears to be a rendered diff. The wait using
// PolicyBasedChaosMonkey.TIMEOUT looks like the pre-change line and the wait using
// startMasterTimeout its replacement — confirm against the actual file.
cluster.waitForActiveAndReadyMaster(PolicyBasedChaosMonkey.TIMEOUT);
|
||||
cluster.waitForActiveAndReadyMaster(startMasterTimeout);
|
||||
LOG.info("Started master: " + server);
|
||||
}
|
||||
|
||||
/**
 * Kills the given region server via the cluster handle and waits for it to stop. Afterwards
 * logs the server count reported by cluster.getClusterStatus().
 *
 * @param server the region server to kill
 * @throws IOException declared by the signature; presumably propagated from the cluster
 *         calls — verify
 */
protected void killRs(ServerName server) throws IOException {
|
||||
LOG.info("Killing region server:" + server);
|
||||
cluster.killRegionServer(server);
|
||||
// NOTE(review): this listing appears to be a rendered diff. The wait using
// PolicyBasedChaosMonkey.TIMEOUT looks like the pre-change line and the wait using
// killRsTimeout its replacement — confirm against the actual file.
cluster.waitForRegionServerToStop(server, PolicyBasedChaosMonkey.TIMEOUT);
|
||||
cluster.waitForRegionServerToStop(server, killRsTimeout);
|
||||
LOG.info("Killed region server:" + server + ". Reported num of rs:"
|
||||
+ cluster.getClusterStatus().getServersSize());
|
||||
}
|
||||
|
@ -106,7 +130,7 @@ public class Action {
|
|||
/**
 * Starts a region server on the hostname of the given server and waits for it to start.
 * Afterwards logs the server count reported by cluster.getClusterStatus().
 *
 * @param server server whose hostname is used to start the region server
 * @throws IOException declared by the signature; presumably propagated from the cluster
 *         calls — verify
 */
protected void startRs(ServerName server) throws IOException {
|
||||
LOG.info("Starting region server:" + server.getHostname());
|
||||
cluster.startRegionServer(server.getHostname());
|
||||
// NOTE(review): this listing appears to be a rendered diff. The wait using
// PolicyBasedChaosMonkey.TIMEOUT looks like the pre-change line and the wait using
// startRsTimeout its replacement — confirm against the actual file.
cluster.waitForRegionServerToStart(server.getHostname(), PolicyBasedChaosMonkey.TIMEOUT);
|
||||
cluster.waitForRegionServerToStart(server.getHostname(), startRsTimeout);
|
||||
LOG.info("Started region server:" + server + ". Reported num of rs:"
|
||||
+ cluster.getClusterStatus().getServersSize());
|
||||
}
|
||||
|
|
|
@ -183,6 +183,10 @@ public class IntegrationTestMTTR {
|
|||
}
|
||||
|
||||
private static void setupActions() throws IOException {
|
||||
// Allow a little more time for RS restart actions, because RS start depends on having a master
|
||||
// to report to and the master is also being monkeyed.
|
||||
util.getConfiguration().setLong(Action.START_RS_TIMEOUT_KEY, 3 * 60 * 1000);
|
||||
|
||||
// Set up the action that will restart a region server holding a region from our table.
|
||||
// Because this table should only have one region, we should be good.
|
||||
restartRSAction = new RestartRsHoldingTableAction(sleepTime, tableName.getNameAsString());
|
||||
|
|
Loading…
Reference in New Issue