HBASE-12403 IntegrationTestMTTR flaky due to aggressive RS restart timeout

Nick Dimiduk 2014-10-31 16:34:48 -07:00
parent a1f59d8e1b
commit 3c06b48181
2 changed files with 32 additions and 4 deletions


@@ -43,19 +43,43 @@ import org.apache.hadoop.hbase.util.Bytes;
  */
 public class Action {
+  public static final String KILL_MASTER_TIMEOUT_KEY =
+    "hbase.chaosmonkey.action.killmastertimeout";
+  public static final String START_MASTER_TIMEOUT_KEY =
+    "hbase.chaosmonkey.action.startmastertimeout";
+  public static final String KILL_RS_TIMEOUT_KEY = "hbase.chaosmonkey.action.killrstimeout";
+  public static final String START_RS_TIMEOUT_KEY = "hbase.chaosmonkey.action.startrstimeout";
   protected static Log LOG = LogFactory.getLog(Action.class);
+  protected static final long KILL_MASTER_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
+  protected static final long START_MASTER_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
+  protected static final long KILL_RS_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
+  protected static final long START_RS_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
   protected ActionContext context;
   protected HBaseCluster cluster;
   protected ClusterStatus initialStatus;
   protected ServerName[] initialServers;
+  protected long killMasterTimeout;
+  protected long startMasterTimeout;
+  protected long killRsTimeout;
+  protected long startRsTimeout;
   public void init(ActionContext context) throws IOException {
     this.context = context;
     cluster = context.getHBaseCluster();
     initialStatus = cluster.getInitialClusterStatus();
     Collection<ServerName> regionServers = initialStatus.getServers();
     initialServers = regionServers.toArray(new ServerName[regionServers.size()]);
+    killMasterTimeout = cluster.getConf().getLong(KILL_MASTER_TIMEOUT_KEY,
+      KILL_MASTER_TIMEOUT_DEFAULT);
+    startMasterTimeout = cluster.getConf().getLong(START_MASTER_TIMEOUT_KEY,
+      START_MASTER_TIMEOUT_DEFAULT);
+    killRsTimeout = cluster.getConf().getLong(KILL_RS_TIMEOUT_KEY, KILL_RS_TIMEOUT_DEFAULT);
+    startRsTimeout = cluster.getConf().getLong(START_RS_TIMEOUT_KEY, START_RS_TIMEOUT_DEFAULT);
   }
   public void perform() throws Exception { }
@@ -84,21 +108,21 @@ public class Action {
   protected void killMaster(ServerName server) throws IOException {
     LOG.info("Killing master:" + server);
     cluster.killMaster(server);
-    cluster.waitForMasterToStop(server, PolicyBasedChaosMonkey.TIMEOUT);
+    cluster.waitForMasterToStop(server, killMasterTimeout);
     LOG.info("Killed master server:" + server);
   }
 
   protected void startMaster(ServerName server) throws IOException {
     LOG.info("Starting master:" + server.getHostname());
     cluster.startMaster(server.getHostname());
-    cluster.waitForActiveAndReadyMaster(PolicyBasedChaosMonkey.TIMEOUT);
+    cluster.waitForActiveAndReadyMaster(startMasterTimeout);
     LOG.info("Started master: " + server);
   }
 
   protected void killRs(ServerName server) throws IOException {
     LOG.info("Killing region server:" + server);
     cluster.killRegionServer(server);
-    cluster.waitForRegionServerToStop(server, PolicyBasedChaosMonkey.TIMEOUT);
+    cluster.waitForRegionServerToStop(server, killRsTimeout);
     LOG.info("Killed region server:" + server + ". Reported num of rs:"
       + cluster.getClusterStatus().getServersSize());
   }
@@ -106,7 +130,7 @@ public class Action {
   protected void startRs(ServerName server) throws IOException {
     LOG.info("Starting region server:" + server.getHostname());
     cluster.startRegionServer(server.getHostname());
-    cluster.waitForRegionServerToStart(server.getHostname(), PolicyBasedChaosMonkey.TIMEOUT);
+    cluster.waitForRegionServerToStart(server.getHostname(), startRsTimeout);
     LOG.info("Started region server:" + server + ". Reported num of rs:"
       + cluster.getClusterStatus().getServersSize());
   }
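For orientation, a minimal sketch (not part of this patch) of how an action picks up these settings: init() loads the four timeouts from the cluster Configuration, and the kill/start helpers above then block for at most those values instead of the fixed PolicyBasedChaosMonkey.TIMEOUT. The subclass name and the choice of initialServers[0] are illustrative only, and the class is assumed to sit in the same package as Action.

import org.apache.hadoop.hbase.ServerName;

// Hypothetical action for illustration only; real actions such as
// RestartRsHoldingTableAction live alongside Action in the chaos monkey code.
public class RestartFirstRsSketchAction extends Action {
  @Override
  public void perform() throws Exception {
    ServerName server = initialServers[0]; // any region server from the initial cluster status
    killRs(server);   // waits up to killRsTimeout (hbase.chaosmonkey.action.killrstimeout)
    startRs(server);  // waits up to startRsTimeout (hbase.chaosmonkey.action.startrstimeout)
  }
}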


@@ -183,6 +183,10 @@ public class IntegrationTestMTTR {
   }
 
   private static void setupActions() throws IOException {
+    // allow a little more time for RS restart actions because RS start depends on having a master
+    // to report to and the master is also being monkeyed.
+    util.getConfiguration().setLong(Action.START_RS_TIMEOUT_KEY, 3 * 60 * 1000);
+
     // Set up the action that will restart a region server holding a region from our table
     // because this table should only have one region we should be good.
     restartRSAction = new RestartRsHoldingTableAction(sleepTime, tableName.getNameAsString());
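If a run also restarts the master (which is why the comment above notes the master is being monkeyed too), the remaining keys introduced in Action can be raised the same way before the actions are initialized. A hedged sketch, assuming the same util handle used in setupActions(); the millisecond values are examples, not recommendations from this patch.

import org.apache.hadoop.conf.Configuration;

// Sketch only: the key names come from Action above; the values are illustrative.
Configuration conf = util.getConfiguration();
conf.setLong(Action.KILL_MASTER_TIMEOUT_KEY, 2 * 60 * 1000);
conf.setLong(Action.START_MASTER_TIMEOUT_KEY, 3 * 60 * 1000);
conf.setLong(Action.KILL_RS_TIMEOUT_KEY, 2 * 60 * 1000);
conf.setLong(Action.START_RS_TIMEOUT_KEY, 3 * 60 * 1000);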