From 687710eb2869817952461796d04e35de29a98fdb Mon Sep 17 00:00:00 2001
From: Nick Dimiduk
Date: Fri, 31 Oct 2014 16:34:48 -0700
Subject: [PATCH] HBASE-12403 IntegrationTestMTTR flaky due to aggressive RS restart timeout

---
 .../hadoop/hbase/chaos/actions/Action.java    | 32 ++++++++++++++++---
 .../hbase/mttr/IntegrationTestMTTR.java       |  4 +++
 2 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java
index dea412f49a5..c01ce0fd30c 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java
@@ -43,19 +43,43 @@ import org.apache.hadoop.hbase.util.Bytes;
  */
 public class Action {
 
+  public static final String KILL_MASTER_TIMEOUT_KEY =
+    "hbase.chaosmonkey.action.killmastertimeout";
+  public static final String START_MASTER_TIMEOUT_KEY =
+    "hbase.chaosmonkey.action.startmastertimeout";
+  public static final String KILL_RS_TIMEOUT_KEY = "hbase.chaosmonkey.action.killrstimeout";
+  public static final String START_RS_TIMEOUT_KEY = "hbase.chaosmonkey.action.startrstimeout";
+
   protected static Log LOG = LogFactory.getLog(Action.class);
 
+  protected static final long KILL_MASTER_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
+  protected static final long START_MASTER_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
+  protected static final long KILL_RS_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
+  protected static final long START_RS_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
+
   protected ActionContext context;
   protected HBaseCluster cluster;
   protected ClusterStatus initialStatus;
   protected ServerName[] initialServers;
 
+  protected long killMasterTimeout;
+  protected long startMasterTimeout;
+  protected long killRsTimeout;
+  protected long startRsTimeout;
+
   public void init(ActionContext context) throws IOException {
     this.context = context;
     cluster = context.getHBaseCluster();
     initialStatus = cluster.getInitialClusterStatus();
     Collection<ServerName> regionServers = initialStatus.getServers();
     initialServers = regionServers.toArray(new ServerName[regionServers.size()]);
+
+    killMasterTimeout = cluster.getConf().getLong(KILL_MASTER_TIMEOUT_KEY,
+      KILL_MASTER_TIMEOUT_DEFAULT);
+    startMasterTimeout = cluster.getConf().getLong(START_MASTER_TIMEOUT_KEY,
+      START_MASTER_TIMEOUT_DEFAULT);
+    killRsTimeout = cluster.getConf().getLong(KILL_RS_TIMEOUT_KEY, KILL_RS_TIMEOUT_DEFAULT);
+    startRsTimeout = cluster.getConf().getLong(START_RS_TIMEOUT_KEY, START_RS_TIMEOUT_DEFAULT);
   }
 
   public void perform() throws Exception { }
@@ -84,21 +108,21 @@ public class Action {
   protected void killMaster(ServerName server) throws IOException {
     LOG.info("Killing master:" + server);
     cluster.killMaster(server);
-    cluster.waitForMasterToStop(server, PolicyBasedChaosMonkey.TIMEOUT);
+    cluster.waitForMasterToStop(server, killMasterTimeout);
     LOG.info("Killed master server:" + server);
   }
 
   protected void startMaster(ServerName server) throws IOException {
     LOG.info("Starting master:" + server.getHostname());
     cluster.startMaster(server.getHostname());
-    cluster.waitForActiveAndReadyMaster(PolicyBasedChaosMonkey.TIMEOUT);
+    cluster.waitForActiveAndReadyMaster(startMasterTimeout);
     LOG.info("Started master: " + server);
   }
 
   protected void killRs(ServerName server) throws IOException {
     LOG.info("Killing region server:" + server);
     cluster.killRegionServer(server);
-    cluster.waitForRegionServerToStop(server, PolicyBasedChaosMonkey.TIMEOUT);
+    cluster.waitForRegionServerToStop(server, killRsTimeout);
     LOG.info("Killed region server:" + server + ". Reported num of rs:"
         + cluster.getClusterStatus().getServersSize());
   }
@@ -106,7 +130,7 @@ public class Action {
   protected void startRs(ServerName server) throws IOException {
     LOG.info("Starting region server:" + server.getHostname());
     cluster.startRegionServer(server.getHostname());
-    cluster.waitForRegionServerToStart(server.getHostname(), PolicyBasedChaosMonkey.TIMEOUT);
+    cluster.waitForRegionServerToStart(server.getHostname(), startRsTimeout);
     LOG.info("Started region server:" + server + ". Reported num of rs:"
         + cluster.getClusterStatus().getServersSize());
   }
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/mttr/IntegrationTestMTTR.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/mttr/IntegrationTestMTTR.java
index b34eb9e3355..de5281bedf6 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/mttr/IntegrationTestMTTR.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/mttr/IntegrationTestMTTR.java
@@ -183,6 +183,10 @@ public class IntegrationTestMTTR {
   }
 
   private static void setupActions() throws IOException {
+    // allow a little more time for RS restart actions because RS start depends on having a master
+    // to report to and the master is also being monkeyed.
+    util.getConfiguration().setLong(Action.START_RS_TIMEOUT_KEY, 3 * 60 * 1000);
+
     // Set up the action that will restart a region server holding a region from our table
     // because this table should only have one region we should be good.
     restartRSAction = new RestartRsHoldingTableAction(sleepTime, tableName.getNameAsString());