diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java index cb6069541fa..796bc1f27e0 100644 --- a/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java @@ -97,13 +97,13 @@ public class DistributedHBaseCluster extends HBaseCluster { @Override public void startRegionServer(String hostname, int port) throws IOException { - LOG.info("Starting RS on: " + hostname); + LOG.info("Starting RS on: {}", hostname); clusterManager.start(ServiceType.HBASE_REGIONSERVER, hostname, port); } @Override public void killRegionServer(ServerName serverName) throws IOException { - LOG.info("Aborting RS: " + serverName.getServerName()); + LOG.info("Aborting RS: {}", serverName.getServerName()); killedRegionServers.add(serverName); clusterManager.kill(ServiceType.HBASE_REGIONSERVER, serverName.getHostname(), serverName.getPort()); @@ -116,7 +116,7 @@ public class DistributedHBaseCluster extends HBaseCluster { @Override public void stopRegionServer(ServerName serverName) throws IOException { - LOG.info("Stopping RS: " + serverName.getServerName()); + LOG.info("Stopping RS: {}", serverName.getServerName()); clusterManager.stop(ServiceType.HBASE_REGIONSERVER, serverName.getHostname(), serverName.getPort()); } @@ -126,22 +126,36 @@ public class DistributedHBaseCluster extends HBaseCluster { waitForServiceToStop(ServiceType.HBASE_REGIONSERVER, serverName, timeout); } + @Override + public void suspendRegionServer(ServerName serverName) throws IOException { + LOG.info("Suspend RS: {}", serverName.getServerName()); + clusterManager.suspend(ServiceType.HBASE_REGIONSERVER, + serverName.getHostname(), serverName.getPort()); + } + + @Override + public void resumeRegionServer(ServerName serverName) throws IOException { + LOG.info("Resume RS: {}", serverName.getServerName()); + 
clusterManager.resume(ServiceType.HBASE_REGIONSERVER, + serverName.getHostname(), serverName.getPort()); + } + @Override public void startZkNode(String hostname, int port) throws IOException { - LOG.info("Starting ZooKeeper node on: " + hostname); + LOG.info("Starting ZooKeeper node on: {}", hostname); clusterManager.start(ServiceType.ZOOKEEPER_SERVER, hostname, port); } @Override public void killZkNode(ServerName serverName) throws IOException { - LOG.info("Aborting ZooKeeper node on: " + serverName.getServerName()); + LOG.info("Aborting ZooKeeper node on: {}", serverName.getServerName()); clusterManager.kill(ServiceType.ZOOKEEPER_SERVER, serverName.getHostname(), serverName.getPort()); } @Override public void stopZkNode(ServerName serverName) throws IOException { - LOG.info("Stopping ZooKeeper node: " + serverName.getServerName()); + LOG.info("Stopping ZooKeeper node: {}", serverName.getServerName()); clusterManager.stop(ServiceType.ZOOKEEPER_SERVER, serverName.getHostname(), serverName.getPort()); } @@ -158,21 +172,21 @@ public class DistributedHBaseCluster extends HBaseCluster { @Override public void startDataNode(ServerName serverName) throws IOException { - LOG.info("Starting data node on: " + serverName.getServerName()); + LOG.info("Starting data node on: {}", serverName.getServerName()); clusterManager.start(ServiceType.HADOOP_DATANODE, serverName.getHostname(), serverName.getPort()); } @Override public void killDataNode(ServerName serverName) throws IOException { - LOG.info("Aborting data node on: " + serverName.getServerName()); + LOG.info("Aborting data node on: {}", serverName.getServerName()); clusterManager.kill(ServiceType.HADOOP_DATANODE, serverName.getHostname(), serverName.getPort()); } @Override public void stopDataNode(ServerName serverName) throws IOException { - LOG.info("Stopping data node on: " + serverName.getServerName()); + LOG.info("Stopping data node on: {}", serverName.getServerName()); clusterManager.stop(ServiceType.HADOOP_DATANODE, 
serverName.getHostname(), serverName.getPort()); } @@ -189,21 +203,21 @@ public class DistributedHBaseCluster extends HBaseCluster { @Override public void startNameNode(ServerName serverName) throws IOException { - LOG.info("Starting name node on: " + serverName.getServerName()); + LOG.info("Starting name node on: {}", serverName.getServerName()); clusterManager.start(ServiceType.HADOOP_NAMENODE, serverName.getHostname(), serverName.getPort()); } @Override public void killNameNode(ServerName serverName) throws IOException { - LOG.info("Aborting name node on: " + serverName.getServerName()); + LOG.info("Aborting name node on: {}", serverName.getServerName()); clusterManager.kill(ServiceType.HADOOP_NAMENODE, serverName.getHostname(), serverName.getPort()); } @Override public void stopNameNode(ServerName serverName) throws IOException { - LOG.info("Stopping name node on: " + serverName.getServerName()); + LOG.info("Stopping name node on: {}", serverName.getServerName()); clusterManager.stop(ServiceType.HADOOP_NAMENODE, serverName.getHostname(), serverName.getPort()); } @@ -220,7 +234,7 @@ public class DistributedHBaseCluster extends HBaseCluster { private void waitForServiceToStop(ServiceType service, ServerName serverName, long timeout) throws IOException { - LOG.info("Waiting for service: " + service + " to stop: " + serverName.getServerName()); + LOG.info("Waiting for service: {} to stop: {}", service, serverName.getServerName()); long start = System.currentTimeMillis(); while ((System.currentTimeMillis() - start) < timeout) { @@ -234,7 +248,7 @@ public class DistributedHBaseCluster extends HBaseCluster { private void waitForServiceToStart(ServiceType service, ServerName serverName, long timeout) throws IOException { - LOG.info("Waiting for service: " + service + " to start: " + serverName.getServerName()); + LOG.info("Waiting for service: {} to start: {}", service, serverName.getServerName()); long start = System.currentTimeMillis(); while 
((System.currentTimeMillis() - start) < timeout) { @@ -248,19 +262,19 @@ public class DistributedHBaseCluster extends HBaseCluster { @Override public void startMaster(String hostname, int port) throws IOException { - LOG.info("Starting Master on: " + hostname + ":" + port); + LOG.info("Starting Master on: {}:{}", hostname, port); clusterManager.start(ServiceType.HBASE_MASTER, hostname, port); } @Override public void killMaster(ServerName serverName) throws IOException { - LOG.info("Aborting Master: " + serverName.getServerName()); + LOG.info("Aborting Master: {}", serverName.getServerName()); clusterManager.kill(ServiceType.HBASE_MASTER, serverName.getHostname(), serverName.getPort()); } @Override public void stopMaster(ServerName serverName) throws IOException { - LOG.info("Stopping Master: " + serverName.getServerName()); + LOG.info("Stopping Master: {}", serverName.getServerName()); clusterManager.stop(ServiceType.HBASE_MASTER, serverName.getHostname(), serverName.getPort()); } @@ -294,7 +308,7 @@ public class DistributedHBaseCluster extends HBaseCluster { regionLoc = locator.getRegionLocation(startKey, true); } if (regionLoc == null) { - LOG.warn("Cannot find region server holding region " + Bytes.toStringBinary(regionName)); + LOG.warn("Cannot find region server holding region {}", Bytes.toStringBinary(regionName)); return null; } return regionLoc.getServerName(); @@ -338,15 +352,15 @@ public class DistributedHBaseCluster extends HBaseCluster { //check whether current master has changed final ServerName initMaster = initial.getMasterName(); if (!ServerName.isSameAddress(initMaster, current.getMasterName())) { - LOG.info("Restoring cluster - Initial active master : " + initMaster.getAddress() + - " has changed to : " + current.getMasterName().getAddress()); + LOG.info("Restoring cluster - Initial active master : {} has changed to : {}", + initMaster.getAddress(), current.getMasterName().getAddress()); // If initial master is stopped, start it, before restoring 
the state. // It will come up as a backup master, if there is already an active master. try { if (!clusterManager.isRunning(ServiceType.HBASE_MASTER, initMaster.getHostname(), initMaster.getPort())) { - LOG.info("Restoring cluster - starting initial active master at:" - + initMaster.getAddress()); + LOG.info("Restoring cluster - starting initial active master at:{}", + initMaster.getAddress()); startMaster(initMaster.getHostname(), initMaster.getPort()); } @@ -356,11 +370,11 @@ public class DistributedHBaseCluster extends HBaseCluster { // 3. Start backup masters for (ServerName currentBackup : current.getBackupMasterNames()) { if (!ServerName.isSameAddress(currentBackup, initMaster)) { - LOG.info("Restoring cluster - stopping backup master: " + currentBackup); + LOG.info("Restoring cluster - stopping backup master: {}", currentBackup); stopMaster(currentBackup); } } - LOG.info("Restoring cluster - stopping active master: " + current.getMasterName()); + LOG.info("Restoring cluster - stopping active master: {}", current.getMasterName()); stopMaster(current.getMasterName()); waitForActiveAndReadyMaster(); // wait so that active master takes over } catch (IOException ex) { @@ -376,8 +390,8 @@ public class DistributedHBaseCluster extends HBaseCluster { if (!clusterManager.isRunning(ServiceType.HBASE_MASTER, backup.getHostname(), backup.getPort())) { - LOG.info("Restoring cluster - starting initial backup master: " - + backup.getAddress()); + LOG.info("Restoring cluster - starting initial backup master: {}", + backup.getAddress()); startMaster(backup.getHostname(), backup.getPort()); } } catch (IOException ex) { @@ -401,7 +415,7 @@ public class DistributedHBaseCluster extends HBaseCluster { for (ServerName sn:toStart) { try { if(!clusterManager.isRunning(ServiceType.HBASE_MASTER, sn.getHostname(), sn.getPort())) { - LOG.info("Restoring cluster - starting initial backup master: " + sn.getAddress()); + LOG.info("Restoring cluster - starting initial backup master: {}", 
sn.getAddress()); startMaster(sn.getHostname(), sn.getPort()); } } catch (IOException ex) { @@ -412,7 +426,7 @@ public class DistributedHBaseCluster extends HBaseCluster { for (ServerName sn:toKill) { try { if(clusterManager.isRunning(ServiceType.HBASE_MASTER, sn.getHostname(), sn.getPort())) { - LOG.info("Restoring cluster - stopping backup master: " + sn.getAddress()); + LOG.info("Restoring cluster - stopping backup master: {}", sn.getAddress()); stopMaster(sn); } } catch (IOException ex) { @@ -421,8 +435,8 @@ public class DistributedHBaseCluster extends HBaseCluster { } } if (!deferred.isEmpty()) { - LOG.warn("Restoring cluster - restoring region servers reported " - + deferred.size() + " errors:"); + LOG.warn("Restoring cluster - restoring region servers reported {} errors:", + deferred.size()); for (int i=0; i fromServers, List toServers, @@ -234,7 +263,7 @@ public class Action { // Ugh. List regions = new LinkedList<>(serverLoad.getRegionMetrics().keySet()); int victimRegionCount = (int)Math.ceil(fractionOfRegions * regions.size()); - LOG.debug("Removing " + victimRegionCount + " regions from " + sn); + LOG.debug("Removing {} regions from {}", victimRegionCount, sn); for (int i = 0; i < victimRegionCount; ++i) { int victimIx = RandomUtils.nextInt(0, regions.size()); String regionId = HRegionInfo.encodeRegionName(regions.remove(victimIx)); @@ -242,8 +271,8 @@ public class Action { } } - LOG.info("Moving " + victimRegions.size() + " regions from " + fromServers.size() - + " servers to " + toServers.size() + " different servers"); + LOG.info("Moving {} regions from {} servers to {} different servers", victimRegions.size(), + fromServers.size(), toServers.size()); Admin admin = this.context.getHBaseIntegrationTestingUtility().getAdmin(); for (byte[] victimRegion : victimRegions) { // Don't keep moving regions if we're @@ -269,6 +298,15 @@ public class Action { } } + protected void setBalancer(boolean onOrOff, boolean synchronous) throws Exception { + Admin admin 
= this.context.getHBaseIntegrationTestingUtility().getAdmin(); + try { + admin.balancerSwitch(onOrOff, synchronous); + } catch (Exception e) { + LOG.warn("Got exception while switching balance ", e); + } + } + public Configuration getConf() { return cluster.getConf(); } diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/GracefulRollingRestartRsAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/GracefulRollingRestartRsAction.java new file mode 100644 index 00000000000..82005bbbd4e --- /dev/null +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/GracefulRollingRestartRsAction.java @@ -0,0 +1,73 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hbase.chaos.actions; + +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import org.apache.commons.lang3.RandomUtils; +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.util.RegionMover; +import org.apache.hadoop.util.Shell; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Gracefully restarts every regionserver in a rolling fashion. 
At each step, it unloads, + * restarts the loads every rs server sleeping randomly (0-sleepTime) in between servers. + */ +public class GracefulRollingRestartRsAction extends RestartActionBaseAction { + private static final Logger LOG = LoggerFactory.getLogger(GracefulRollingRestartRsAction.class); + + public GracefulRollingRestartRsAction(long sleepTime) { + super(sleepTime); + } + + @Override + public void perform() throws Exception { + LOG.info("Performing action: Rolling restarting non-master region servers"); + List selectedServers = selectServers(); + + LOG.info("Disabling balancer to make unloading possible"); + setBalancer(false, true); + + for (ServerName server : selectedServers) { + String rsName = server.getAddress().toString(); + try (RegionMover rm = + new RegionMover.RegionMoverBuilder(rsName, getConf()).ack(true).build()) { + LOG.info("Unloading {}", server); + rm.unload(); + LOG.info("Restarting {}", server); + gracefulRestartRs(server, sleepTime); + LOG.info("Loading {}", server); + rm.load(); + } catch (Shell.ExitCodeException e) { + LOG.info("Problem restarting but presume successful; code={}", e.getExitCode(), e); + } + sleep(RandomUtils.nextInt(0, (int)sleepTime)); + } + LOG.info("Enabling balancer"); + setBalancer(true, true); + } + + protected List selectServers() throws IOException { + return Arrays.asList(getCurrentServers()); + } + +} diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActionBaseAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActionBaseAction.java index 6e589aeaa2d..d964d6272d5 100644 --- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActionBaseAction.java +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActionBaseAction.java @@ -45,19 +45,42 @@ public class RestartActionBaseAction extends Action { return; } + LOG.info("Killing master: {}", server); killMaster(server); sleep(sleepTime); + LOG.info("Starting 
master: {}", server); startMaster(server); } + /** + * Stop and then restart the region server instead of killing it. + * @param server hostname to restart the regionserver on + * @param sleepTime number of milliseconds between stop and restart + * @throws IOException if something goes wrong + */ + void gracefulRestartRs(ServerName server, long sleepTime) throws IOException { + sleepTime = Math.max(sleepTime, 1000); + // Don't try the stop if we're stopping already + if (context.isStopping()) { + return; + } + LOG.info("Stopping region server: {}", server); + stopRs(server); + sleep(sleepTime); + LOG.info("Starting region server: {}", server); + startRs(server); + } + void restartRs(ServerName server, long sleepTime) throws IOException { sleepTime = Math.max(sleepTime, 1000); // Don't try the kill if we're stopping if (context.isStopping()) { return; } + LOG.info("Killing region server: {}", server); killRs(server); sleep(sleepTime); + LOG.info("Starting region server: {}", server); startRs(server); } @@ -67,8 +90,10 @@ public class RestartActionBaseAction extends Action { if (context.isStopping()) { return; } + LOG.info("Killing zookeeper node: {}", server); killZKNode(server); sleep(sleepTime); + LOG.info("Starting zookeeper node: {}", server); startZKNode(server); } @@ -78,8 +103,10 @@ public class RestartActionBaseAction extends Action { if (context.isStopping()) { return; } + LOG.info("Killing data node: {}", server); killDataNode(server); sleep(sleepTime); + LOG.info("Starting data node: {}", server); startDataNode(server); } @@ -89,8 +116,10 @@ public class RestartActionBaseAction extends Action { if (context.isStopping()) { return; } + LOG.info("Killing name node: {}", server); killNameNode(server); sleep(sleepTime); + LOG.info("Starting name node: {}", server); startNameNode(server); } diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchSuspendResumeRsAction.java 
b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchSuspendResumeRsAction.java new file mode 100644 index 00000000000..d4ad3e40b5f --- /dev/null +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchSuspendResumeRsAction.java @@ -0,0 +1,117 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hbase.chaos.actions; + +import java.io.IOException; +import java.util.LinkedList; +import java.util.List; +import java.util.Queue; + +import org.apache.commons.lang3.RandomUtils; +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey; +import org.apache.hadoop.hbase.util.Threads; +import org.apache.hadoop.util.Shell; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Suspend then resume a ratio of the regionservers in a rolling fashion. At each step, either + * suspend a server, or resume one, sleeping (sleepTime) in between steps. The parameter + * maxSuspendedServers limits the maximum number of servers that can be down at the same time + * during rolling restarts. 
+ */ +public class RollingBatchSuspendResumeRsAction extends Action { + private static final Logger LOG = + LoggerFactory.getLogger(RollingBatchSuspendResumeRsAction.class); + private float ratio; + private long sleepTime; + private int maxSuspendedServers; // number of maximum suspended servers at any given time. + + public RollingBatchSuspendResumeRsAction(long sleepTime, float ratio) { + this(sleepTime, ratio, 5); + } + + public RollingBatchSuspendResumeRsAction(long sleepTime, float ratio, int maxSuspendedServers) { + this.ratio = ratio; + this.sleepTime = sleepTime; + this.maxSuspendedServers = maxSuspendedServers; + } + + enum SuspendOrResume { + SUSPEND, RESUME + } + + @Override + public void perform() throws Exception { + LOG.info(String.format("Performing action: Rolling batch restarting %d%% of region servers", + (int) (ratio * 100))); + List selectedServers = selectServers(); + + Queue serversToBeSuspended = new LinkedList<>(selectedServers); + Queue suspendedServers = new LinkedList<>(); + + // loop while there are servers to be suspended or suspended servers to be resumed + while ((!serversToBeSuspended.isEmpty() || !suspendedServers.isEmpty()) && !context + .isStopping()) { + SuspendOrResume action; + + if (serversToBeSuspended.isEmpty()) { // no more servers to suspend + action = SuspendOrResume.RESUME; + } else if (suspendedServers.isEmpty()) { + action = SuspendOrResume.SUSPEND; // no more servers to resume + } else if (suspendedServers.size() >= maxSuspendedServers) { + // we have too many suspended servers. Don't suspend any more + action = SuspendOrResume.RESUME; + } else { + // do a coin toss + action = RandomUtils.nextBoolean() ? 
SuspendOrResume.SUSPEND : SuspendOrResume.RESUME; + } + + ServerName server; + switch (action) { + case SUSPEND: + server = serversToBeSuspended.remove(); + try { + suspendRs(server); + } catch (Shell.ExitCodeException e) { + LOG.warn("Problem suspending but presume successful; code={}", e.getExitCode(), e); + } + suspendedServers.add(server); + break; + case RESUME: + server = suspendedServers.remove(); + try { + resumeRs(server); + } catch (Shell.ExitCodeException e) { + LOG.info("Problem resuming, will retry; code={}", e.getExitCode(), e); + } + break; + } + + LOG.info("Sleeping for:{}", sleepTime); + Threads.sleep(sleepTime); + } + } + + protected List selectServers() throws IOException { + return PolicyBasedChaosMonkey.selectRandomItems(getCurrentServers(), ratio); + } + +} diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/MonkeyConstants.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/MonkeyConstants.java index 5657d3962e8..9051e98ff2c 100644 --- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/MonkeyConstants.java +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/MonkeyConstants.java @@ -45,6 +45,9 @@ public interface MonkeyConstants { String UNBALANCE_WAIT_AFTER_BALANCE_MS = "unbalance.action.wait.after.period"; String UNBALANCE_KILL_META_RS = "unbalance.action.kill.meta.rs"; String DECREASE_HFILE_SIZE_SLEEP_TIME = "decrease.hfile.size.sleep.time"; + String GRACEFUL_RESTART_RS_SLEEP_TIME = "graceful.restart.rs.sleep.time"; + String ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME = "rolling.batch.suspend.rs.sleep.time"; + String ROLLING_BATCH_SUSPEND_RS_RATIO = "rolling.batch.suspend.rs.ratio"; /** * A Set of prefixes which encompasses all of the configuration properties for the ChaosMonky. 
@@ -75,4 +78,7 @@ public interface MonkeyConstants { long DEFAULT_UNBALANCE_WAIT_AFTER_BALANCE_MS = 5 * 1000; boolean DEFAULT_UNBALANCE_KILL_META_RS = true; long DEFAULT_DECREASE_HFILE_SIZE_SLEEP_TIME = 30 * 1000; + long DEFAULT_GRACEFUL_RESTART_RS_SLEEP_TIME = 5000; + long DEFAULT_ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME = 30 * 1000; + float DEFAULT_ROLLING_BATCH_SUSPEND_RS_RATIO = 1.0f; } diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerAndDependenciesKillingMonkeyFactory.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerAndDependenciesKillingMonkeyFactory.java index 4faa786bcd2..2e763adbfd7 100644 --- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerAndDependenciesKillingMonkeyFactory.java +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerAndDependenciesKillingMonkeyFactory.java @@ -21,11 +21,13 @@ package org.apache.hadoop.hbase.chaos.factories; import org.apache.hadoop.hbase.chaos.actions.Action; import org.apache.hadoop.hbase.chaos.actions.DumpClusterStatusAction; import org.apache.hadoop.hbase.chaos.actions.ForceBalancerAction; +import org.apache.hadoop.hbase.chaos.actions.GracefulRollingRestartRsAction; import org.apache.hadoop.hbase.chaos.actions.RestartActiveMasterAction; import org.apache.hadoop.hbase.chaos.actions.RestartRandomDataNodeAction; import org.apache.hadoop.hbase.chaos.actions.RestartRandomRsExceptMetaAction; import org.apache.hadoop.hbase.chaos.actions.RestartRandomZKNodeAction; import org.apache.hadoop.hbase.chaos.actions.RollingBatchRestartRsExceptMetaAction; +import org.apache.hadoop.hbase.chaos.actions.RollingBatchSuspendResumeRsAction; import org.apache.hadoop.hbase.chaos.monkies.ChaosMonkey; import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey; import org.apache.hadoop.hbase.chaos.policies.CompositeSequentialPolicy; @@ -38,8 +40,13 @@ import org.apache.hadoop.hbase.chaos.policies.PeriodicRandomActionPolicy; */ public class 
ServerAndDependenciesKillingMonkeyFactory extends MonkeyFactory { + private long gracefulRollingRestartTSSLeepTime; + private long rollingBatchSuspendRSSleepTime; + private float rollingBatchSuspendtRSRatio; + @Override public ChaosMonkey build() { + loadProperties(); // Destructive actions to mess things around. Cannot run batch restart. Action[] actions1 = new Action[]{ @@ -48,7 +55,10 @@ public class ServerAndDependenciesKillingMonkeyFactory extends MonkeyFactory { new RollingBatchRestartRsExceptMetaAction(5000, 1.0f, 2), // only allow 2 servers to be dead. new ForceBalancerAction(), new RestartRandomDataNodeAction(60000), - new RestartRandomZKNodeAction(60000) + new RestartRandomZKNodeAction(60000), + new GracefulRollingRestartRsAction(gracefulRollingRestartTSSLeepTime), + new RollingBatchSuspendResumeRsAction(rollingBatchSuspendRSSleepTime, + rollingBatchSuspendtRSRatio) }; // Action to log more info for debugging @@ -62,4 +72,16 @@ public class ServerAndDependenciesKillingMonkeyFactory extends MonkeyFactory { new PeriodicRandomActionPolicy(60 * 1000, actions1)), new PeriodicRandomActionPolicy(60 * 1000, actions2)); } + + private void loadProperties() { + gracefulRollingRestartTSSLeepTime = Long.parseLong(this.properties.getProperty( + MonkeyConstants.GRACEFUL_RESTART_RS_SLEEP_TIME, + MonkeyConstants.DEFAULT_GRACEFUL_RESTART_RS_SLEEP_TIME + "")); + rollingBatchSuspendRSSleepTime = Long.parseLong(this.properties.getProperty( + MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME, + MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME + "")); + rollingBatchSuspendtRSRatio = Float.parseFloat(this.properties.getProperty( + MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_RATIO, + MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_RATIO + "")); + } } diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerKillingMonkeyFactory.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerKillingMonkeyFactory.java index 
02b59140c28..68d11f9a640 100644 --- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerKillingMonkeyFactory.java +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerKillingMonkeyFactory.java @@ -21,9 +21,11 @@ package org.apache.hadoop.hbase.chaos.factories; import org.apache.hadoop.hbase.chaos.actions.Action; import org.apache.hadoop.hbase.chaos.actions.DumpClusterStatusAction; import org.apache.hadoop.hbase.chaos.actions.ForceBalancerAction; +import org.apache.hadoop.hbase.chaos.actions.GracefulRollingRestartRsAction; import org.apache.hadoop.hbase.chaos.actions.RestartActiveMasterAction; import org.apache.hadoop.hbase.chaos.actions.RestartRandomRsExceptMetaAction; import org.apache.hadoop.hbase.chaos.actions.RollingBatchRestartRsExceptMetaAction; +import org.apache.hadoop.hbase.chaos.actions.RollingBatchSuspendResumeRsAction; import org.apache.hadoop.hbase.chaos.monkies.ChaosMonkey; import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey; import org.apache.hadoop.hbase.chaos.policies.CompositeSequentialPolicy; @@ -36,15 +38,23 @@ import org.apache.hadoop.hbase.chaos.policies.PeriodicRandomActionPolicy; */ public class ServerKillingMonkeyFactory extends MonkeyFactory { + private long gracefulRollingRestartTSSLeepTime; + private long rollingBatchSuspendRSSleepTime; + private float rollingBatchSuspendtRSRatio; + @Override public ChaosMonkey build() { + loadProperties(); // Destructive actions to mess things around. 
Cannot run batch restart Action[] actions1 = new Action[] { new RestartRandomRsExceptMetaAction(60000), new RestartActiveMasterAction(5000), new RollingBatchRestartRsExceptMetaAction(5000, 1.0f, 2), //only allow 2 servers to be dead - new ForceBalancerAction() + new ForceBalancerAction(), + new GracefulRollingRestartRsAction(gracefulRollingRestartTSSLeepTime), + new RollingBatchSuspendResumeRsAction(rollingBatchSuspendRSSleepTime, + rollingBatchSuspendtRSRatio) }; // Action to log more info for debugging @@ -58,4 +68,16 @@ public class ServerKillingMonkeyFactory extends MonkeyFactory { new PeriodicRandomActionPolicy(60 * 1000, actions1)), new PeriodicRandomActionPolicy(60 * 1000, actions2)); } + + private void loadProperties() { + gracefulRollingRestartTSSLeepTime = Long.parseLong(this.properties.getProperty( + MonkeyConstants.GRACEFUL_RESTART_RS_SLEEP_TIME, + MonkeyConstants.DEFAULT_GRACEFUL_RESTART_RS_SLEEP_TIME + "")); + rollingBatchSuspendRSSleepTime = Long.parseLong(this.properties.getProperty( + MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME, + MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME + "")); + rollingBatchSuspendtRSRatio = Float.parseFloat(this.properties.getProperty( + MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_RATIO, + MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_RATIO + "")); + } } diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/SlowDeterministicMonkeyFactory.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/SlowDeterministicMonkeyFactory.java index 101a7d5d965..22c35b96b95 100644 --- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/SlowDeterministicMonkeyFactory.java +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/SlowDeterministicMonkeyFactory.java @@ -31,6 +31,7 @@ import org.apache.hadoop.hbase.chaos.actions.DecreaseMaxHFileSizeAction; import org.apache.hadoop.hbase.chaos.actions.DumpClusterStatusAction; import 
org.apache.hadoop.hbase.chaos.actions.FlushRandomRegionOfTableAction; import org.apache.hadoop.hbase.chaos.actions.FlushTableAction; +import org.apache.hadoop.hbase.chaos.actions.GracefulRollingRestartRsAction; import org.apache.hadoop.hbase.chaos.actions.MergeRandomAdjacentRegionsOfTableAction; import org.apache.hadoop.hbase.chaos.actions.MoveRandomRegionOfTableAction; import org.apache.hadoop.hbase.chaos.actions.MoveRegionsOfTableAction; @@ -39,6 +40,7 @@ import org.apache.hadoop.hbase.chaos.actions.RestartActiveMasterAction; import org.apache.hadoop.hbase.chaos.actions.RestartRandomRsAction; import org.apache.hadoop.hbase.chaos.actions.RestartRsHoldingMetaAction; import org.apache.hadoop.hbase.chaos.actions.RollingBatchRestartRsAction; +import org.apache.hadoop.hbase.chaos.actions.RollingBatchSuspendResumeRsAction; import org.apache.hadoop.hbase.chaos.actions.SnapshotTableAction; import org.apache.hadoop.hbase.chaos.actions.SplitAllRegionOfTableAction; import org.apache.hadoop.hbase.chaos.actions.SplitRandomRegionOfTableAction; @@ -66,6 +68,9 @@ public class SlowDeterministicMonkeyFactory extends MonkeyFactory { private float compactTableRatio; private float compactRandomRegionRatio; private long decreaseHFileSizeSleepTime; + private long gracefulRollingRestartTSSLeepTime; + private long rollingBatchSuspendRSSleepTime; + private float rollingBatchSuspendtRSRatio; @Override public ChaosMonkey build() { @@ -110,6 +115,9 @@ public class SlowDeterministicMonkeyFactory extends MonkeyFactory { new RestartRsHoldingMetaAction(restartRsHoldingMetaSleepTime), new DecreaseMaxHFileSizeAction(decreaseHFileSizeSleepTime, tableName), new SplitAllRegionOfTableAction(tableName), + new GracefulRollingRestartRsAction(gracefulRollingRestartTSSLeepTime), + new RollingBatchSuspendResumeRsAction(rollingBatchSuspendRSSleepTime, + rollingBatchSuspendtRSRatio) }; // Action to log more info for debugging @@ -179,5 +187,14 @@ public class SlowDeterministicMonkeyFactory extends 
MonkeyFactory { decreaseHFileSizeSleepTime = Long.parseLong(this.properties.getProperty( MonkeyConstants.DECREASE_HFILE_SIZE_SLEEP_TIME, MonkeyConstants.DEFAULT_DECREASE_HFILE_SIZE_SLEEP_TIME + "")); + gracefulRollingRestartTSSLeepTime = Long.parseLong(this.properties.getProperty( + MonkeyConstants.GRACEFUL_RESTART_RS_SLEEP_TIME, + MonkeyConstants.DEFAULT_GRACEFUL_RESTART_RS_SLEEP_TIME + "")); + rollingBatchSuspendRSSleepTime = Long.parseLong(this.properties.getProperty( + MonkeyConstants.ROLLING_BATCH_RESTART_RS_SLEEP_TIME, + MonkeyConstants.DEFAULT_ROLLING_BATCH_RESTART_RS_SLEEP_TIME + "")); + rollingBatchSuspendtRSRatio = Float.parseFloat(this.properties.getProperty( + MonkeyConstants.ROLLING_BATCH_RESTART_RS_RATIO, + MonkeyConstants.DEFAULT_ROLLING_BATCH_RESTART_RS_RATIO + "")); } } diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/StressAssignmentManagerMonkeyFactory.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/StressAssignmentManagerMonkeyFactory.java index 2c6bb466428..4e304fbd2a6 100644 --- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/StressAssignmentManagerMonkeyFactory.java +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/StressAssignmentManagerMonkeyFactory.java @@ -27,6 +27,7 @@ import org.apache.hadoop.hbase.chaos.actions.DecreaseMaxHFileSizeAction; import org.apache.hadoop.hbase.chaos.actions.DumpClusterStatusAction; import org.apache.hadoop.hbase.chaos.actions.FlushRandomRegionOfTableAction; import org.apache.hadoop.hbase.chaos.actions.FlushTableAction; +import org.apache.hadoop.hbase.chaos.actions.GracefulRollingRestartRsAction; import org.apache.hadoop.hbase.chaos.actions.MergeRandomAdjacentRegionsOfTableAction; import org.apache.hadoop.hbase.chaos.actions.MoveRandomRegionOfTableAction; import org.apache.hadoop.hbase.chaos.actions.MoveRegionsOfTableAction; @@ -34,6 +35,7 @@ import org.apache.hadoop.hbase.chaos.actions.RemoveColumnAction; import 
org.apache.hadoop.hbase.chaos.actions.RestartRandomRsAction; import org.apache.hadoop.hbase.chaos.actions.RestartRsHoldingMetaAction; import org.apache.hadoop.hbase.chaos.actions.RollingBatchRestartRsAction; +import org.apache.hadoop.hbase.chaos.actions.RollingBatchSuspendResumeRsAction; import org.apache.hadoop.hbase.chaos.actions.SplitAllRegionOfTableAction; import org.apache.hadoop.hbase.chaos.actions.SplitRandomRegionOfTableAction; import org.apache.hadoop.hbase.chaos.monkies.ChaosMonkey; @@ -43,8 +45,15 @@ import org.apache.hadoop.hbase.chaos.policies.DoActionsOncePolicy; import org.apache.hadoop.hbase.chaos.policies.PeriodicRandomActionPolicy; public class StressAssignmentManagerMonkeyFactory extends MonkeyFactory { + + private long gracefulRollingRestartTSSLeepTime; + private long rollingBatchSuspendRSSleepTime; + private float rollingBatchSuspendtRSRatio; + @Override public ChaosMonkey build() { + loadProperties(); + // Actions that could slow down region movement. // These could also get regions stuck if there are issues. 
Action[] actions1 = new Action[]{ @@ -72,6 +81,9 @@ public class StressAssignmentManagerMonkeyFactory extends MonkeyFactory { new SplitAllRegionOfTableAction(tableName), new DecreaseMaxHFileSizeAction(MonkeyConstants.DEFAULT_DECREASE_HFILE_SIZE_SLEEP_TIME, tableName), + new GracefulRollingRestartRsAction(gracefulRollingRestartTSSLeepTime), + new RollingBatchSuspendResumeRsAction(rollingBatchSuspendRSSleepTime, + rollingBatchSuspendtRSRatio) }; // Action to log more info for debugging @@ -87,4 +99,16 @@ public class StressAssignmentManagerMonkeyFactory extends MonkeyFactory { new PeriodicRandomActionPolicy(90 * 1000, actions3) ); } + + private void loadProperties() { + gracefulRollingRestartTSSLeepTime = Long.parseLong(this.properties.getProperty( + MonkeyConstants.GRACEFUL_RESTART_RS_SLEEP_TIME, + MonkeyConstants.DEFAULT_GRACEFUL_RESTART_RS_SLEEP_TIME + "")); + rollingBatchSuspendRSSleepTime = Long.parseLong(this.properties.getProperty( + MonkeyConstants.ROLLING_BATCH_RESTART_RS_SLEEP_TIME, + MonkeyConstants.DEFAULT_ROLLING_BATCH_RESTART_RS_SLEEP_TIME + "")); + rollingBatchSuspendtRSRatio = Float.parseFloat(this.properties.getProperty( + MonkeyConstants.ROLLING_BATCH_RESTART_RS_RATIO, + MonkeyConstants.DEFAULT_ROLLING_BATCH_RESTART_RS_RATIO + "")); + } } diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/monkies/PolicyBasedChaosMonkey.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/monkies/PolicyBasedChaosMonkey.java index 70636dd3e84..fed51491af3 100644 --- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/monkies/PolicyBasedChaosMonkey.java +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/monkies/PolicyBasedChaosMonkey.java @@ -18,8 +18,9 @@ package org.apache.hadoop.hbase.chaos.monkies; -import java.util.ArrayList; +import java.util.Arrays; import java.util.Collection; +import java.util.Collections; import java.util.List; import org.apache.commons.lang3.RandomUtils; @@ -90,18 +91,13 @@ public class PolicyBasedChaosMonkey 
extends ChaosMonkey { /** Selects and returns ceil(ratio * items.length) random items from the given array */ public static <T> List<T> selectRandomItems(T[] items, float ratio) { - int remaining = (int)Math.ceil(items.length * ratio); + int selectedNumber = (int)Math.ceil(items.length * ratio); - List<T> selectedItems = new ArrayList<>(remaining); + List<T> originalItems = Arrays.asList(items); + Collections.shuffle(originalItems); - for (int i=0; i<items.length && remaining > 0; i++) { - if (RandomUtils.nextFloat() < ((float)remaining/(items.length-i))) { - selectedItems.add(items[i]); - remaining--; - } - } - - return selectedItems; + int startIndex = RandomUtils.nextInt(0, items.length - selectedNumber); + return originalItems.subList(startIndex, startIndex + selectedNumber); } private Policy[] policies; diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java index 3a1c8945f2a..85dff357ca6 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java @@ -148,6 +148,20 @@ public abstract class HBaseCluster implements Closeable, Configurable { public abstract void waitForRegionServerToStop(ServerName serverName, long timeout) throws IOException; + /** + * Suspend the region server + * @param serverName the hostname to suspend the regionserver on + * @throws IOException if something goes wrong + */ + public abstract void suspendRegionServer(ServerName serverName) throws IOException; + + /** + * Resume the region server + * @param serverName the hostname to resume the regionserver on + * @throws IOException if something goes wrong + */ + public abstract void resumeRegionServer(ServerName serverName) throws IOException; + /** * Starts a new zookeeper node on the given hostname or if this is a mini/local cluster, * silently logs warning message. 
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java index 99dca1df66b..948ba1f2c4e 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java @@ -291,6 +291,16 @@ public class MiniHBaseCluster extends HBaseCluster { stopRegionServer(getRegionServerIndex(serverName)); } + @Override + public void suspendRegionServer(ServerName serverName) throws IOException { + suspendRegionServer(getRegionServerIndex(serverName)); + } + + @Override + public void resumeRegionServer(ServerName serverName) throws IOException { + resumeRegionServer(getRegionServerIndex(serverName)); + } + @Override public void waitForRegionServerToStop(ServerName serverName, long timeout) throws IOException { //ignore timeout for now @@ -489,6 +499,32 @@ public class MiniHBaseCluster extends HBaseCluster { return server; } + /** + * Suspend the specified region server + * @param serverNumber Used as index into a list. + * @return + */ + public JVMClusterUtil.RegionServerThread suspendRegionServer(int serverNumber) { + JVMClusterUtil.RegionServerThread server = + hbaseCluster.getRegionServers().get(serverNumber); + LOG.info("Suspending {}", server.toString()); + server.suspend(); + return server; + } + + /** + * Resume the specified region server + * @param serverNumber Used as index into a list. + * @return + */ + public JVMClusterUtil.RegionServerThread resumeRegionServer(int serverNumber) { + JVMClusterUtil.RegionServerThread server = + hbaseCluster.getRegionServers().get(serverNumber); + LOG.info("Resuming {}", server.toString()); + server.resume(); + return server; + } + /** * Wait for the specified region server to stop. Removes this thread from list * of running threads.