From 186373bea43925c7bd35df83ef068d9378fc2426 Mon Sep 17 00:00:00 2001 From: BukrosSzabolcs <45901354+BukrosSzabolcs@users.noreply.github.com> Date: Thu, 26 Sep 2019 10:07:38 +0200 Subject: [PATCH] HBASE-22982: region server suspend/resume (#592) * Add chaos monkey action for suspend/resume region servers * Add these to relevant chaos monkeys branch-1-backport-note: Graceful regionserver restart action wasn't backported due to a dependency of "RegionMover" script. Can be done later if needed. Signed-off-by: Balazs Meszaros Signed-off-by: Peter Somogyi --- .../hadoop/hbase/DistributedHBaseCluster.java | 32 +++-- .../hadoop/hbase/chaos/actions/Action.java | 78 +++++++++--- .../actions/RestartActionBaseAction.java | 29 +++++ .../RollingBatchSuspendResumeRsAction.java | 120 ++++++++++++++++++ .../chaos/factories/MonkeyConstants.java | 6 + ...erAndDependenciesKillingMonkeyFactory.java | 18 ++- .../factories/ServerKillingMonkeyFactory.java | 18 ++- .../SlowDeterministicMonkeyFactory.java | 37 +++++- .../StressAssignmentManagerMonkeyFactory.java | 38 +++++- .../chaos/monkies/PolicyBasedChaosMonkey.java | 19 +-- .../org/apache/hadoop/hbase/HBaseCluster.java | 14 ++ .../apache/hadoop/hbase/MiniHBaseCluster.java | 36 ++++++ 12 files changed, 401 insertions(+), 44 deletions(-) create mode 100644 hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchSuspendResumeRsAction.java diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java index b477f76d133..27751ea7124 100644 --- a/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java @@ -129,6 +129,20 @@ public class DistributedHBaseCluster extends HBaseCluster { waitForServiceToStop(ServiceType.HBASE_REGIONSERVER, serverName, timeout); } + @Override + public void suspendRegionServer(ServerName 
serverName) throws IOException { + LOG.info("Suspend RS: " + serverName.getServerName()); + clusterManager.suspend(ServiceType.HBASE_REGIONSERVER, + serverName.getHostname(), serverName.getPort()); + } + + @Override + public void resumeRegionServer(ServerName serverName) throws IOException { + LOG.info("Resume RS: " + serverName.getServerName()); + clusterManager.resume(ServiceType.HBASE_REGIONSERVER, + serverName.getHostname(), serverName.getPort()); + } + @Override public void startZkNode(String hostname, int port) throws IOException { LOG.info("Starting Zookeeper node on: " + hostname); @@ -206,7 +220,7 @@ public class DistributedHBaseCluster extends HBaseCluster { @Override public void stopNameNode(ServerName serverName) throws IOException { - LOG.info("Stopping name node on: " + serverName.getServerName()); + LOG.info(String.format("Stopping name node on: %s", serverName.getServerName())); clusterManager.stop(ServiceType.HADOOP_NAMENODE, serverName.getHostname(), serverName.getPort()); } @@ -223,7 +237,8 @@ public class DistributedHBaseCluster extends HBaseCluster { private void waitForServiceToStop(ServiceType service, ServerName serverName, long timeout) throws IOException { - LOG.info("Waiting for service: " + service + " to stop: " + serverName.getServerName()); + LOG.info( + String.format("Waiting for service: %s to stop: %s", service, serverName.getServerName())); long start = System.currentTimeMillis(); while ((System.currentTimeMillis() - start) < timeout) { @@ -237,7 +252,8 @@ public class DistributedHBaseCluster extends HBaseCluster { private void waitForServiceToStart(ServiceType service, ServerName serverName, long timeout) throws IOException { - LOG.info("Waiting for service: " + service + " to start: " + serverName.getServerName()); + LOG.info(String.format( + "Waiting for service: %s to start: %s", service, serverName.getServerName())); long start = System.currentTimeMillis(); while ((System.currentTimeMillis() - start) < timeout) { @@ -258,7
+274,7 @@ public class DistributedHBaseCluster extends HBaseCluster { @Override public void startMaster(String hostname, int port) throws IOException { - LOG.info("Starting Master on: " + hostname + ":" + port); + LOG.info(String.format("Starting Master on: %s:%s", hostname, port)); clusterManager.start(ServiceType.HBASE_MASTER, hostname, port); } @@ -437,8 +453,8 @@ public class DistributedHBaseCluster extends HBaseCluster { } } if (!deferred.isEmpty()) { - LOG.warn("Restoring cluster - restoring region servers reported " - + deferred.size() + " errors:"); + LOG.warn(String.format("Restoring cluster - restoring region servers reported %s errors:", + deferred.size())); for (int i=0; i selectedServers = selectServers(); + + Queue serversToBeSuspended = new LinkedList<>(selectedServers); + Queue suspendedServers = new LinkedList<>(); + + // loop while there are servers to be suspended or suspended servers to be resumed + while ((!serversToBeSuspended.isEmpty() || !suspendedServers.isEmpty()) && !context + .isStopping()) { + SuspendOrResume action; + + if (serversToBeSuspended.isEmpty()) { // no more servers to suspend + action = SuspendOrResume.RESUME; + } else if (suspendedServers.isEmpty()) { + action = SuspendOrResume.SUSPEND; // no more servers to resume + } else if (suspendedServers.size() >= maxSuspendedServers) { + // we have too many suspended servers. Don't suspend any more + action = SuspendOrResume.RESUME; + } else { + // do a coin toss + action = RandomUtils.nextBoolean() ? 
SuspendOrResume.SUSPEND : SuspendOrResume.RESUME; + } + + ServerName server; + switch (action) { + case SUSPEND: + server = serversToBeSuspended.remove(); + try { + suspendRs(server); + } catch (Shell.ExitCodeException e) { + LOG.warn("Problem suspending but presume successful; code=" + e.getExitCode(), e); + } + suspendedServers.add(server); + break; + case RESUME: + server = suspendedServers.remove(); + try { + resumeRs(server); + } catch (Shell.ExitCodeException e) { + LOG.info("Problem resuming, will retry; code= " + e.getExitCode(), e); + } + break; + default: + throw new IllegalArgumentException( + "Encountered unexpected action type: " + action.name()); + } + + LOG.info("Sleeping for: " + sleepTime); + Threads.sleep(sleepTime); + } + } + + protected List selectServers() throws IOException { + return PolicyBasedChaosMonkey.selectRandomItems(getCurrentServers(), ratio); + } + +} diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/MonkeyConstants.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/MonkeyConstants.java index 5657d3962e8..9051e98ff2c 100644 --- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/MonkeyConstants.java +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/MonkeyConstants.java @@ -45,6 +45,9 @@ public interface MonkeyConstants { String UNBALANCE_WAIT_AFTER_BALANCE_MS = "unbalance.action.wait.after.period"; String UNBALANCE_KILL_META_RS = "unbalance.action.kill.meta.rs"; String DECREASE_HFILE_SIZE_SLEEP_TIME = "decrease.hfile.size.sleep.time"; + String GRACEFUL_RESTART_RS_SLEEP_TIME = "graceful.restart.rs.sleep.time"; + String ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME = "rolling.batch.suspend.rs.sleep.time"; + String ROLLING_BATCH_SUSPEND_RS_RATIO = "rolling.batch.suspend.rs.ratio"; /** * A Set of prefixes which encompasses all of the configuration properties for the ChaosMonky. 
@@ -75,4 +78,7 @@ public interface MonkeyConstants { long DEFAULT_UNBALANCE_WAIT_AFTER_BALANCE_MS = 5 * 1000; boolean DEFAULT_UNBALANCE_KILL_META_RS = true; long DEFAULT_DECREASE_HFILE_SIZE_SLEEP_TIME = 30 * 1000; + long DEFAULT_GRACEFUL_RESTART_RS_SLEEP_TIME = 5000; + long DEFAULT_ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME = 30 * 1000; + float DEFAULT_ROLLING_BATCH_SUSPEND_RS_RATIO = 1.0f; } diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerAndDependenciesKillingMonkeyFactory.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerAndDependenciesKillingMonkeyFactory.java index 4faa786bcd2..1b1eef03b31 100644 --- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerAndDependenciesKillingMonkeyFactory.java +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerAndDependenciesKillingMonkeyFactory.java @@ -26,6 +26,7 @@ import org.apache.hadoop.hbase.chaos.actions.RestartRandomDataNodeAction; import org.apache.hadoop.hbase.chaos.actions.RestartRandomRsExceptMetaAction; import org.apache.hadoop.hbase.chaos.actions.RestartRandomZKNodeAction; import org.apache.hadoop.hbase.chaos.actions.RollingBatchRestartRsExceptMetaAction; +import org.apache.hadoop.hbase.chaos.actions.RollingBatchSuspendResumeRsAction; import org.apache.hadoop.hbase.chaos.monkies.ChaosMonkey; import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey; import org.apache.hadoop.hbase.chaos.policies.CompositeSequentialPolicy; @@ -38,8 +39,12 @@ import org.apache.hadoop.hbase.chaos.policies.PeriodicRandomActionPolicy; */ public class ServerAndDependenciesKillingMonkeyFactory extends MonkeyFactory { + private long rollingBatchSuspendRSSleepTime; + private float rollingBatchSuspendtRSRatio; + @Override public ChaosMonkey build() { + loadProperties(); // Destructive actions to mess things around. Cannot run batch restart. 
Action[] actions1 = new Action[]{ @@ -48,7 +53,9 @@ public class ServerAndDependenciesKillingMonkeyFactory extends MonkeyFactory { new RollingBatchRestartRsExceptMetaAction(5000, 1.0f, 2), // only allow 2 servers to be dead. new ForceBalancerAction(), new RestartRandomDataNodeAction(60000), - new RestartRandomZKNodeAction(60000) + new RestartRandomZKNodeAction(60000), + new RollingBatchSuspendResumeRsAction(rollingBatchSuspendRSSleepTime, + rollingBatchSuspendtRSRatio) }; // Action to log more info for debugging @@ -62,4 +69,13 @@ public class ServerAndDependenciesKillingMonkeyFactory extends MonkeyFactory { new PeriodicRandomActionPolicy(60 * 1000, actions1)), new PeriodicRandomActionPolicy(60 * 1000, actions2)); } + + private void loadProperties() { + rollingBatchSuspendRSSleepTime = Long.parseLong(this.properties.getProperty( + MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME, + MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME + "")); + rollingBatchSuspendtRSRatio = Float.parseFloat(this.properties.getProperty( + MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_RATIO, + MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_RATIO + "")); + } } diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerKillingMonkeyFactory.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerKillingMonkeyFactory.java index 02b59140c28..360abb89d67 100644 --- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerKillingMonkeyFactory.java +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerKillingMonkeyFactory.java @@ -24,6 +24,7 @@ import org.apache.hadoop.hbase.chaos.actions.ForceBalancerAction; import org.apache.hadoop.hbase.chaos.actions.RestartActiveMasterAction; import org.apache.hadoop.hbase.chaos.actions.RestartRandomRsExceptMetaAction; import org.apache.hadoop.hbase.chaos.actions.RollingBatchRestartRsExceptMetaAction; +import
org.apache.hadoop.hbase.chaos.actions.RollingBatchSuspendResumeRsAction; import org.apache.hadoop.hbase.chaos.monkies.ChaosMonkey; import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey; import org.apache.hadoop.hbase.chaos.policies.CompositeSequentialPolicy; @@ -36,15 +37,21 @@ import org.apache.hadoop.hbase.chaos.policies.PeriodicRandomActionPolicy; */ public class ServerKillingMonkeyFactory extends MonkeyFactory { + private long rollingBatchSuspendRSSleepTime; + private float rollingBatchSuspendtRSRatio; + @Override public ChaosMonkey build() { + loadProperties(); // Destructive actions to mess things around. Cannot run batch restart Action[] actions1 = new Action[] { new RestartRandomRsExceptMetaAction(60000), new RestartActiveMasterAction(5000), new RollingBatchRestartRsExceptMetaAction(5000, 1.0f, 2), //only allow 2 servers to be dead - new ForceBalancerAction() + new ForceBalancerAction(), + new RollingBatchSuspendResumeRsAction(rollingBatchSuspendRSSleepTime, + rollingBatchSuspendtRSRatio) }; // Action to log more info for debugging @@ -58,4 +65,13 @@ public class ServerKillingMonkeyFactory extends MonkeyFactory { new PeriodicRandomActionPolicy(60 * 1000, actions1)), new PeriodicRandomActionPolicy(60 * 1000, actions2)); } + + private void loadProperties() { + rollingBatchSuspendRSSleepTime = Long.parseLong(this.properties.getProperty( + MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME, + MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME + "")); + rollingBatchSuspendtRSRatio = Float.parseFloat(this.properties.getProperty( + MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_RATIO, + MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_RATIO + "")); + } } diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/SlowDeterministicMonkeyFactory.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/SlowDeterministicMonkeyFactory.java index 093df2aa7a5..76e087791d9 100644 ---
a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/SlowDeterministicMonkeyFactory.java +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/SlowDeterministicMonkeyFactory.java @@ -18,7 +18,32 @@ package org.apache.hadoop.hbase.chaos.factories; -import org.apache.hadoop.hbase.chaos.actions.*; +import org.apache.hadoop.hbase.chaos.actions.Action; +import org.apache.hadoop.hbase.chaos.actions.AddColumnAction; +import org.apache.hadoop.hbase.chaos.actions.BatchRestartRsAction; +import org.apache.hadoop.hbase.chaos.actions.ChangeBloomFilterAction; +import org.apache.hadoop.hbase.chaos.actions.ChangeCompressionAction; +import org.apache.hadoop.hbase.chaos.actions.ChangeEncodingAction; +import org.apache.hadoop.hbase.chaos.actions.ChangeSplitPolicyAction; +import org.apache.hadoop.hbase.chaos.actions.ChangeVersionsAction; +import org.apache.hadoop.hbase.chaos.actions.CompactRandomRegionOfTableAction; +import org.apache.hadoop.hbase.chaos.actions.CompactTableAction; +import org.apache.hadoop.hbase.chaos.actions.DecreaseMaxHFileSizeAction; +import org.apache.hadoop.hbase.chaos.actions.DumpClusterStatusAction; +import org.apache.hadoop.hbase.chaos.actions.FlushRandomRegionOfTableAction; +import org.apache.hadoop.hbase.chaos.actions.FlushTableAction; +import org.apache.hadoop.hbase.chaos.actions.MergeRandomAdjacentRegionsOfTableAction; +import org.apache.hadoop.hbase.chaos.actions.MoveRandomRegionOfTableAction; +import org.apache.hadoop.hbase.chaos.actions.MoveRegionsOfTableAction; +import org.apache.hadoop.hbase.chaos.actions.RemoveColumnAction; +import org.apache.hadoop.hbase.chaos.actions.RestartActiveMasterAction; +import org.apache.hadoop.hbase.chaos.actions.RestartRandomRsAction; +import org.apache.hadoop.hbase.chaos.actions.RestartRsHoldingMetaAction; +import org.apache.hadoop.hbase.chaos.actions.RollingBatchRestartRsAction; +import org.apache.hadoop.hbase.chaos.actions.RollingBatchSuspendResumeRsAction; +import 
org.apache.hadoop.hbase.chaos.actions.SnapshotTableAction; +import org.apache.hadoop.hbase.chaos.actions.SplitAllRegionOfTableAction; +import org.apache.hadoop.hbase.chaos.actions.SplitRandomRegionOfTableAction; import org.apache.hadoop.hbase.chaos.monkies.ChaosMonkey; import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey; import org.apache.hadoop.hbase.chaos.policies.CompositeSequentialPolicy; @@ -44,6 +69,8 @@ public class SlowDeterministicMonkeyFactory extends MonkeyFactory { private float compactTableRatio; private float compactRandomRegionRatio; private long decreaseHFileSizeSleepTime; + private long rollingBatchSuspendRSSleepTime; + private float rollingBatchSuspendtRSRatio; @Override public ChaosMonkey build() { @@ -89,6 +116,8 @@ public class SlowDeterministicMonkeyFactory extends MonkeyFactory { new RestartRsHoldingMetaAction(restartRsHoldingMetaSleepTime), new DecreaseMaxHFileSizeAction(decreaseHFileSizeSleepTime, tableName), new SplitAllRegionOfTableAction(tableName), + new RollingBatchSuspendResumeRsAction(rollingBatchSuspendRSSleepTime, + rollingBatchSuspendtRSRatio) }; // Action to log more info for debugging @@ -158,5 +187,11 @@ public class SlowDeterministicMonkeyFactory extends MonkeyFactory { decreaseHFileSizeSleepTime = Long.parseLong(this.properties.getProperty( MonkeyConstants.DECREASE_HFILE_SIZE_SLEEP_TIME, MonkeyConstants.DEFAULT_DECREASE_HFILE_SIZE_SLEEP_TIME + "")); + rollingBatchSuspendRSSleepTime = Long.parseLong(this.properties.getProperty( + MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME, + MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME + "")); + rollingBatchSuspendtRSRatio = Float.parseFloat(this.properties.getProperty( + MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_RATIO, + MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_RATIO + "")); } } diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/StressAssignmentManagerMonkeyFactory.java 
b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/StressAssignmentManagerMonkeyFactory.java index 03471ab1bd1..7f8fb2651b4 100644 --- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/StressAssignmentManagerMonkeyFactory.java +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/StressAssignmentManagerMonkeyFactory.java @@ -17,7 +17,26 @@ */ package org.apache.hadoop.hbase.chaos.factories; -import org.apache.hadoop.hbase.chaos.actions.*; +import org.apache.hadoop.hbase.chaos.actions.Action; +import org.apache.hadoop.hbase.chaos.actions.AddColumnAction; +import org.apache.hadoop.hbase.chaos.actions.BatchRestartRsAction; +import org.apache.hadoop.hbase.chaos.actions.ChangeSplitPolicyAction; +import org.apache.hadoop.hbase.chaos.actions.CompactRandomRegionOfTableAction; +import org.apache.hadoop.hbase.chaos.actions.CompactTableAction; +import org.apache.hadoop.hbase.chaos.actions.DecreaseMaxHFileSizeAction; +import org.apache.hadoop.hbase.chaos.actions.DumpClusterStatusAction; +import org.apache.hadoop.hbase.chaos.actions.FlushRandomRegionOfTableAction; +import org.apache.hadoop.hbase.chaos.actions.FlushTableAction; +import org.apache.hadoop.hbase.chaos.actions.MergeRandomAdjacentRegionsOfTableAction; +import org.apache.hadoop.hbase.chaos.actions.MoveRandomRegionOfTableAction; +import org.apache.hadoop.hbase.chaos.actions.MoveRegionsOfTableAction; +import org.apache.hadoop.hbase.chaos.actions.RemoveColumnAction; +import org.apache.hadoop.hbase.chaos.actions.RestartRandomRsAction; +import org.apache.hadoop.hbase.chaos.actions.RestartRsHoldingMetaAction; +import org.apache.hadoop.hbase.chaos.actions.RollingBatchRestartRsAction; +import org.apache.hadoop.hbase.chaos.actions.RollingBatchSuspendResumeRsAction; +import org.apache.hadoop.hbase.chaos.actions.SplitAllRegionOfTableAction; +import org.apache.hadoop.hbase.chaos.actions.SplitRandomRegionOfTableAction; import org.apache.hadoop.hbase.chaos.monkies.ChaosMonkey; import 
org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey; import org.apache.hadoop.hbase.chaos.policies.CompositeSequentialPolicy; @@ -25,9 +44,15 @@ import org.apache.hadoop.hbase.chaos.policies.DoActionsOncePolicy; import org.apache.hadoop.hbase.chaos.policies.PeriodicRandomActionPolicy; public class StressAssignmentManagerMonkeyFactory extends MonkeyFactory { + + private long rollingBatchSuspendRSSleepTime; + private float rollingBatchSuspendtRSRatio; + @Override public ChaosMonkey build() { + loadProperties(); + // Actions that could slow down region movement. // These could also get regions stuck if there are issues. Action[] actions1 = new Action[]{ @@ -55,6 +80,8 @@ public class StressAssignmentManagerMonkeyFactory extends MonkeyFactory { new SplitAllRegionOfTableAction(tableName), new DecreaseMaxHFileSizeAction(MonkeyConstants.DEFAULT_DECREASE_HFILE_SIZE_SLEEP_TIME, tableName), + new RollingBatchSuspendResumeRsAction(rollingBatchSuspendRSSleepTime, + rollingBatchSuspendtRSRatio) }; // Action to log more info for debugging @@ -70,4 +97,13 @@ public class StressAssignmentManagerMonkeyFactory extends MonkeyFactory { new PeriodicRandomActionPolicy(90 * 1000, actions3) ); } + + private void loadProperties() { + rollingBatchSuspendRSSleepTime = Long.parseLong(this.properties.getProperty( + MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME, + MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME + "")); + rollingBatchSuspendtRSRatio = Float.parseFloat(this.properties.getProperty( + MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_RATIO, + MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_RATIO + "")); + } } diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/monkies/PolicyBasedChaosMonkey.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/monkies/PolicyBasedChaosMonkey.java index 57f7c836495..277e221d0e7 100644 --- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/monkies/PolicyBasedChaosMonkey.java +++ 
b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/monkies/PolicyBasedChaosMonkey.java @@ -18,8 +18,9 @@ package org.apache.hadoop.hbase.chaos.monkies; -import java.util.ArrayList; +import java.util.Arrays; import java.util.Collection; +import java.util.Collections; import java.util.List; import org.apache.commons.lang.math.RandomUtils; @@ -90,18 +91,12 @@ public class PolicyBasedChaosMonkey extends ChaosMonkey { /** Selects and returns ceil(ratio * items.length) random items from the given array */ public static List selectRandomItems(T[] items, float ratio) { - int remaining = (int)Math.ceil(items.length * ratio); + int selectedNumber = (int)Math.ceil(items.length * ratio); - List selectedItems = new ArrayList(remaining); - - for (int i=0; i 0; i++) { - if (RandomUtils.nextFloat() < ((float)remaining/(items.length-i))) { - selectedItems.add(items[i]); - remaining--; - } - } - - return selectedItems; + List originalItems = Arrays.asList(items.clone()); + Collections.shuffle(originalItems); + int startIndex = selectedNumber == items.length ? 0 : RandomUtils.nextInt(items.length - selectedNumber); + return originalItems.subList(startIndex, startIndex + selectedNumber); } private Policy[] policies; diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java index 7de1ecf10c9..001ab2c048c 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java @@ -160,6 +160,20 @@ public abstract class HBaseCluster implements Closeable, Configurable { public abstract void waitForRegionServerToStop(ServerName serverName, long timeout) throws IOException; + /** + * Suspend the region server + * @param serverName the hostname to suspend the regionserver on + * @throws IOException if something goes wrong + */ + public abstract void suspendRegionServer(ServerName serverName) throws IOException; + + /** + * Resume the region server + * 
@param serverName the hostname to resume the regionserver on + * @throws IOException if something goes wrong + */ + public abstract void resumeRegionServer(ServerName serverName) throws IOException; + /** * Starts a new zookeeper node on the given hostname or if this is a mini/local cluster, * silently logs warning message. diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java index 7792be37af2..714e72b41ed 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java @@ -258,6 +258,16 @@ public class MiniHBaseCluster extends HBaseCluster { stopRegionServer(getRegionServerIndex(serverName)); } + @Override + public void suspendRegionServer(ServerName serverName) throws IOException { + suspendRegionServer(getRegionServerIndex(serverName)); + } + + @Override + public void resumeRegionServer(ServerName serverName) throws IOException { + resumeRegionServer(getRegionServerIndex(serverName)); + } + @Override public void waitForRegionServerToStop(ServerName serverName, long timeout) throws IOException { //ignore timeout for now @@ -423,6 +433,32 @@ public class MiniHBaseCluster extends HBaseCluster { return server; } + /** + * Suspend the specified region server + * @param serverNumber Used as index into a list. + * @return server instance that was suspended. + */ + public JVMClusterUtil.RegionServerThread suspendRegionServer(int serverNumber) { + JVMClusterUtil.RegionServerThread server = + hbaseCluster.getRegionServers().get(serverNumber); + LOG.info("Suspending " + server.toString()); + server.suspend(); + return server; + } + + /** + * Resume the specified region server + * @param serverNumber Used as index into a list. + * @return server instance that was resumed. 
+ */ + public JVMClusterUtil.RegionServerThread resumeRegionServer(int serverNumber) { + JVMClusterUtil.RegionServerThread server = + hbaseCluster.getRegionServers().get(serverNumber); + LOG.info("Resuming " + server.toString()); + server.resume(); + return server; + } + /** * Wait for the specified region server to stop. Removes this thread from list * of running threads.