HBASE-22982: region server suspend/resume (#592)

* Add chaos monkey actions to suspend/resume region servers
* Add these actions to the relevant chaos monkeys

branch-1 backport note: the graceful regionserver restart action was not backported because it depends on the "RegionMover" script; it can be done later if needed.

Signed-off-by: Balazs Meszaros <meszibalu@apache.org>
Signed-off-by: Peter Somogyi <psomogyi@apache.org>
commit 186373bea4
parent 655658ce36
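As a usage sketch (not part of this commit): the new RollingBatchSuspendResumeRsAction can be wired into a custom monkey the same way the factories further down wire it. The factory name below is hypothetical, and the protected util field is assumed to be provided by MonkeyFactory as in the factories modified in this diff.

import org.apache.hadoop.hbase.chaos.actions.Action;
import org.apache.hadoop.hbase.chaos.actions.RollingBatchSuspendResumeRsAction;
import org.apache.hadoop.hbase.chaos.factories.MonkeyFactory;
import org.apache.hadoop.hbase.chaos.monkies.ChaosMonkey;
import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
import org.apache.hadoop.hbase.chaos.policies.PeriodicRandomActionPolicy;

// Hypothetical factory, for illustration only.
public class SuspendResumeOnlyMonkeyFactory extends MonkeyFactory {
  @Override
  public ChaosMonkey build() {
    // Roll a suspend/resume wave over every regionserver (ratio 1.0f),
    // sleeping 30s between steps; at most 2 servers suspended at once.
    Action[] actions = new Action[] {
        new RollingBatchSuspendResumeRsAction(30 * 1000, 1.0f, 2)
    };
    // Fire one random action from the array every 60s.
    return new PolicyBasedChaosMonkey(util,
        new PeriodicRandomActionPolicy(60 * 1000, actions));
  }
}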
DistributedHBaseCluster.java

@@ -129,6 +129,20 @@ public class DistributedHBaseCluster extends HBaseCluster {
     waitForServiceToStop(ServiceType.HBASE_REGIONSERVER, serverName, timeout);
   }
 
+  @Override
+  public void suspendRegionServer(ServerName serverName) throws IOException {
+    LOG.info("Suspend RS: " + serverName.getServerName());
+    clusterManager.suspend(ServiceType.HBASE_REGIONSERVER,
+      serverName.getHostname(), serverName.getPort());
+  }
+
+  @Override
+  public void resumeRegionServer(ServerName serverName) throws IOException {
+    LOG.info("Resume RS: " + serverName.getServerName());
+    clusterManager.resume(ServiceType.HBASE_REGIONSERVER,
+      serverName.getHostname(), serverName.getPort());
+  }
+
   @Override
   public void startZkNode(String hostname, int port) throws IOException {
     LOG.info("Starting Zookeeper node on: " + hostname);
@@ -206,7 +220,7 @@ public class DistributedHBaseCluster extends HBaseCluster {
 
   @Override
   public void stopNameNode(ServerName serverName) throws IOException {
-    LOG.info("Stopping name node on: " + serverName.getServerName());
+    LOG.info(String.format("Stopping name node on: %s", serverName.getServerName()));
     clusterManager.stop(ServiceType.HADOOP_NAMENODE, serverName.getHostname(),
       serverName.getPort());
   }
@@ -223,7 +237,8 @@ public class DistributedHBaseCluster extends HBaseCluster {
 
   private void waitForServiceToStop(ServiceType service, ServerName serverName, long timeout)
     throws IOException {
-    LOG.info("Waiting for service: " + service + " to stop: " + serverName.getServerName());
+    LOG.info(
+      String.format("Waiting for service: %s to stop: %s", service, serverName.getServerName()));
     long start = System.currentTimeMillis();
 
     while ((System.currentTimeMillis() - start) < timeout) {
@@ -237,7 +252,8 @@ public class DistributedHBaseCluster extends HBaseCluster {
 
   private void waitForServiceToStart(ServiceType service, ServerName serverName, long timeout)
     throws IOException {
-    LOG.info("Waiting for service: " + service + " to start: " + serverName.getServerName());
+    LOG.info(String.format(
+      "Waiting for service: %s to start: %s", service, serverName.getServerName()));
     long start = System.currentTimeMillis();
 
     while ((System.currentTimeMillis() - start) < timeout) {
@@ -258,7 +274,7 @@ public class DistributedHBaseCluster extends HBaseCluster {
 
   @Override
   public void startMaster(String hostname, int port) throws IOException {
-    LOG.info("Starting Master on: " + hostname + ":" + port);
+    LOG.info(String.format("Starting Master on: %s:%s", hostname, port));
     clusterManager.start(ServiceType.HBASE_MASTER, hostname, port);
   }
 
@@ -437,8 +453,8 @@ public class DistributedHBaseCluster extends HBaseCluster {
       }
     }
     if (!deferred.isEmpty()) {
-      LOG.warn("Restoring cluster - restoring region servers reported "
-          + deferred.size() + " errors:");
+      LOG.warn(String.format("Restoring cluster - restoring region servers reported %s errors:",
+          deferred.size()));
       for (int i=0; i<deferred.size() && i < 3; i++) {
         LOG.warn(deferred.get(i));
       }
@@ -500,8 +516,8 @@ public class DistributedHBaseCluster extends HBaseCluster {
       }
     }
     if (!deferred.isEmpty()) {
-      LOG.warn("Restoring cluster - restoring region servers reported "
-          + deferred.size() + " errors:");
+      LOG.warn(String.format("Restoring cluster - restoring region servers reported %s errors:",
          deferred.size()));
       for (int i=0; i<deferred.size() && i < 3; i++) {
         LOG.warn(deferred.get(i));
       }
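DistributedHBaseCluster only delegates to clusterManager.suspend()/resume(); the transport behind that call is not part of this diff. As a hedged illustration, a signal-based manager is the natural fit on POSIX hosts: SIGSTOP freezes the regionserver JVM without killing it (so its ZooKeeper session eventually lapses and the master declares it dead), and SIGCONT lets it pick up where it stopped. Everything in the sketch below — class name, pid-file path, SSH transport — is an assumption, not the project's actual ClusterManager.

import java.io.IOException;

// Hypothetical sketch only: assumes passwordless SSH and a known pid file.
public class SignalClusterManagerSketch {
  private void signal(String hostname, String signal) throws IOException {
    // e.g. ssh <host> 'kill -SIGSTOP $(cat /tmp/hbase-regionserver.pid)'
    new ProcessBuilder("ssh", hostname,
        "kill -" + signal + " $(cat /tmp/hbase-regionserver.pid)").inheritIO().start();
  }

  public void suspendRegionServer(String hostname) throws IOException {
    signal(hostname, "SIGSTOP"); // freeze the process; it stays in the process table
  }

  public void resumeRegionServer(String hostname) throws IOException {
    signal(hostname, "SIGCONT"); // thaw it; in-flight state is preserved
  }
}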
Action.java

@@ -32,6 +32,7 @@ import org.apache.hadoop.hbase.ClusterStatus;
 import org.apache.hadoop.hbase.HBaseCluster;
 import org.apache.hadoop.hbase.HRegionInfo;
 import org.apache.hadoop.hbase.IntegrationTestingUtility;
+import org.apache.hadoop.hbase.MiniHBaseCluster;
 import org.apache.hadoop.hbase.ServerLoad;
 import org.apache.hadoop.hbase.ServerName;
 import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
@@ -154,65 +155,93 @@ public class Action {
     LOG.info("Started master: " + server);
   }
 
+  protected void stopRs(ServerName server) throws IOException {
+    LOG.info("Stopping regionserver " + server);
+    cluster.stopRegionServer(server);
+    cluster.waitForRegionServerToStop(server, killRsTimeout);
+    LOG.info(String.format("Stopping regionserver %s. Reported num of rs: %s", server,
+      cluster.getClusterStatus().getLiveServersLoad().size()));
+  }
+
+  protected void suspendRs(ServerName server) throws IOException {
+    LOG.info(String.format("Suspending regionserver %s", server));
+    cluster.suspendRegionServer(server);
+    if (!(cluster instanceof MiniHBaseCluster)) {
+      cluster.waitForRegionServerToStop(server, killRsTimeout);
+    }
+    LOG.info(String.format("Suspending regionserver %s. Reported num of rs: %s", server,
+      cluster.getClusterStatus().getLiveServersLoad().size()));
+  }
+
+  protected void resumeRs(ServerName server) throws IOException {
+    LOG.info("Resuming regionserver " + server);
+    cluster.resumeRegionServer(server);
+    if (!(cluster instanceof MiniHBaseCluster)) {
+      cluster.waitForRegionServerToStart(server.getHostname(), server.getPort(), startRsTimeout);
+    }
+    LOG.info(String.format("Resuming regionserver %s. Reported num of rs: %s", server,
+      cluster.getClusterStatus().getLiveServersLoad().size()));
+  }
+
   protected void killRs(ServerName server) throws IOException {
-    LOG.info("Killing region server:" + server);
+    LOG.info("Killing regionserver " + server);
     cluster.killRegionServer(server);
     cluster.waitForRegionServerToStop(server, killRsTimeout);
-    LOG.info("Killed region server:" + server + ". Reported num of rs:"
-      + cluster.getClusterStatus().getServersSize());
+    LOG.info(String.format("Killed regionserver %s. Reported num of rs: %s", server,
+      cluster.getClusterStatus().getLiveServersLoad().size()));
   }
 
   protected void startRs(ServerName server) throws IOException {
-    LOG.info("Starting region server:" + server.getHostname());
+    LOG.info("Starting regionserver " + server.getAddress());
     cluster.startRegionServer(server.getHostname(), server.getPort());
     cluster.waitForRegionServerToStart(server.getHostname(), server.getPort(), startRsTimeout);
-    LOG.info("Started region server:" + server + ". Reported num of rs:"
-      + cluster.getClusterStatus().getServersSize());
+    LOG.info(String.format("Started regionserver %s. Reported num of rs: %s", server.getAddress(),
+      cluster.getClusterStatus().getLiveServersLoad().size()));
   }
 
   protected void killZKNode(ServerName server) throws IOException {
-    LOG.info("Killing zookeeper node:" + server);
+    LOG.info("Killing zookeeper node " + server);
     cluster.killZkNode(server);
     cluster.waitForZkNodeToStop(server, killZkNodeTimeout);
-    LOG.info("Killed zookeeper node:" + server + ". Reported num of rs:"
-      + cluster.getClusterStatus().getServersSize());
+    LOG.info(String.format("Killed zookeeper node %s. Reported num of rs: %s", server,
+      cluster.getClusterStatus().getLiveServersLoad().size()));
   }
 
   protected void startZKNode(ServerName server) throws IOException {
-    LOG.info("Starting zookeeper node:" + server.getHostname());
+    LOG.info("Starting zookeeper node " + server.getHostname());
     cluster.startZkNode(server.getHostname(), server.getPort());
     cluster.waitForZkNodeToStart(server, startZkNodeTimeout);
-    LOG.info("Started zookeeper node:" + server);
+    LOG.info("Started zookeeper node " + server);
   }
 
   protected void killDataNode(ServerName server) throws IOException {
-    LOG.info("Killing datanode:" + server);
+    LOG.info("Killing datanode " + server);
     cluster.killDataNode(server);
     cluster.waitForDataNodeToStop(server, killDataNodeTimeout);
-    LOG.info("Killed datanode:" + server + ". Reported num of rs:"
-      + cluster.getClusterStatus().getServersSize());
+    LOG.info(String.format("Killed datanode %s. Reported num of rs: %s", server,
+      cluster.getClusterStatus().getLiveServersLoad().size()));
   }
 
   protected void startDataNode(ServerName server) throws IOException {
-    LOG.info("Starting datanode:" + server.getHostname());
+    LOG.info("Starting datanode " + server.getHostname());
     cluster.startDataNode(server);
     cluster.waitForDataNodeToStart(server, startDataNodeTimeout);
-    LOG.info("Started datanode:" + server);
+    LOG.info("Started datanode " + server);
   }
 
   protected void killNameNode(ServerName server) throws IOException {
-    LOG.info("Killing namenode :-" + server.getHostname());
+    LOG.info("Killing namenode : " + server.getHostname());
     cluster.killNameNode(server);
     cluster.waitForNameNodeToStop(server, killNameNodeTimeout);
-    LOG.info("Killed namenode:" + server + ". Reported num of rs:"
+    LOG.info("Killed namenode: " + server + ". Reported num of rs:"
       + cluster.getClusterStatus().getServersSize());
   }
 
   protected void startNameNode(ServerName server) throws IOException {
-    LOG.info("Starting Namenode :-" + server.getHostname());
+    LOG.info("Starting Namenode : " + server.getHostname());
     cluster.startNameNode(server);
     cluster.waitForNameNodeToStart(server, startNameNodeTimeout);
-    LOG.info("Started namenode:" + server);
+    LOG.info("Started namenode: " + server);
   }
 
   protected void unbalanceRegions(ClusterStatus clusterStatus,
@@ -259,6 +288,15 @@ public class Action {
     }
   }
 
+  protected void setBalancer(boolean onOrOff, boolean synchronous) throws Exception {
+    Admin admin = this.context.getHBaseIntegrationTestingUtility().getHBaseAdmin();
+    try {
+      admin.setBalancerRunning(onOrOff, synchronous);
+    } catch (Exception e) {
+      LOG.warn("Got exception while switching balancer", e);
+    }
+  }
+
   public Configuration getConf() {
     return cluster.getConf();
   }
RestartActionBaseAction.java

@@ -49,19 +49,42 @@ public class RestartActionBaseAction extends Action {
       return;
     }
 
+    LOG.info("Killing master: " + server);
     killMaster(server);
     sleep(sleepTime);
+    LOG.info("Starting master: " + server);
     startMaster(server);
   }
 
+  /**
+   * Stop and then restart the region server instead of killing it.
+   * @param server hostname to restart the regionserver on
+   * @param sleepTime number of milliseconds between stop and restart
+   * @throws IOException if something goes wrong
+   */
+  void gracefulRestartRs(ServerName server, long sleepTime) throws IOException {
+    sleepTime = Math.max(sleepTime, 1000);
+    // Don't try the stop if we're stopping already
+    if (context.isStopping()) {
+      return;
+    }
+    LOG.info("Stopping region server: " + server);
+    stopRs(server);
+    sleep(sleepTime);
+    LOG.info("Starting region server: " + server);
+    startRs(server);
+  }
+
   void restartRs(ServerName server, long sleepTime) throws IOException {
     sleepTime = Math.max(sleepTime, 1000);
     // Don't try the kill if we're stopping
     if (context.isStopping()) {
       return;
     }
+    LOG.info("Killing region server: " + server);
     killRs(server);
     sleep(sleepTime);
+    LOG.info("Starting region server: " + server);
     startRs(server);
   }
 
@@ -71,8 +94,10 @@ public class RestartActionBaseAction extends Action {
     if (context.isStopping()) {
       return;
     }
+    LOG.info("Killing zookeeper node: " + server);
     killZKNode(server);
     sleep(sleepTime);
+    LOG.info("Starting zookeeper node: " + server);
     startZKNode(server);
   }
 
@@ -82,8 +107,10 @@ public class RestartActionBaseAction extends Action {
     if (context.isStopping()) {
       return;
     }
+    LOG.info("Killing data node: " + server);
     killDataNode(server);
     sleep(sleepTime);
+    LOG.info("Starting data node: " + server);
     startDataNode(server);
   }
 
@@ -93,8 +120,10 @@ public class RestartActionBaseAction extends Action {
     if (context.isStopping()) {
       return;
     }
+    LOG.info("Killing name node: " + server);
     killNameNode(server);
     sleep(sleepTime);
+    LOG.info("Starting name node: " + server);
     startNameNode(server);
   }
 }
RollingBatchSuspendResumeRsAction.java (new file)

@@ -0,0 +1,120 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hbase.chaos.actions;
+
+import java.io.IOException;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Queue;
+
+import org.apache.commons.lang.math.RandomUtils;
+import org.apache.hadoop.hbase.ServerName;
+import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
+import org.apache.hadoop.hbase.util.Threads;
+import org.apache.hadoop.util.Shell;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Suspend then resume a ratio of the regionservers in a rolling fashion. At each step, either
+ * suspend a server, or resume one, sleeping (sleepTime) in between steps. The parameter
+ * maxSuspendedServers limits the maximum number of servers that can be down at the same time
+ * during rolling restarts.
+ */
+public class RollingBatchSuspendResumeRsAction extends Action {
+  private static final Logger LOG =
+      LoggerFactory.getLogger(RollingBatchSuspendResumeRsAction.class);
+  private float ratio;
+  private long sleepTime;
+  private int maxSuspendedServers; // number of maximum suspended servers at any given time.
+
+  public RollingBatchSuspendResumeRsAction(long sleepTime, float ratio) {
+    this(sleepTime, ratio, 5);
+  }
+
+  public RollingBatchSuspendResumeRsAction(long sleepTime, float ratio, int maxSuspendedServers) {
+    this.ratio = ratio;
+    this.sleepTime = sleepTime;
+    this.maxSuspendedServers = maxSuspendedServers;
+  }
+
+  enum SuspendOrResume {
+    SUSPEND, RESUME
+  }
+
+  @Override
+  public void perform() throws Exception {
+    LOG.info(String.format("Performing action: Rolling batch suspending %d%% of region servers",
+        (int) (ratio * 100)));
+    List<ServerName> selectedServers = selectServers();
+
+    Queue<ServerName> serversToBeSuspended = new LinkedList<>(selectedServers);
+    Queue<ServerName> suspendedServers = new LinkedList<>();
+
+    // loop while there are servers to be suspended or suspended servers to be resumed
+    while ((!serversToBeSuspended.isEmpty() || !suspendedServers.isEmpty()) && !context
+        .isStopping()) {
+      SuspendOrResume action;
+
+      if (serversToBeSuspended.isEmpty()) { // no more servers to suspend
+        action = SuspendOrResume.RESUME;
+      } else if (suspendedServers.isEmpty()) {
+        action = SuspendOrResume.SUSPEND; // no more servers to resume
+      } else if (suspendedServers.size() >= maxSuspendedServers) {
+        // we have too many suspended servers. Don't suspend any more
+        action = SuspendOrResume.RESUME;
+      } else {
+        // do a coin toss
+        action = RandomUtils.nextBoolean() ? SuspendOrResume.SUSPEND : SuspendOrResume.RESUME;
+      }
+
+      ServerName server;
+      switch (action) {
+        case SUSPEND:
+          server = serversToBeSuspended.remove();
+          try {
+            suspendRs(server);
+          } catch (Shell.ExitCodeException e) {
+            LOG.warn("Problem suspending but presume successful; code=" + e.getExitCode(), e);
+          }
+          suspendedServers.add(server);
+          break;
+        case RESUME:
+          server = suspendedServers.remove();
+          try {
+            resumeRs(server);
+          } catch (Shell.ExitCodeException e) {
+            LOG.info("Problem resuming, will retry; code=" + e.getExitCode(), e);
+          }
+          break;
+        default:
+          throw new IllegalArgumentException(
+              "Encountered unexpected action type: " + action.name());
+      }
+
+      LOG.info("Sleeping for: " + sleepTime);
+      Threads.sleep(sleepTime);
+    }
+  }
+
+  protected List<ServerName> selectServers() throws IOException {
+    return PolicyBasedChaosMonkey.selectRandomItems(getCurrentServers(), ratio);
+  }
+
+}
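To make the scheduling loop above easier to follow, here is a standalone simulation of just the queue-and-coin-toss logic (no HBase dependencies; written for this write-up, not part of the commit). It demonstrates the invariants the action maintains: every selected server is suspended exactly once, at most maxSuspended servers are suspended at any moment, and everything is resumed by the time the loop exits.

import java.util.ArrayDeque;
import java.util.Arrays;
import java.util.Queue;
import java.util.Random;

public class SuspendResumeScheduleDemo {
  public static void main(String[] args) {
    Random rnd = new Random();
    int maxSuspended = 2;
    Queue<String> toSuspend = new ArrayDeque<>(Arrays.asList("rs1", "rs2", "rs3", "rs4", "rs5"));
    Queue<String> suspended = new ArrayDeque<>();
    while (!toSuspend.isEmpty() || !suspended.isEmpty()) {
      boolean suspend;
      if (toSuspend.isEmpty()) {
        suspend = false;                  // nothing left to suspend
      } else if (suspended.isEmpty()) {
        suspend = true;                   // nothing to resume yet
      } else if (suspended.size() >= maxSuspended) {
        suspend = false;                  // cap reached: must resume first
      } else {
        suspend = rnd.nextBoolean();      // the coin toss
      }
      if (suspend) {
        String server = toSuspend.remove();
        suspended.add(server);
        System.out.println("SUSPEND " + server);
      } else {
        System.out.println("RESUME  " + suspended.remove());
      }
    }
  }
}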
MonkeyConstants.java

@@ -45,6 +45,9 @@ public interface MonkeyConstants {
   String UNBALANCE_WAIT_AFTER_BALANCE_MS = "unbalance.action.wait.after.period";
   String UNBALANCE_KILL_META_RS = "unbalance.action.kill.meta.rs";
   String DECREASE_HFILE_SIZE_SLEEP_TIME = "decrease.hfile.size.sleep.time";
+  String GRACEFUL_RESTART_RS_SLEEP_TIME = "graceful.restart.rs.sleep.time";
+  String ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME = "rolling.batch.suspend.rs.sleep.time";
+  String ROLLING_BATCH_SUSPEND_RS_RATIO = "rolling.batch.suspend.rs.ratio";
 
   /**
    * A Set of prefixes which encompasses all of the configuration properties for the ChaosMonkey.
@@ -75,4 +78,7 @@ public interface MonkeyConstants {
   long DEFAULT_UNBALANCE_WAIT_AFTER_BALANCE_MS = 5 * 1000;
   boolean DEFAULT_UNBALANCE_KILL_META_RS = true;
   long DEFAULT_DECREASE_HFILE_SIZE_SLEEP_TIME = 30 * 1000;
+  long DEFAULT_GRACEFUL_RESTART_RS_SLEEP_TIME = 5000;
+  long DEFAULT_ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME = 30 * 1000;
+  float DEFAULT_ROLLING_BATCH_SUSPEND_RS_RATIO = 1.0f;
 }
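The new keys follow the existing pattern: each factory reads them from its properties with the DEFAULT_* value as a fallback, so they can be overridden per run. A small sketch of plausible overrides follows; the key strings are the ones added above, while the assumption that such a Properties object reaches the factories (e.g. via a monkey properties file) is the chaos monkey's existing mechanism.

import java.util.Properties;

public class MonkeyPropsExample {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("rolling.batch.suspend.rs.sleep.time", "10000"); // 10s between steps
    props.setProperty("rolling.batch.suspend.rs.ratio", "0.5");        // act on half the regionservers
    props.setProperty("graceful.restart.rs.sleep.time", "5000");       // 5s between stop and start
    props.forEach((k, v) -> System.out.println(k + "=" + v));
  }
}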
ServerAndDependenciesKillingMonkeyFactory.java

@@ -26,6 +26,7 @@ import org.apache.hadoop.hbase.chaos.actions.RestartRandomDataNodeAction;
 import org.apache.hadoop.hbase.chaos.actions.RestartRandomRsExceptMetaAction;
 import org.apache.hadoop.hbase.chaos.actions.RestartRandomZKNodeAction;
 import org.apache.hadoop.hbase.chaos.actions.RollingBatchRestartRsExceptMetaAction;
+import org.apache.hadoop.hbase.chaos.actions.RollingBatchSuspendResumeRsAction;
 import org.apache.hadoop.hbase.chaos.monkies.ChaosMonkey;
 import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
 import org.apache.hadoop.hbase.chaos.policies.CompositeSequentialPolicy;
@@ -38,8 +39,12 @@ import org.apache.hadoop.hbase.chaos.policies.PeriodicRandomActionPolicy;
  */
 public class ServerAndDependenciesKillingMonkeyFactory extends MonkeyFactory {
 
+  private long rollingBatchSuspendRSSleepTime;
+  private float rollingBatchSuspendRSRatio;
+
   @Override
   public ChaosMonkey build() {
+    loadProperties();
 
     // Destructive actions to mess things around. Cannot run batch restart.
     Action[] actions1 = new Action[]{
@@ -48,7 +53,9 @@ public class ServerAndDependenciesKillingMonkeyFactory extends MonkeyFactory {
       new RollingBatchRestartRsExceptMetaAction(5000, 1.0f, 2), // only allow 2 servers to be dead.
       new ForceBalancerAction(),
       new RestartRandomDataNodeAction(60000),
-      new RestartRandomZKNodeAction(60000)
+      new RestartRandomZKNodeAction(60000),
+      new RollingBatchSuspendResumeRsAction(rollingBatchSuspendRSSleepTime,
+        rollingBatchSuspendRSRatio)
     };
 
     // Action to log more info for debugging
@@ -62,4 +69,13 @@ public class ServerAndDependenciesKillingMonkeyFactory extends MonkeyFactory {
         new PeriodicRandomActionPolicy(60 * 1000, actions1)),
       new PeriodicRandomActionPolicy(60 * 1000, actions2));
   }
+
+  private void loadProperties() {
+    rollingBatchSuspendRSSleepTime = Long.parseLong(this.properties.getProperty(
+      MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME,
+      MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME + ""));
+    rollingBatchSuspendRSRatio = Float.parseFloat(this.properties.getProperty(
+      MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_RATIO,
+      MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_RATIO + ""));
+  }
 }
ServerKillingMonkeyFactory.java

@@ -24,6 +24,7 @@ import org.apache.hadoop.hbase.chaos.actions.ForceBalancerAction;
 import org.apache.hadoop.hbase.chaos.actions.RestartActiveMasterAction;
 import org.apache.hadoop.hbase.chaos.actions.RestartRandomRsExceptMetaAction;
 import org.apache.hadoop.hbase.chaos.actions.RollingBatchRestartRsExceptMetaAction;
+import org.apache.hadoop.hbase.chaos.actions.RollingBatchSuspendResumeRsAction;
 import org.apache.hadoop.hbase.chaos.monkies.ChaosMonkey;
 import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
 import org.apache.hadoop.hbase.chaos.policies.CompositeSequentialPolicy;
@@ -36,15 +37,21 @@ import org.apache.hadoop.hbase.chaos.policies.PeriodicRandomActionPolicy;
  */
 public class ServerKillingMonkeyFactory extends MonkeyFactory {
 
+  private long rollingBatchSuspendRSSleepTime;
+  private float rollingBatchSuspendRSRatio;
+
   @Override
   public ChaosMonkey build() {
+    loadProperties();
 
     // Destructive actions to mess things around. Cannot run batch restart
     Action[] actions1 = new Action[] {
       new RestartRandomRsExceptMetaAction(60000),
       new RestartActiveMasterAction(5000),
       new RollingBatchRestartRsExceptMetaAction(5000, 1.0f, 2), //only allow 2 servers to be dead
-      new ForceBalancerAction()
+      new ForceBalancerAction(),
+      new RollingBatchSuspendResumeRsAction(rollingBatchSuspendRSSleepTime,
+        rollingBatchSuspendRSRatio)
     };
 
     // Action to log more info for debugging
@@ -58,4 +65,13 @@ public class ServerKillingMonkeyFactory extends MonkeyFactory {
         new PeriodicRandomActionPolicy(60 * 1000, actions1)),
       new PeriodicRandomActionPolicy(60 * 1000, actions2));
   }
+
+  private void loadProperties() {
+    rollingBatchSuspendRSSleepTime = Long.parseLong(this.properties.getProperty(
+      MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME,
+      MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME + ""));
+    rollingBatchSuspendRSRatio = Float.parseFloat(this.properties.getProperty(
+      MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_RATIO,
+      MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_RATIO + ""));
+  }
 }
SlowDeterministicMonkeyFactory.java

@@ -18,7 +18,32 @@
 
 package org.apache.hadoop.hbase.chaos.factories;
 
-import org.apache.hadoop.hbase.chaos.actions.*;
+import org.apache.hadoop.hbase.chaos.actions.Action;
+import org.apache.hadoop.hbase.chaos.actions.AddColumnAction;
+import org.apache.hadoop.hbase.chaos.actions.BatchRestartRsAction;
+import org.apache.hadoop.hbase.chaos.actions.ChangeBloomFilterAction;
+import org.apache.hadoop.hbase.chaos.actions.ChangeCompressionAction;
+import org.apache.hadoop.hbase.chaos.actions.ChangeEncodingAction;
+import org.apache.hadoop.hbase.chaos.actions.ChangeSplitPolicyAction;
+import org.apache.hadoop.hbase.chaos.actions.ChangeVersionsAction;
+import org.apache.hadoop.hbase.chaos.actions.CompactRandomRegionOfTableAction;
+import org.apache.hadoop.hbase.chaos.actions.CompactTableAction;
+import org.apache.hadoop.hbase.chaos.actions.DecreaseMaxHFileSizeAction;
+import org.apache.hadoop.hbase.chaos.actions.DumpClusterStatusAction;
+import org.apache.hadoop.hbase.chaos.actions.FlushRandomRegionOfTableAction;
+import org.apache.hadoop.hbase.chaos.actions.FlushTableAction;
+import org.apache.hadoop.hbase.chaos.actions.MergeRandomAdjacentRegionsOfTableAction;
+import org.apache.hadoop.hbase.chaos.actions.MoveRandomRegionOfTableAction;
+import org.apache.hadoop.hbase.chaos.actions.MoveRegionsOfTableAction;
+import org.apache.hadoop.hbase.chaos.actions.RemoveColumnAction;
+import org.apache.hadoop.hbase.chaos.actions.RestartActiveMasterAction;
+import org.apache.hadoop.hbase.chaos.actions.RestartRandomRsAction;
+import org.apache.hadoop.hbase.chaos.actions.RestartRsHoldingMetaAction;
+import org.apache.hadoop.hbase.chaos.actions.RollingBatchRestartRsAction;
+import org.apache.hadoop.hbase.chaos.actions.RollingBatchSuspendResumeRsAction;
+import org.apache.hadoop.hbase.chaos.actions.SnapshotTableAction;
+import org.apache.hadoop.hbase.chaos.actions.SplitAllRegionOfTableAction;
+import org.apache.hadoop.hbase.chaos.actions.SplitRandomRegionOfTableAction;
 import org.apache.hadoop.hbase.chaos.monkies.ChaosMonkey;
 import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
 import org.apache.hadoop.hbase.chaos.policies.CompositeSequentialPolicy;
@@ -44,6 +69,8 @@ public class SlowDeterministicMonkeyFactory extends MonkeyFactory {
   private float compactTableRatio;
   private float compactRandomRegionRatio;
   private long decreaseHFileSizeSleepTime;
+  private long rollingBatchSuspendRSSleepTime;
+  private float rollingBatchSuspendRSRatio;
 
   @Override
   public ChaosMonkey build() {
@@ -89,6 +116,8 @@ public class SlowDeterministicMonkeyFactory extends MonkeyFactory {
       new RestartRsHoldingMetaAction(restartRsHoldingMetaSleepTime),
       new DecreaseMaxHFileSizeAction(decreaseHFileSizeSleepTime, tableName),
       new SplitAllRegionOfTableAction(tableName),
+      new RollingBatchSuspendResumeRsAction(rollingBatchSuspendRSSleepTime,
+        rollingBatchSuspendRSRatio)
     };
 
     // Action to log more info for debugging
@@ -158,5 +187,11 @@ public class SlowDeterministicMonkeyFactory extends MonkeyFactory {
     decreaseHFileSizeSleepTime = Long.parseLong(this.properties.getProperty(
       MonkeyConstants.DECREASE_HFILE_SIZE_SLEEP_TIME,
       MonkeyConstants.DEFAULT_DECREASE_HFILE_SIZE_SLEEP_TIME + ""));
+    rollingBatchSuspendRSSleepTime = Long.parseLong(this.properties.getProperty(
+      MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME,
+      MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME + ""));
+    rollingBatchSuspendRSRatio = Float.parseFloat(this.properties.getProperty(
+      MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_RATIO,
+      MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_RATIO + ""));
   }
 }
StressAssignmentManagerMonkeyFactory.java

@@ -17,7 +17,26 @@
  */
 package org.apache.hadoop.hbase.chaos.factories;
 
-import org.apache.hadoop.hbase.chaos.actions.*;
+import org.apache.hadoop.hbase.chaos.actions.Action;
+import org.apache.hadoop.hbase.chaos.actions.AddColumnAction;
+import org.apache.hadoop.hbase.chaos.actions.BatchRestartRsAction;
+import org.apache.hadoop.hbase.chaos.actions.ChangeSplitPolicyAction;
+import org.apache.hadoop.hbase.chaos.actions.CompactRandomRegionOfTableAction;
+import org.apache.hadoop.hbase.chaos.actions.CompactTableAction;
+import org.apache.hadoop.hbase.chaos.actions.DecreaseMaxHFileSizeAction;
+import org.apache.hadoop.hbase.chaos.actions.DumpClusterStatusAction;
+import org.apache.hadoop.hbase.chaos.actions.FlushRandomRegionOfTableAction;
+import org.apache.hadoop.hbase.chaos.actions.FlushTableAction;
+import org.apache.hadoop.hbase.chaos.actions.MergeRandomAdjacentRegionsOfTableAction;
+import org.apache.hadoop.hbase.chaos.actions.MoveRandomRegionOfTableAction;
+import org.apache.hadoop.hbase.chaos.actions.MoveRegionsOfTableAction;
+import org.apache.hadoop.hbase.chaos.actions.RemoveColumnAction;
+import org.apache.hadoop.hbase.chaos.actions.RestartRandomRsAction;
+import org.apache.hadoop.hbase.chaos.actions.RestartRsHoldingMetaAction;
+import org.apache.hadoop.hbase.chaos.actions.RollingBatchRestartRsAction;
+import org.apache.hadoop.hbase.chaos.actions.RollingBatchSuspendResumeRsAction;
+import org.apache.hadoop.hbase.chaos.actions.SplitAllRegionOfTableAction;
+import org.apache.hadoop.hbase.chaos.actions.SplitRandomRegionOfTableAction;
 import org.apache.hadoop.hbase.chaos.monkies.ChaosMonkey;
 import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
 import org.apache.hadoop.hbase.chaos.policies.CompositeSequentialPolicy;
@@ -25,9 +44,15 @@ import org.apache.hadoop.hbase.chaos.policies.DoActionsOncePolicy;
 import org.apache.hadoop.hbase.chaos.policies.PeriodicRandomActionPolicy;
 
 public class StressAssignmentManagerMonkeyFactory extends MonkeyFactory {
 
+  private long rollingBatchSuspendRSSleepTime;
+  private float rollingBatchSuspendRSRatio;
+
   @Override
   public ChaosMonkey build() {
+
+    loadProperties();
+
     // Actions that could slow down region movement.
     // These could also get regions stuck if there are issues.
     Action[] actions1 = new Action[]{
@@ -55,6 +80,8 @@ public class StressAssignmentManagerMonkeyFactory extends MonkeyFactory {
       new SplitAllRegionOfTableAction(tableName),
       new DecreaseMaxHFileSizeAction(MonkeyConstants.DEFAULT_DECREASE_HFILE_SIZE_SLEEP_TIME,
         tableName),
+      new RollingBatchSuspendResumeRsAction(rollingBatchSuspendRSSleepTime,
+        rollingBatchSuspendRSRatio)
     };
 
     // Action to log more info for debugging
@@ -70,4 +97,13 @@ public class StressAssignmentManagerMonkeyFactory extends MonkeyFactory {
       new PeriodicRandomActionPolicy(90 * 1000, actions3)
     );
   }
+
+  private void loadProperties() {
+    rollingBatchSuspendRSSleepTime = Long.parseLong(this.properties.getProperty(
+      MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME,
+      MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME + ""));
+    rollingBatchSuspendRSRatio = Float.parseFloat(this.properties.getProperty(
+      MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_RATIO,
+      MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_RATIO + ""));
+  }
 }
PolicyBasedChaosMonkey.java

@@ -18,8 +18,9 @@
 
 package org.apache.hadoop.hbase.chaos.monkies;
 
-import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Collection;
+import java.util.Collections;
 import java.util.List;
 
 import org.apache.commons.lang.math.RandomUtils;
@@ -90,18 +91,12 @@ public class PolicyBasedChaosMonkey extends ChaosMonkey {
 
   /** Selects and returns ceil(ratio * items.length) random items from the given array */
   public static <T> List<T> selectRandomItems(T[] items, float ratio) {
-    int remaining = (int)Math.ceil(items.length * ratio);
-
-    List<T> selectedItems = new ArrayList<T>(remaining);
-
-    for (int i=0; i<items.length && remaining > 0; i++) {
-      if (RandomUtils.nextFloat() < ((float)remaining/(items.length-i))) {
-        selectedItems.add(items[i]);
-        remaining--;
-      }
-    }
-
-    return selectedItems;
+    int selectedNumber = (int)Math.ceil(items.length * ratio);
+
+    List<T> originalItems = Arrays.asList(items);
+    Collections.shuffle(originalItems);
+
+    int startIndex = RandomUtils.nextInt(items.length - selectedNumber);
+    return originalItems.subList(startIndex, startIndex + selectedNumber);
   }
 
   private Policy[] policies;
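The rewrite above swaps the per-item probabilistic pass for shuffle-then-slice: shuffle all items, then return a random contiguous window of ceil(ratio * n) of them. Two caveats worth noting about this approach: Collections.shuffle(Arrays.asList(items)) reorders the caller's array in place, and when the ratio selects every item the bound handed to nextInt is zero, which plain java.util.Random rejects. The standalone sketch below (written for this write-up, not part of the commit) reproduces the idea and guards both points.

import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Random;

public class SelectRandomItemsDemo {
  private static final Random RND = new Random();

  // Shuffle-and-slice selection, same idea as selectRandomItems above.
  public static <T> List<T> selectRandomItems(T[] items, float ratio) {
    int selectedNumber = (int) Math.ceil(items.length * ratio);
    List<T> shuffled = Arrays.asList(items.clone()); // clone: leave the caller's array alone
    Collections.shuffle(shuffled);
    int bound = items.length - selectedNumber;
    int startIndex = bound == 0 ? 0 : RND.nextInt(bound); // guard: Random.nextInt rejects bound 0
    return shuffled.subList(startIndex, startIndex + selectedNumber);
  }

  public static void main(String[] args) {
    String[] servers = {"rs1", "rs2", "rs3", "rs4", "rs5"};
    System.out.println(selectRandomItems(servers, 0.5f)); // e.g. [rs4, rs2, rs5]
  }
}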
HBaseCluster.java

@@ -160,6 +160,20 @@ public abstract class HBaseCluster implements Closeable, Configurable {
   public abstract void waitForRegionServerToStop(ServerName serverName, long timeout)
       throws IOException;
 
+  /**
+   * Suspend the region server
+   * @param serverName the hostname to suspend the regionserver on
+   * @throws IOException if something goes wrong
+   */
+  public abstract void suspendRegionServer(ServerName serverName) throws IOException;
+
+  /**
+   * Resume the region server
+   * @param serverName the hostname to resume the regionserver on
+   * @throws IOException if something goes wrong
+   */
+  public abstract void resumeRegionServer(ServerName serverName) throws IOException;
+
   /**
    * Starts a new zookeeper node on the given hostname or if this is a mini/local cluster,
    * silently logs warning message.
MiniHBaseCluster.java

@@ -258,6 +258,16 @@ public class MiniHBaseCluster extends HBaseCluster {
     stopRegionServer(getRegionServerIndex(serverName));
   }
 
+  @Override
+  public void suspendRegionServer(ServerName serverName) throws IOException {
+    suspendRegionServer(getRegionServerIndex(serverName));
+  }
+
+  @Override
+  public void resumeRegionServer(ServerName serverName) throws IOException {
+    resumeRegionServer(getRegionServerIndex(serverName));
+  }
+
   @Override
   public void waitForRegionServerToStop(ServerName serverName, long timeout) throws IOException {
     //ignore timeout for now
@@ -423,6 +433,32 @@ public class MiniHBaseCluster extends HBaseCluster {
     return server;
   }
 
+  /**
+   * Suspend the specified region server
+   * @param serverNumber Used as index into a list.
+   * @return server instance that was suspended.
+   */
+  public JVMClusterUtil.RegionServerThread suspendRegionServer(int serverNumber) {
+    JVMClusterUtil.RegionServerThread server =
+        hbaseCluster.getRegionServers().get(serverNumber);
+    LOG.info("Suspending " + server.toString());
+    server.suspend();
+    return server;
+  }
+
+  /**
+   * Resume the specified region server
+   * @param serverNumber Used as index into a list.
+   * @return server instance that was resumed.
+   */
+  public JVMClusterUtil.RegionServerThread resumeRegionServer(int serverNumber) {
+    JVMClusterUtil.RegionServerThread server =
+        hbaseCluster.getRegionServers().get(serverNumber);
+    LOG.info("Resuming " + server.toString());
+    server.resume();
+    return server;
+  }
+
   /**
    * Wait for the specified region server to stop. Removes this thread from list
    * of running threads.