HBASE-23085 Network and Data related Actions
Add monkey actions: - manipulate network packages with tc (reorder, lose,...) - add CPU load - fill the disk - corrupt or delete regionserver data files Extend HBaseClusterManager to allow sudo calls. Signed-off-by: Josh Elser <elserj@apache.org> Signed-off-by: Balazs Meszaros <meszibalu@apache.org>
This commit is contained in:
parent
f0f7fae400
commit
d2142a8ebb
|
@ -62,6 +62,15 @@ public class HBaseClusterManager extends Configured implements ClusterManager {
|
|||
"timeout 30 /usr/bin/ssh %1$s %2$s%3$s%4$s \"sudo -u %6$s %5$s\"";
|
||||
private String tunnelCmd;
|
||||
|
||||
/**
|
||||
* The command format that is used to execute the remote command with sudo. Arguments:
|
||||
* 1 SSH options, 2 user name , 3 "@" if username is set, 4 host,
|
||||
* 5 original command, 6 timeout.
|
||||
*/
|
||||
private static final String DEFAULT_TUNNEL_SUDO_CMD =
|
||||
"timeout %6$s /usr/bin/ssh %1$s %2$s%3$s%4$s \"sudo %5$s\"";
|
||||
private String tunnelSudoCmd;
|
||||
|
||||
private static final String RETRY_ATTEMPTS_KEY = "hbase.it.clustermanager.retry.attempts";
|
||||
private static final int DEFAULT_RETRY_ATTEMPTS = 5;
|
||||
|
||||
|
@ -86,6 +95,7 @@ public class HBaseClusterManager extends Configured implements ClusterManager {
|
|||
sshOptions = (sshOptions == null) ? "" : sshOptions;
|
||||
sshUserName = (sshUserName == null) ? "" : sshUserName;
|
||||
tunnelCmd = conf.get("hbase.it.clustermanager.ssh.cmd", DEFAULT_TUNNEL_CMD);
|
||||
tunnelSudoCmd = conf.get("hbase.it.clustermanager.ssh.sudo.cmd", DEFAULT_TUNNEL_SUDO_CMD);
|
||||
// Print out ssh special config if any.
|
||||
if ((sshUserName != null && sshUserName.length() > 0) ||
|
||||
(sshOptions != null && sshOptions.length() > 0)) {
|
||||
|
@ -152,10 +162,32 @@ public class HBaseClusterManager extends Configured implements ClusterManager {
|
|||
LOG.info("Executing full command [" + cmd + "]");
|
||||
return new String[] { "/usr/bin/env", "bash", "-c", cmd };
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Executes commands over SSH
|
||||
*/
|
||||
protected class RemoteSudoShell extends Shell.ShellCommandExecutor {
|
||||
private String hostname;
|
||||
|
||||
public RemoteSudoShell(String hostname, String[] execString, long timeout) {
|
||||
this(hostname, execString, null, null, timeout);
|
||||
}
|
||||
|
||||
public RemoteSudoShell(String hostname, String[] execString, File dir, Map<String, String> env,
|
||||
long timeout) {
|
||||
super(execString, dir, env, timeout);
|
||||
this.hostname = hostname;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void execute() throws IOException {
|
||||
super.execute();
|
||||
public String[] getExecString() {
|
||||
String at = sshUserName.isEmpty() ? "" : "@";
|
||||
String remoteCmd = StringUtils.join(super.getExecString(), " ");
|
||||
String cmd = String.format(tunnelSudoCmd, sshOptions, sshUserName, at, hostname, remoteCmd,
|
||||
timeOutInterval/1000f);
|
||||
LOG.info("Executing full command [" + cmd + "]");
|
||||
return new String[] { "/usr/bin/env", "bash", "-c", cmd };
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -299,7 +331,8 @@ public class HBaseClusterManager extends Configured implements ClusterManager {
|
|||
*/
|
||||
private Pair<Integer, String> exec(String hostname, ServiceType service, String... cmd)
|
||||
throws IOException {
|
||||
LOG.info("Executing remote command: " + StringUtils.join(cmd, " ") + " , hostname:" + hostname);
|
||||
LOG.info("Executing remote command: {} , hostname:{}", StringUtils.join(cmd, " "),
|
||||
hostname);
|
||||
|
||||
RemoteShell shell = new RemoteShell(hostname, getServiceUser(service), cmd);
|
||||
try {
|
||||
|
@ -312,8 +345,8 @@ public class HBaseClusterManager extends Configured implements ClusterManager {
|
|||
+ ", stdout: " + output);
|
||||
}
|
||||
|
||||
LOG.info("Executed remote command, exit code:" + shell.getExitCode()
|
||||
+ " , output:" + shell.getOutput());
|
||||
LOG.info("Executed remote command, exit code:{} , output:{}", shell.getExitCode(),
|
||||
shell.getOutput());
|
||||
|
||||
return new Pair<>(shell.getExitCode(), shell.getOutput());
|
||||
}
|
||||
|
@ -331,7 +364,52 @@ public class HBaseClusterManager extends Configured implements ClusterManager {
|
|||
retryCounter.sleepUntilNextRetry();
|
||||
} catch (InterruptedException ex) {
|
||||
// ignore
|
||||
LOG.warn("Sleep Interrupted:" + ex);
|
||||
LOG.warn("Sleep Interrupted:", ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Execute the given command on the host using SSH
|
||||
* @return pair of exit code and command output
|
||||
* @throws IOException if something goes wrong.
|
||||
*/
|
||||
public Pair<Integer, String> execSudo(String hostname, long timeout, String... cmd)
|
||||
throws IOException {
|
||||
LOG.info("Executing remote command: {} , hostname:{}", StringUtils.join(cmd, " "),
|
||||
hostname);
|
||||
|
||||
RemoteSudoShell shell = new RemoteSudoShell(hostname, cmd, timeout);
|
||||
try {
|
||||
shell.execute();
|
||||
} catch (Shell.ExitCodeException ex) {
|
||||
// capture the stdout of the process as well.
|
||||
String output = shell.getOutput();
|
||||
// add output for the ExitCodeException.
|
||||
throw new Shell.ExitCodeException(ex.getExitCode(), "stderr: " + ex.getMessage()
|
||||
+ ", stdout: " + output);
|
||||
}
|
||||
|
||||
LOG.info("Executed remote command, exit code:{} , output:{}", shell.getExitCode(),
|
||||
shell.getOutput());
|
||||
|
||||
return new Pair<>(shell.getExitCode(), shell.getOutput());
|
||||
}
|
||||
|
||||
public Pair<Integer, String> execSudoWithRetries(String hostname, long timeout, String... cmd)
|
||||
throws IOException {
|
||||
RetryCounter retryCounter = retryCounterFactory.create();
|
||||
while (true) {
|
||||
try {
|
||||
return execSudo(hostname, timeout, cmd);
|
||||
} catch (IOException e) {
|
||||
retryOrThrow(retryCounter, e, hostname, cmd);
|
||||
}
|
||||
try {
|
||||
retryCounter.sleepUntilNextRetry();
|
||||
} catch (InterruptedException ex) {
|
||||
// ignore
|
||||
LOG.warn("Sleep Interrupted:", ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,69 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.hbase.chaos.actions;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
* Action that adds high cpu load to a random regionserver for a given duration
|
||||
*/
|
||||
public class AddCPULoadAction extends SudoCommandAction {
|
||||
protected static final Logger LOG = LoggerFactory.getLogger(AddCPULoadAction.class);
|
||||
private static final String CPU_LOAD_COMMAND =
|
||||
"seq 1 %s | xargs -I{} -n 1 -P %s timeout %s dd if=/dev/urandom of=/dev/null bs=1M " +
|
||||
"iflag=fullblock";
|
||||
|
||||
private final long duration;
|
||||
private long processes;
|
||||
|
||||
/**
|
||||
* Add high load to cpu
|
||||
*
|
||||
* @param duration Duration that this thread should generate the load for in milliseconds
|
||||
* @param processes The number of parallel processes, should be equal to cpu threads for max load
|
||||
*/
|
||||
public AddCPULoadAction(long duration, long processes, long timeout) {
|
||||
super(timeout);
|
||||
this.duration = duration;
|
||||
this.processes = processes;
|
||||
}
|
||||
|
||||
protected void localPerform() throws IOException {
|
||||
LOG.info("Starting to execute AddCPULoadAction");
|
||||
ServerName server = PolicyBasedChaosMonkey.selectRandomItem(getCurrentServers());
|
||||
String hostname = server.getHostname();
|
||||
|
||||
try {
|
||||
clusterManager.execSudo(hostname, timeout, getCommand());
|
||||
} catch (IOException ex){
|
||||
//This will always happen. We use timeout to kill a continously running process
|
||||
//after the duration expires
|
||||
}
|
||||
LOG.info("Finished to execute AddCPULoadAction");
|
||||
}
|
||||
|
||||
private String getCommand(){
|
||||
return String.format(CPU_LOAD_COMMAND, processes, processes, duration/1000f);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,75 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.hbase.chaos.actions;
|
||||
|
||||
import org.apache.commons.lang3.RandomUtils;
|
||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.LocatedFileStatus;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.fs.RemoteIterator;
|
||||
import org.apache.hadoop.hbase.io.hfile.HFile;
|
||||
import org.apache.hadoop.hbase.util.CommonFSUtils;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
* Action corrupts HFiles with a certain chance.
|
||||
*/
|
||||
public class CorruptDataFilesAction extends Action {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(CorruptDataFilesAction.class);
|
||||
private float chance;
|
||||
|
||||
/**
|
||||
* Corrupts HFiles with a certain chance
|
||||
* @param chance chance to corrupt any give data file (0.5 => 50%)
|
||||
*/
|
||||
public CorruptDataFilesAction(float chance) {
|
||||
this.chance = chance * 100;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void perform() throws Exception {
|
||||
LOG.info("Start corrupting data files");
|
||||
|
||||
FileSystem fs = CommonFSUtils.getRootDirFileSystem(getConf());
|
||||
Path rootDir = CommonFSUtils.getRootDir(getConf());
|
||||
Path defaultDir = rootDir.suffix("/data/default");
|
||||
RemoteIterator<LocatedFileStatus> iterator = fs.listFiles(defaultDir, true);
|
||||
while (iterator.hasNext()){
|
||||
LocatedFileStatus status = iterator.next();
|
||||
if(!HFile.isHFileFormat(fs, status.getPath())){
|
||||
continue;
|
||||
}
|
||||
if(RandomUtils.nextFloat(0, 100) > chance){
|
||||
continue;
|
||||
}
|
||||
|
||||
FSDataOutputStream out = fs.create(status.getPath(), true);
|
||||
try {
|
||||
out.write(0);
|
||||
} finally {
|
||||
out.close();
|
||||
}
|
||||
LOG.info("Corrupting {}", status.getPath());
|
||||
}
|
||||
LOG.info("Done corrupting data files");
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,72 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.hbase.chaos.actions;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
*
|
||||
* Corrupt network packages on a random regionserver.
|
||||
*/
|
||||
public class CorruptPackagesCommandAction extends TCCommandAction {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(CorruptPackagesCommandAction.class);
|
||||
private float ratio;
|
||||
private long duration;
|
||||
|
||||
/**
|
||||
* Corrupt network packages on a random regionserver.
|
||||
*
|
||||
* @param ratio the ratio of packages corrupted
|
||||
* @param duration the time this issue persists in milliseconds
|
||||
* @param timeout the timeout for executing required commands on the region server in milliseconds
|
||||
* @param network network interface the regionserver uses for communication
|
||||
*/
|
||||
public CorruptPackagesCommandAction(float ratio, long duration, long timeout, String network) {
|
||||
super(timeout, network);
|
||||
this.ratio = ratio;
|
||||
this.duration = duration;
|
||||
}
|
||||
|
||||
protected void localPerform() throws IOException {
|
||||
LOG.info("Starting to execute CorruptPackagesCommandAction");
|
||||
ServerName server = PolicyBasedChaosMonkey.selectRandomItem(getCurrentServers());
|
||||
String hostname = server.getHostname();
|
||||
|
||||
try {
|
||||
clusterManager.execSudoWithRetries(hostname, timeout, getCommand(ADD));
|
||||
Thread.sleep(duration);
|
||||
} catch (InterruptedException e) {
|
||||
LOG.debug("Failed to run the command for the full duration", e);
|
||||
} finally {
|
||||
clusterManager.execSudoWithRetries(hostname, timeout, getCommand(DELETE));
|
||||
}
|
||||
|
||||
LOG.info("Finished to execute CorruptPackagesCommandAction");
|
||||
}
|
||||
|
||||
private String getCommand(String operation){
|
||||
return String.format("tc qdisc %s dev %s root netem corrupt %s%%", operation, network,
|
||||
ratio * 100);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,71 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.hbase.chaos.actions;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
* Action adds latency to communication on a random regionserver.
|
||||
*/
|
||||
public class DelayPackagesCommandAction extends TCCommandAction {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(DelayPackagesCommandAction.class);
|
||||
private long delay;
|
||||
private long duration;
|
||||
|
||||
/**
|
||||
* Adds latency to communication on a random region server
|
||||
*
|
||||
* @param delay the latency wil be delay +/-50% in milliseconds
|
||||
* @param duration the time this issue persists in milliseconds
|
||||
* @param timeout the timeout for executing required commands on the region server in milliseconds
|
||||
* @param network network interface the regionserver uses for communication
|
||||
*/
|
||||
public DelayPackagesCommandAction(long delay, long duration, long timeout, String network) {
|
||||
super(timeout, network);
|
||||
this.delay = delay;
|
||||
this.duration = duration;
|
||||
}
|
||||
|
||||
protected void localPerform() throws IOException {
|
||||
LOG.info("Starting to execute DelayPackagesCommandAction");
|
||||
ServerName server = PolicyBasedChaosMonkey.selectRandomItem(getCurrentServers());
|
||||
String hostname = server.getHostname();
|
||||
|
||||
try {
|
||||
clusterManager.execSudoWithRetries(hostname, timeout, getCommand(ADD));
|
||||
Thread.sleep(duration);
|
||||
} catch (InterruptedException e) {
|
||||
LOG.debug("Failed to run the command for the full duration", e);
|
||||
} finally {
|
||||
clusterManager.execSudoWithRetries(hostname, timeout, getCommand(DELETE));
|
||||
}
|
||||
|
||||
LOG.info("Finished to execute DelayPackagesCommandAction");
|
||||
}
|
||||
|
||||
private String getCommand(String operation){
|
||||
return String.format("tc qdisc %s dev %s root netem delay %sms %sms",
|
||||
operation, network, delay, delay/2);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,66 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.hbase.chaos.actions;
|
||||
|
||||
import org.apache.commons.lang3.RandomUtils;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.LocatedFileStatus;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.fs.RemoteIterator;
|
||||
import org.apache.hadoop.hbase.io.hfile.HFile;
|
||||
import org.apache.hadoop.hbase.util.CommonFSUtils;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
* Action deletes HFiles with a certain chance.
|
||||
*/
|
||||
public class DeleteDataFilesAction extends Action {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(DeleteDataFilesAction.class);
|
||||
private float chance;
|
||||
|
||||
/**
|
||||
* Delets HFiles with a certain chance
|
||||
* @param chance chance to delete any give data file (0.5 => 50%)
|
||||
*/
|
||||
public DeleteDataFilesAction(float chance) {
|
||||
this.chance = chance * 100;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void perform() throws Exception {
|
||||
LOG.info("Start deleting data files");
|
||||
FileSystem fs = CommonFSUtils.getRootDirFileSystem(getConf());
|
||||
Path rootDir = CommonFSUtils.getRootDir(getConf());
|
||||
Path defaultDir = rootDir.suffix("/data/default");
|
||||
RemoteIterator<LocatedFileStatus> iterator = fs.listFiles(defaultDir, true);
|
||||
while (iterator.hasNext()){
|
||||
LocatedFileStatus status = iterator.next();
|
||||
if(!HFile.isHFileFormat(fs, status.getPath())){
|
||||
continue;
|
||||
}
|
||||
if(RandomUtils.nextFloat(0, 100) > chance){
|
||||
continue;
|
||||
}
|
||||
fs.delete(status.getPath());
|
||||
LOG.info("Deleting {}", status.getPath());
|
||||
}
|
||||
LOG.info("Done deleting data files");
|
||||
}
|
||||
}
|
|
@ -0,0 +1,72 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.hbase.chaos.actions;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
*
|
||||
* Duplicate network packages on a random regionserver.
|
||||
*/
|
||||
public class DuplicatePackagesCommandAction extends TCCommandAction {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(DuplicatePackagesCommandAction.class);
|
||||
private float ratio;
|
||||
private long duration;
|
||||
|
||||
/**
|
||||
* Duplicate network packages on a random regionserver.
|
||||
*
|
||||
* @param ratio the ratio of packages duplicated
|
||||
* @param duration the time this issue persists in milliseconds
|
||||
* @param timeout the timeout for executing required commands on the region server in milliseconds
|
||||
* @param network network interface the regionserver uses for communication
|
||||
*/
|
||||
public DuplicatePackagesCommandAction(float ratio, long duration, long timeout, String network) {
|
||||
super(timeout, network);
|
||||
this.ratio = ratio;
|
||||
this.duration = duration;
|
||||
}
|
||||
|
||||
protected void localPerform() throws IOException {
|
||||
LOG.info("Starting to execute DuplicatePackagesCommandAction");
|
||||
ServerName server = PolicyBasedChaosMonkey.selectRandomItem(getCurrentServers());
|
||||
String hostname = server.getHostname();
|
||||
|
||||
try {
|
||||
clusterManager.execSudoWithRetries(hostname, timeout, getCommand(ADD));
|
||||
Thread.sleep(duration);
|
||||
} catch (InterruptedException e) {
|
||||
LOG.debug("Failed to run the command for the full duration", e);
|
||||
} finally {
|
||||
clusterManager.execSudoWithRetries(hostname, timeout, getCommand(DELETE));
|
||||
}
|
||||
|
||||
LOG.info("Finished to execute DuplicatePackagesCommandAction");
|
||||
}
|
||||
|
||||
private String getCommand(String operation){
|
||||
return String.format("tc qdisc %s dev %s root netem duplicate %s%%", operation, network,
|
||||
ratio * 100);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,83 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.hbase.chaos.actions;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
*
|
||||
* Fill the disk on a random regionserver.
|
||||
*/
|
||||
public class FillDiskCommandAction extends SudoCommandAction {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(FillDiskCommandAction.class);
|
||||
private long size;
|
||||
private long duration;
|
||||
private String path;
|
||||
|
||||
/**
|
||||
* Fill the disk on a random regionserver.
|
||||
* Please note that the file will be created regardless of the set duration or timeout.
|
||||
* So please use timeout and duration big enough to avoid complication caused by retries.
|
||||
*
|
||||
* @param size size of the generated file in MB or fill the disk if set to 0
|
||||
* @param duration the time this issue persists in milliseconds
|
||||
* @param path the path to the generated file
|
||||
* @param timeout the timeout for executing required commands on the region server in milliseconds
|
||||
*/
|
||||
public FillDiskCommandAction(long size, long duration, String path, long timeout) {
|
||||
super(timeout);
|
||||
this.size = size;
|
||||
this.duration = duration;
|
||||
this.path = path;
|
||||
}
|
||||
|
||||
protected void localPerform() throws IOException {
|
||||
LOG.info("Starting to execute FillDiskCommandAction");
|
||||
ServerName server = PolicyBasedChaosMonkey.selectRandomItem(getCurrentServers());
|
||||
String hostname = server.getHostname();
|
||||
|
||||
try {
|
||||
clusterManager.execSudoWithRetries(hostname, timeout, getFillCommand());
|
||||
Thread.sleep(duration);
|
||||
} catch (InterruptedException e) {
|
||||
LOG.debug("Failed to run the command for the full duration", e);
|
||||
} finally {
|
||||
clusterManager.execSudoWithRetries(hostname, timeout, getClearCommand());
|
||||
}
|
||||
|
||||
LOG.info("Finished to execute FillDiskCommandAction");
|
||||
}
|
||||
|
||||
private String getFillCommand(){
|
||||
if (size == 0){
|
||||
return String.format("dd if=/dev/urandom of=%s/garbage bs=1M iflag=fullblock", path);
|
||||
}
|
||||
return String.format("dd if=/dev/urandom of=%s/garbage bs=1M count=%s iflag=fullblock",
|
||||
path, size);
|
||||
}
|
||||
|
||||
private String getClearCommand(){
|
||||
return String.format("rm -f %s/garbage", path);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,72 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.hbase.chaos.actions;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
*
|
||||
* Lose network packages on a random regionserver.
|
||||
*/
|
||||
public class LosePackagesCommandAction extends TCCommandAction {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(LosePackagesCommandAction.class);
|
||||
private float ratio;
|
||||
private long duration;
|
||||
|
||||
/**
|
||||
* Lose network packages on a random regionserver.
|
||||
*
|
||||
* @param ratio the ratio of packages lost
|
||||
* @param duration the time this issue persists in milliseconds
|
||||
* @param timeout the timeout for executing required commands on the region server in milliseconds
|
||||
* @param network network interface the regionserver uses for communication
|
||||
*/
|
||||
public LosePackagesCommandAction(float ratio, long duration, long timeout, String network) {
|
||||
super(timeout, network);
|
||||
this.ratio = ratio;
|
||||
this.duration = duration;
|
||||
}
|
||||
|
||||
protected void localPerform() throws IOException {
|
||||
LOG.info("Starting to execute LosePackagesCommandAction");
|
||||
ServerName server = PolicyBasedChaosMonkey.selectRandomItem(getCurrentServers());
|
||||
String hostname = server.getHostname();
|
||||
|
||||
try {
|
||||
clusterManager.execSudoWithRetries(hostname, timeout, getCommand(ADD));
|
||||
Thread.sleep(duration);
|
||||
} catch (InterruptedException e) {
|
||||
LOG.debug("Failed to run the command for the full duration", e);
|
||||
} finally {
|
||||
clusterManager.execSudoWithRetries(hostname, timeout, getCommand(DELETE));
|
||||
}
|
||||
|
||||
LOG.info("Finished to execute LosePackagesCommandAction");
|
||||
}
|
||||
|
||||
private String getCommand(String operation){
|
||||
return String.format("tc qdisc %s dev %s root netem loss %s%%", operation, network,
|
||||
ratio * 100);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,76 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.hbase.chaos.actions;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
*
|
||||
* Reorder network packages on a random regionserver.
|
||||
*/
|
||||
public class ReorderPackagesCommandAction extends TCCommandAction {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(ReorderPackagesCommandAction.class);
|
||||
private float ratio;
|
||||
private long duration;
|
||||
private long delay;
|
||||
|
||||
/**
|
||||
* Reorder network packages on a random regionserver.
|
||||
*
|
||||
* @param ratio the ratio of packages reordered
|
||||
* @param duration the time this issue persists in milliseconds
|
||||
* @param delay the delay between reordered and non-reordered packages in milliseconds
|
||||
* @param timeout the timeout for executing required commands on the region server in milliseconds
|
||||
* @param network network interface the regionserver uses for communication
|
||||
*/
|
||||
public ReorderPackagesCommandAction(float ratio, long duration, long delay, long timeout,
|
||||
String network) {
|
||||
super(timeout, network);
|
||||
this.ratio = ratio;
|
||||
this.duration = duration;
|
||||
this.delay = delay;
|
||||
}
|
||||
|
||||
protected void localPerform() throws IOException {
|
||||
LOG.info("Starting to execute ReorderPackagesCommandAction");
|
||||
ServerName server = PolicyBasedChaosMonkey.selectRandomItem(getCurrentServers());
|
||||
String hostname = server.getHostname();
|
||||
|
||||
try {
|
||||
clusterManager.execSudoWithRetries(hostname, timeout, getCommand(ADD));
|
||||
Thread.sleep(duration);
|
||||
} catch (InterruptedException e) {
|
||||
LOG.debug("Failed to run the command for the full duration", e);
|
||||
} finally {
|
||||
clusterManager.execSudoWithRetries(hostname, timeout, getCommand(DELETE));
|
||||
}
|
||||
|
||||
LOG.info("Finished to execute ReorderPackagesCommandAction");
|
||||
}
|
||||
|
||||
private String getCommand(String operation){
|
||||
return String.format("tc qdisc %s dev %s root netem delay %sms reorder %s%% 50%",
|
||||
operation, network, delay, ratio * 100);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,70 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.hbase.chaos.actions;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.hadoop.hbase.DistributedHBaseCluster;
|
||||
import org.apache.hadoop.hbase.HBaseCluster;
|
||||
import org.apache.hadoop.hbase.HBaseClusterManager;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
* Base class for performing Actions based on linux commands requiring sudo privileges
|
||||
*/
|
||||
abstract public class SudoCommandAction extends Action {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(SudoCommandAction.class);
|
||||
|
||||
protected long timeout;
|
||||
protected HBaseClusterManager clusterManager;
|
||||
|
||||
public SudoCommandAction(long timeout) {
|
||||
this.timeout = timeout;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void init(ActionContext context) throws IOException {
|
||||
super.init(context);
|
||||
HBaseCluster cluster = context.getHBaseCluster();
|
||||
if(cluster != null && cluster instanceof DistributedHBaseCluster){
|
||||
Object manager = ((DistributedHBaseCluster)cluster).getClusterManager();
|
||||
if(manager != null && manager instanceof HBaseClusterManager){
|
||||
clusterManager = (HBaseClusterManager) manager;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void perform() throws Exception {
|
||||
if(clusterManager == null){
|
||||
LOG.info("Couldn't perform command action, it requires a distributed cluster.");
|
||||
return;
|
||||
}
|
||||
|
||||
// Don't try the modify if we're stopping
|
||||
if (context.isStopping()) {
|
||||
return;
|
||||
}
|
||||
|
||||
localPerform();
|
||||
}
|
||||
|
||||
abstract protected void localPerform() throws IOException;
|
||||
}
|
|
@ -0,0 +1,33 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.hbase.chaos.actions;
|
||||
|
||||
/**
|
||||
* Base class for tc command actions
|
||||
*/
|
||||
abstract public class TCCommandAction extends SudoCommandAction {
|
||||
protected static final String ADD = "add";
|
||||
protected static final String DELETE = "del";
|
||||
protected String network;
|
||||
|
||||
public TCCommandAction(long timeout, String network) {
|
||||
super(timeout);
|
||||
this.network = network;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,72 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.hbase.chaos.factories;
|
||||
|
||||
import org.apache.hadoop.hbase.chaos.actions.Action;
|
||||
import org.apache.hadoop.hbase.chaos.actions.CorruptDataFilesAction;
|
||||
import org.apache.hadoop.hbase.chaos.actions.DeleteDataFilesAction;
|
||||
import org.apache.hadoop.hbase.chaos.actions.DumpClusterStatusAction;
|
||||
import org.apache.hadoop.hbase.chaos.monkies.ChaosMonkey;
|
||||
import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
|
||||
import org.apache.hadoop.hbase.chaos.policies.PeriodicRandomActionPolicy;
|
||||
|
||||
/**
|
||||
* A chaos monkey to delete and corrupt regionserver data, requires a user with
|
||||
* passwordless ssh access to the cluster and sudo privileges.
|
||||
* Highly destructive
|
||||
*/
|
||||
public class DataIssuesMonkeyFactory extends MonkeyFactory {
|
||||
|
||||
private long action1Period;
|
||||
private long action2Period;
|
||||
|
||||
private float chanceToAct;
|
||||
|
||||
@Override
|
||||
public ChaosMonkey build() {
|
||||
loadProperties();
|
||||
|
||||
// Highly destructive actions to mess things around.
|
||||
Action[] actions1 = new Action[] {
|
||||
new DeleteDataFilesAction(chanceToAct),
|
||||
new CorruptDataFilesAction(chanceToAct)
|
||||
};
|
||||
|
||||
// Action to log more info for debugging
|
||||
Action[] actions2 = new Action[] {
|
||||
new DumpClusterStatusAction()
|
||||
};
|
||||
|
||||
return new PolicyBasedChaosMonkey(util,
|
||||
new PeriodicRandomActionPolicy(action1Period, actions1),
|
||||
new PeriodicRandomActionPolicy(action2Period, actions2));
|
||||
}
|
||||
|
||||
private void loadProperties() {
|
||||
action1Period = Long.parseLong(this.properties.getProperty(
|
||||
MonkeyConstants.PERIODIC_ACTION1_PERIOD,
|
||||
MonkeyConstants.DEFAULT_PERIODIC_ACTION1_PERIOD + ""));
|
||||
action2Period = Long.parseLong(this.properties.getProperty(
|
||||
MonkeyConstants.PERIODIC_ACTION2_PERIOD,
|
||||
MonkeyConstants.DEFAULT_PERIODIC_ACTION2_PERIOD + ""));
|
||||
chanceToAct = Float.parseFloat(this.properties.getProperty(
|
||||
MonkeyConstants.DATA_ISSUE_CHANCE,
|
||||
MonkeyConstants.DEFAULT_DATA_ISSUE_CHANCE+ ""));
|
||||
}
|
||||
}
|
|
@ -0,0 +1,121 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.hbase.chaos.factories;
|
||||
|
||||
import org.apache.hadoop.hbase.chaos.actions.Action;
|
||||
import org.apache.hadoop.hbase.chaos.actions.AddCPULoadAction;
|
||||
import org.apache.hadoop.hbase.chaos.actions.CorruptPackagesCommandAction;
|
||||
import org.apache.hadoop.hbase.chaos.actions.DelayPackagesCommandAction;
|
||||
import org.apache.hadoop.hbase.chaos.actions.DumpClusterStatusAction;
|
||||
import org.apache.hadoop.hbase.chaos.actions.DuplicatePackagesCommandAction;
|
||||
import org.apache.hadoop.hbase.chaos.actions.FillDiskCommandAction;
|
||||
import org.apache.hadoop.hbase.chaos.actions.LosePackagesCommandAction;
|
||||
import org.apache.hadoop.hbase.chaos.actions.ReorderPackagesCommandAction;
|
||||
import org.apache.hadoop.hbase.chaos.monkies.ChaosMonkey;
|
||||
import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
|
||||
import org.apache.hadoop.hbase.chaos.policies.PeriodicRandomActionPolicy;
|
||||
|
||||
/**
|
||||
* A chaos monkey to create distributed cluster related issues, requires a user with
|
||||
* passwordless ssh access to the cluster and sudo privileges.
|
||||
*/
|
||||
public class DistributedIssuesMonkeyFactory extends MonkeyFactory {
|
||||
|
||||
private long action1Period;
|
||||
private long action2Period;
|
||||
|
||||
private long cpuLoadDuration;
|
||||
private long cpuLoadProcesses;
|
||||
private long networkIssueTimeout;
|
||||
private long networkIssueDuration;
|
||||
private float networkIssueRation;
|
||||
private long networkIssueDelay;
|
||||
private String networkIssueInterface;
|
||||
private long fillDiskTimeout;
|
||||
private String fillDiskPath;
|
||||
private long fillDiskFileSize;
|
||||
private long fillDiskIssueduration;
|
||||
|
||||
@Override public ChaosMonkey build() {
|
||||
loadProperties();
|
||||
|
||||
Action[] actions1 = new Action[] {
|
||||
new AddCPULoadAction(cpuLoadDuration, cpuLoadProcesses, networkIssueTimeout),
|
||||
new CorruptPackagesCommandAction(networkIssueRation, networkIssueDuration,
|
||||
networkIssueTimeout, networkIssueInterface),
|
||||
new DuplicatePackagesCommandAction(networkIssueRation, networkIssueDuration,
|
||||
networkIssueTimeout, networkIssueInterface),
|
||||
new LosePackagesCommandAction(networkIssueRation, networkIssueDuration,
|
||||
networkIssueTimeout, networkIssueInterface),
|
||||
new DelayPackagesCommandAction(networkIssueDelay, networkIssueDuration,
|
||||
networkIssueTimeout, networkIssueInterface),
|
||||
new ReorderPackagesCommandAction(networkIssueRation, networkIssueDuration,
|
||||
networkIssueDelay, networkIssueTimeout, networkIssueInterface),
|
||||
new FillDiskCommandAction(fillDiskFileSize, fillDiskIssueduration, fillDiskPath,
|
||||
fillDiskTimeout)};
|
||||
|
||||
// Action to log more info for debugging
|
||||
Action[] actions2 = new Action[] {new DumpClusterStatusAction()};
|
||||
|
||||
return new PolicyBasedChaosMonkey(util, new PeriodicRandomActionPolicy(action1Period, actions1),
|
||||
new PeriodicRandomActionPolicy(action2Period, actions2));
|
||||
}
|
||||
|
||||
private void loadProperties() {
|
||||
action1Period = Long.parseLong(this.properties
|
||||
.getProperty(MonkeyConstants.PERIODIC_ACTION1_PERIOD,
|
||||
MonkeyConstants.DEFAULT_PERIODIC_ACTION1_PERIOD + ""));
|
||||
action2Period = Long.parseLong(this.properties
|
||||
.getProperty(MonkeyConstants.PERIODIC_ACTION2_PERIOD,
|
||||
MonkeyConstants.DEFAULT_PERIODIC_ACTION2_PERIOD + ""));
|
||||
cpuLoadDuration = Long.parseLong(this.properties.getProperty(
|
||||
MonkeyConstants.CPU_LOAD_DURATION,
|
||||
MonkeyConstants.DEFAULT_CPU_LOAD_DURATION + ""));
|
||||
cpuLoadProcesses = Long.parseLong(this.properties.getProperty(
|
||||
MonkeyConstants.CPU_LOAD_PROCESSES,
|
||||
MonkeyConstants.DEFAULT_CPU_LOAD_PROCESSES + ""));
|
||||
networkIssueTimeout = Long.parseLong(this.properties
|
||||
.getProperty(MonkeyConstants.NETWORK_ISSUE_COMMAND_TIMEOUT,
|
||||
MonkeyConstants.DEFAULT_NETWORK_ISSUE_COMMAND_TIMEOUT + ""));
|
||||
networkIssueDuration = Long.parseLong(this.properties
|
||||
.getProperty(MonkeyConstants.NETWORK_ISSUE_DURATION,
|
||||
MonkeyConstants.DEFAULT_NETWORK_ISSUE_DURATION + ""));
|
||||
networkIssueRation = Float.parseFloat(this.properties
|
||||
.getProperty(MonkeyConstants.NETWORK_ISSUE_RATIO,
|
||||
MonkeyConstants.DEFAULT_NETWORK_ISSUE_RATIO + ""));
|
||||
networkIssueDelay = Long.parseLong(this.properties
|
||||
.getProperty(MonkeyConstants.NETWORK_ISSUE_DELAY,
|
||||
MonkeyConstants.DEFAULT_NETWORK_ISSUE_DELAY + ""));
|
||||
networkIssueInterface = this.properties
|
||||
.getProperty(MonkeyConstants.NETWORK_ISSUE_INTERFACE,
|
||||
MonkeyConstants.DEFAULT_NETWORK_ISSUE_INTERFACE + "");
|
||||
fillDiskTimeout = Long.parseLong(this.properties
|
||||
.getProperty(MonkeyConstants.FILL_DISK_COMMAND_TIMEOUT,
|
||||
MonkeyConstants.DEFAULT_FILL_DISK_COMMAND_TIMEOUT + ""));
|
||||
fillDiskPath = this.properties
|
||||
.getProperty(MonkeyConstants.FILL_DISK_PATH,
|
||||
MonkeyConstants.DEFAULT_FILL_DISK_PATH + "");
|
||||
fillDiskFileSize = Long.parseLong(this.properties
|
||||
.getProperty(MonkeyConstants.FILL_DISK_FILE_SIZE,
|
||||
MonkeyConstants.DEFAULT_FILL_DISK_FILE_SIZE + ""));
|
||||
fillDiskIssueduration = Long.parseLong(this.properties
|
||||
.getProperty(MonkeyConstants.FILL_DISK_ISSUE_DURATION,
|
||||
MonkeyConstants.DEFAULT_FILL_DISK_ISSUE_DURATION + ""));
|
||||
}
|
||||
}
|
|
@ -48,13 +48,26 @@ public interface MonkeyConstants {
|
|||
String GRACEFUL_RESTART_RS_SLEEP_TIME = "graceful.restart.rs.sleep.time";
|
||||
String ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME = "rolling.batch.suspend.rs.sleep.time";
|
||||
String ROLLING_BATCH_SUSPEND_RS_RATIO = "rolling.batch.suspend.rs.ratio";
|
||||
String CPU_LOAD_DURATION = "cpu.load.duration";
|
||||
String CPU_LOAD_PROCESSES = "cpu.load.processes";
|
||||
String NETWORK_ISSUE_COMMAND_TIMEOUT = "network.issue.command.timeout";
|
||||
String NETWORK_ISSUE_DURATION = "network.issueduration";
|
||||
String NETWORK_ISSUE_RATIO = "network.issue.ratio";
|
||||
String NETWORK_ISSUE_DELAY = "network.issue.delay";
|
||||
String NETWORK_ISSUE_INTERFACE = "network.issue.interface";
|
||||
//should be big enough to create the file
|
||||
String FILL_DISK_COMMAND_TIMEOUT = "fill.disk.command.timeout";
|
||||
String FILL_DISK_PATH = "fill.disk.path";
|
||||
String FILL_DISK_FILE_SIZE = "fill.disk.file.size";
|
||||
String FILL_DISK_ISSUE_DURATION = "fill.disk.issue.duration";
|
||||
String DATA_ISSUE_CHANCE = "data.issue.chance";
|
||||
|
||||
/**
|
||||
* A Set of prefixes which encompasses all of the configuration properties for the ChaosMonky.
|
||||
*/
|
||||
Set<String> MONKEY_CONFIGURATION_KEY_PREFIXES = new HashSet<>(
|
||||
Arrays.asList("sdm.", "move.", "restart.", "batch.", "rolling.", "compact.",
|
||||
"unbalance.", "decrease."));
|
||||
Arrays.asList("sdm.", "move.", "restart.", "batch.", "rolling.", "compact.", "unbalance.",
|
||||
"decrease.", "decrease.", "graceful.", "cpu.", "network.", "fill.", "data."));
|
||||
|
||||
long DEFAULT_PERIODIC_ACTION1_PERIOD = 60 * 1000;
|
||||
long DEFAULT_PERIODIC_ACTION2_PERIOD = 90 * 1000;
|
||||
|
@ -81,4 +94,16 @@ public interface MonkeyConstants {
|
|||
long DEFAULT_GRACEFUL_RESTART_RS_SLEEP_TIME = 5000;
|
||||
long DEFAULT_ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME = 30 * 1000;
|
||||
float DEFAULT_ROLLING_BATCH_SUSPEND_RS_RATIO = 1.0f;
|
||||
long DEFAULT_CPU_LOAD_DURATION = 5 * 60 * 1000;
|
||||
long DEFAULT_CPU_LOAD_PROCESSES = 2;
|
||||
long DEFAULT_NETWORK_ISSUE_COMMAND_TIMEOUT = 30 * 1000;
|
||||
long DEFAULT_NETWORK_ISSUE_DURATION = 60 * 1000;
|
||||
float DEFAULT_NETWORK_ISSUE_RATIO = 0.1f;
|
||||
long DEFAULT_NETWORK_ISSUE_DELAY = 100;
|
||||
String DEFAULT_NETWORK_ISSUE_INTERFACE = "eth0";
|
||||
long DEFAULT_FILL_DISK_COMMAND_TIMEOUT = 5 * 60 * 1000 + 30 * 1000;//duration + timeout
|
||||
String DEFAULT_FILL_DISK_PATH = "/tmp";
|
||||
long DEFAULT_FILL_DISK_FILE_SIZE = 0;
|
||||
long DEFAULT_FILL_DISK_ISSUE_DURATION = 5 * 60 * 1000;
|
||||
float DEFAULT_DATA_ISSUE_CHANCE = 0.01f;
|
||||
}
|
||||
|
|
|
@ -77,6 +77,8 @@ public abstract class MonkeyFactory {
|
|||
public static final String MOB_NO_KILL = "mobNoKill";
|
||||
public static final String MOB_SLOW_DETERMINISTIC = "mobSlowDeterministic";
|
||||
public static final String SERVER_AND_DEPENDENCIES_KILLING = "serverAndDependenciesKilling";
|
||||
public static final String DISTRIBUTED_ISSUES = "distributedIssues";
|
||||
public static final String DATA_ISSUES = "dataIssues";
|
||||
|
||||
public static Map<String, MonkeyFactory> FACTORIES = ImmutableMap.<String,MonkeyFactory>builder()
|
||||
.put(CALM, new CalmMonkeyFactory())
|
||||
|
@ -89,6 +91,8 @@ public abstract class MonkeyFactory {
|
|||
.put(MOB_NO_KILL, new MobNoKillMonkeyFactory())
|
||||
.put(MOB_SLOW_DETERMINISTIC, new MobNoKillMonkeyFactory())
|
||||
.put(SERVER_AND_DEPENDENCIES_KILLING, new ServerAndDependenciesKillingMonkeyFactory())
|
||||
.put(DISTRIBUTED_ISSUES, new DistributedIssuesMonkeyFactory())
|
||||
.put(DATA_ISSUES, new DataIssuesMonkeyFactory())
|
||||
.build();
|
||||
|
||||
public static MonkeyFactory getFactory(String factoryName) {
|
||||
|
|
|
@ -78,10 +78,10 @@ public class ServerAndDependenciesKillingMonkeyFactory extends MonkeyFactory {
|
|||
MonkeyConstants.GRACEFUL_RESTART_RS_SLEEP_TIME,
|
||||
MonkeyConstants.DEFAULT_GRACEFUL_RESTART_RS_SLEEP_TIME + ""));
|
||||
rollingBatchSuspendRSSleepTime = Long.parseLong(this.properties.getProperty(
|
||||
MonkeyConstants.ROLLING_BATCH_RESTART_RS_SLEEP_TIME,
|
||||
MonkeyConstants.DEFAULT_ROLLING_BATCH_RESTART_RS_SLEEP_TIME + ""));
|
||||
MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME,
|
||||
MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME+ ""));
|
||||
rollingBatchSuspendtRSRatio = Float.parseFloat(this.properties.getProperty(
|
||||
MonkeyConstants.ROLLING_BATCH_RESTART_RS_RATIO,
|
||||
MonkeyConstants.DEFAULT_ROLLING_BATCH_RESTART_RS_RATIO + ""));
|
||||
MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_RATIO,
|
||||
MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_RATIO + ""));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -74,10 +74,10 @@ public class ServerKillingMonkeyFactory extends MonkeyFactory {
|
|||
MonkeyConstants.GRACEFUL_RESTART_RS_SLEEP_TIME,
|
||||
MonkeyConstants.DEFAULT_GRACEFUL_RESTART_RS_SLEEP_TIME + ""));
|
||||
rollingBatchSuspendRSSleepTime = Long.parseLong(this.properties.getProperty(
|
||||
MonkeyConstants.ROLLING_BATCH_RESTART_RS_SLEEP_TIME,
|
||||
MonkeyConstants.DEFAULT_ROLLING_BATCH_RESTART_RS_SLEEP_TIME + ""));
|
||||
MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME,
|
||||
MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME+ ""));
|
||||
rollingBatchSuspendtRSRatio = Float.parseFloat(this.properties.getProperty(
|
||||
MonkeyConstants.ROLLING_BATCH_RESTART_RS_RATIO,
|
||||
MonkeyConstants.DEFAULT_ROLLING_BATCH_RESTART_RS_RATIO + ""));
|
||||
MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_RATIO,
|
||||
MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_RATIO + ""));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -191,10 +191,10 @@ public class SlowDeterministicMonkeyFactory extends MonkeyFactory {
|
|||
MonkeyConstants.GRACEFUL_RESTART_RS_SLEEP_TIME,
|
||||
MonkeyConstants.DEFAULT_GRACEFUL_RESTART_RS_SLEEP_TIME + ""));
|
||||
rollingBatchSuspendRSSleepTime = Long.parseLong(this.properties.getProperty(
|
||||
MonkeyConstants.ROLLING_BATCH_RESTART_RS_SLEEP_TIME,
|
||||
MonkeyConstants.DEFAULT_ROLLING_BATCH_RESTART_RS_SLEEP_TIME + ""));
|
||||
MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME,
|
||||
MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME+ ""));
|
||||
rollingBatchSuspendtRSRatio = Float.parseFloat(this.properties.getProperty(
|
||||
MonkeyConstants.ROLLING_BATCH_RESTART_RS_RATIO,
|
||||
MonkeyConstants.DEFAULT_ROLLING_BATCH_RESTART_RS_RATIO + ""));
|
||||
MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_RATIO,
|
||||
MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_RATIO + ""));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -6,9 +6,9 @@
|
|||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* <p>
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* <p>
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -105,10 +105,10 @@ public class StressAssignmentManagerMonkeyFactory extends MonkeyFactory {
|
|||
MonkeyConstants.GRACEFUL_RESTART_RS_SLEEP_TIME,
|
||||
MonkeyConstants.DEFAULT_GRACEFUL_RESTART_RS_SLEEP_TIME + ""));
|
||||
rollingBatchSuspendRSSleepTime = Long.parseLong(this.properties.getProperty(
|
||||
MonkeyConstants.ROLLING_BATCH_RESTART_RS_SLEEP_TIME,
|
||||
MonkeyConstants.DEFAULT_ROLLING_BATCH_RESTART_RS_SLEEP_TIME + ""));
|
||||
MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME,
|
||||
MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME+ ""));
|
||||
rollingBatchSuspendtRSRatio = Float.parseFloat(this.properties.getProperty(
|
||||
MonkeyConstants.ROLLING_BATCH_RESTART_RS_RATIO,
|
||||
MonkeyConstants.DEFAULT_ROLLING_BATCH_RESTART_RS_RATIO + ""));
|
||||
MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_RATIO,
|
||||
MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_RATIO + ""));
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue