From d2142a8ebb00eafb69e00147afa51fff4331014c Mon Sep 17 00:00:00 2001 From: BukrosSzabolcs Date: Wed, 6 Nov 2019 07:43:01 -0600 Subject: [PATCH] HBASE-23085 Network and Data related Actions Add monkey actions: - manipulate network packages with tc (reorder, loose,...) - add CPU load - fill the disk - corrupt or delete regionserver data files Extend HBaseClusterManager to allow sudo calls. Signed-off-by: Josh Elser Signed-off-by: Balazs Meszaros --- .../hadoop/hbase/HBaseClusterManager.java | 90 ++++++++++++- .../hbase/chaos/actions/AddCPULoadAction.java | 69 ++++++++++ .../chaos/actions/CorruptDataFilesAction.java | 75 +++++++++++ .../actions/CorruptPackagesCommandAction.java | 72 +++++++++++ .../actions/DelayPackagesCommandAction.java | 71 ++++++++++ .../chaos/actions/DeleteDataFilesAction.java | 66 ++++++++++ .../DuplicatePackagesCommandAction.java | 72 +++++++++++ .../chaos/actions/FillDiskCommandAction.java | 83 ++++++++++++ .../actions/LosePackagesCommandAction.java | 72 +++++++++++ .../actions/ReorderPackagesCommandAction.java | 76 +++++++++++ .../chaos/actions/SudoCommandAction.java | 70 ++++++++++ .../hbase/chaos/actions/TCCommandAction.java | 33 +++++ .../factories/DataIssuesMonkeyFactory.java | 72 +++++++++++ .../DistributedIssuesMonkeyFactory.java | 121 ++++++++++++++++++ .../chaos/factories/MonkeyConstants.java | 29 ++++- .../hbase/chaos/factories/MonkeyFactory.java | 4 + ...erAndDependenciesKillingMonkeyFactory.java | 8 +- .../factories/ServerKillingMonkeyFactory.java | 8 +- .../SlowDeterministicMonkeyFactory.java | 8 +- .../StressAssignmentManagerMonkeyFactory.java | 12 +- 20 files changed, 1085 insertions(+), 26 deletions(-) create mode 100644 hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/AddCPULoadAction.java create mode 100644 hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/CorruptDataFilesAction.java create mode 100644 hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/CorruptPackagesCommandAction.java create 
mode 100644 hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/DelayPackagesCommandAction.java create mode 100644 hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/DeleteDataFilesAction.java create mode 100644 hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/DuplicatePackagesCommandAction.java create mode 100644 hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/FillDiskCommandAction.java create mode 100644 hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/LosePackagesCommandAction.java create mode 100644 hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/ReorderPackagesCommandAction.java create mode 100644 hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/SudoCommandAction.java create mode 100644 hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/TCCommandAction.java create mode 100644 hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/DataIssuesMonkeyFactory.java create mode 100644 hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/DistributedIssuesMonkeyFactory.java diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java index f7c2fc65280..2f75c731bff 100644 --- a/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java @@ -62,6 +62,15 @@ public class HBaseClusterManager extends Configured implements ClusterManager { "timeout 30 /usr/bin/ssh %1$s %2$s%3$s%4$s \"sudo -u %6$s %5$s\""; private String tunnelCmd; + /** + * The command format that is used to execute the remote command with sudo. Arguments: + * 1 SSH options, 2 user name , 3 "@" if username is set, 4 host, + * 5 original command, 6 timeout. 
+ */ + private static final String DEFAULT_TUNNEL_SUDO_CMD = + "timeout %6$s /usr/bin/ssh %1$s %2$s%3$s%4$s \"sudo %5$s\""; + private String tunnelSudoCmd; + private static final String RETRY_ATTEMPTS_KEY = "hbase.it.clustermanager.retry.attempts"; private static final int DEFAULT_RETRY_ATTEMPTS = 5; @@ -86,6 +95,7 @@ public class HBaseClusterManager extends Configured implements ClusterManager { sshOptions = (sshOptions == null) ? "" : sshOptions; sshUserName = (sshUserName == null) ? "" : sshUserName; tunnelCmd = conf.get("hbase.it.clustermanager.ssh.cmd", DEFAULT_TUNNEL_CMD); + tunnelSudoCmd = conf.get("hbase.it.clustermanager.ssh.sudo.cmd", DEFAULT_TUNNEL_SUDO_CMD); // Print out ssh special config if any. if ((sshUserName != null && sshUserName.length() > 0) || (sshOptions != null && sshOptions.length() > 0)) { @@ -152,10 +162,32 @@ public class HBaseClusterManager extends Configured implements ClusterManager { LOG.info("Executing full command [" + cmd + "]"); return new String[] { "/usr/bin/env", "bash", "-c", cmd }; } + } + + /** + * Executes commands over SSH + */ + protected class RemoteSudoShell extends Shell.ShellCommandExecutor { + private String hostname; + + public RemoteSudoShell(String hostname, String[] execString, long timeout) { + this(hostname, execString, null, null, timeout); + } + + public RemoteSudoShell(String hostname, String[] execString, File dir, Map env, + long timeout) { + super(execString, dir, env, timeout); + this.hostname = hostname; + } @Override - public void execute() throws IOException { - super.execute(); + public String[] getExecString() { + String at = sshUserName.isEmpty() ? 
"" : "@"; + String remoteCmd = StringUtils.join(super.getExecString(), " "); + String cmd = String.format(tunnelSudoCmd, sshOptions, sshUserName, at, hostname, remoteCmd, + timeOutInterval/1000f); + LOG.info("Executing full command [" + cmd + "]"); + return new String[] { "/usr/bin/env", "bash", "-c", cmd }; } } @@ -299,7 +331,8 @@ public class HBaseClusterManager extends Configured implements ClusterManager { */ private Pair exec(String hostname, ServiceType service, String... cmd) throws IOException { - LOG.info("Executing remote command: " + StringUtils.join(cmd, " ") + " , hostname:" + hostname); + LOG.info("Executing remote command: {} , hostname:{}", StringUtils.join(cmd, " "), + hostname); RemoteShell shell = new RemoteShell(hostname, getServiceUser(service), cmd); try { @@ -312,8 +345,8 @@ public class HBaseClusterManager extends Configured implements ClusterManager { + ", stdout: " + output); } - LOG.info("Executed remote command, exit code:" + shell.getExitCode() - + " , output:" + shell.getOutput()); + LOG.info("Executed remote command, exit code:{} , output:{}", shell.getExitCode(), + shell.getOutput()); return new Pair<>(shell.getExitCode(), shell.getOutput()); } @@ -331,7 +364,52 @@ public class HBaseClusterManager extends Configured implements ClusterManager { retryCounter.sleepUntilNextRetry(); } catch (InterruptedException ex) { // ignore - LOG.warn("Sleep Interrupted:" + ex); + LOG.warn("Sleep Interrupted:", ex); + } + } + } + + /** + * Execute the given command on the host using SSH + * @return pair of exit code and command output + * @throws IOException if something goes wrong. + */ + public Pair execSudo(String hostname, long timeout, String... 
cmd) + throws IOException { + LOG.info("Executing remote command: {} , hostname:{}", StringUtils.join(cmd, " "), + hostname); + + RemoteSudoShell shell = new RemoteSudoShell(hostname, cmd, timeout); + try { + shell.execute(); + } catch (Shell.ExitCodeException ex) { + // capture the stdout of the process as well. + String output = shell.getOutput(); + // add output for the ExitCodeException. + throw new Shell.ExitCodeException(ex.getExitCode(), "stderr: " + ex.getMessage() + + ", stdout: " + output); + } + + LOG.info("Executed remote command, exit code:{} , output:{}", shell.getExitCode(), + shell.getOutput()); + + return new Pair<>(shell.getExitCode(), shell.getOutput()); + } + + public Pair execSudoWithRetries(String hostname, long timeout, String... cmd) + throws IOException { + RetryCounter retryCounter = retryCounterFactory.create(); + while (true) { + try { + return execSudo(hostname, timeout, cmd); + } catch (IOException e) { + retryOrThrow(retryCounter, e, hostname, cmd); + } + try { + retryCounter.sleepUntilNextRetry(); + } catch (InterruptedException ex) { + // ignore + LOG.warn("Sleep Interrupted:", ex); } } } diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/AddCPULoadAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/AddCPULoadAction.java new file mode 100644 index 00000000000..9d6437e431b --- /dev/null +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/AddCPULoadAction.java @@ -0,0 +1,69 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hbase.chaos.actions; + +import java.io.IOException; + +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Action that adds high cpu load to a random regionserver for a given duration + */ +public class AddCPULoadAction extends SudoCommandAction { + protected static final Logger LOG = LoggerFactory.getLogger(AddCPULoadAction.class); + private static final String CPU_LOAD_COMMAND = + "seq 1 %s | xargs -I{} -n 1 -P %s timeout %s dd if=/dev/urandom of=/dev/null bs=1M " + + "iflag=fullblock"; + + private final long duration; + private long processes; + + /** + * Add high load to cpu + * + * @param duration Duration that this thread should generate the load for in milliseconds + * @param processes The number of parallel processes, should be equal to cpu threads for max load + */ + public AddCPULoadAction(long duration, long processes, long timeout) { + super(timeout); + this.duration = duration; + this.processes = processes; + } + + protected void localPerform() throws IOException { + LOG.info("Starting to execute AddCPULoadAction"); + ServerName server = PolicyBasedChaosMonkey.selectRandomItem(getCurrentServers()); + String hostname = server.getHostname(); + + try { + clusterManager.execSudo(hostname, timeout, getCommand()); + } catch (IOException ex){ + //This will always happen. 
We use timeout to kill a continuously running process + //after the duration expires + } + LOG.info("Finished to execute AddCPULoadAction"); + } + + private String getCommand(){ + return String.format(CPU_LOAD_COMMAND, processes, processes, duration/1000f); + } +} diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/CorruptDataFilesAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/CorruptDataFilesAction.java new file mode 100644 index 00000000000..83e8fe08a49 --- /dev/null +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/CorruptDataFilesAction.java @@ -0,0 +1,75 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hbase.chaos.actions; + +import org.apache.commons.lang3.RandomUtils; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RemoteIterator; +import org.apache.hadoop.hbase.io.hfile.HFile; +import org.apache.hadoop.hbase.util.CommonFSUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Action corrupts HFiles with a certain chance.
+ */ +public class CorruptDataFilesAction extends Action { + private static final Logger LOG = LoggerFactory.getLogger(CorruptDataFilesAction.class); + private float chance; + + /** + * Corrupts HFiles with a certain chance + * @param chance chance to corrupt any given data file (0.5 => 50%) + */ + public CorruptDataFilesAction(float chance) { + this.chance = chance * 100; + } + + @Override + public void perform() throws Exception { + LOG.info("Start corrupting data files"); + + FileSystem fs = CommonFSUtils.getRootDirFileSystem(getConf()); + Path rootDir = CommonFSUtils.getRootDir(getConf()); + Path defaultDir = rootDir.suffix("/data/default"); + RemoteIterator<LocatedFileStatus> iterator = fs.listFiles(defaultDir, true); + while (iterator.hasNext()){ + LocatedFileStatus status = iterator.next(); + if(!HFile.isHFileFormat(fs, status.getPath())){ + continue; + } + if(RandomUtils.nextFloat(0, 100) > chance){ + continue; + } + + FSDataOutputStream out = fs.create(status.getPath(), true); + try { + out.write(0); + } finally { + out.close(); + } + LOG.info("Corrupting {}", status.getPath()); + } + LOG.info("Done corrupting data files"); + } + +} diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/CorruptPackagesCommandAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/CorruptPackagesCommandAction.java new file mode 100644 index 00000000000..a89d5587a79 --- /dev/null +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/CorruptPackagesCommandAction.java @@ -0,0 +1,72 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hbase.chaos.actions; + +import java.io.IOException; + +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * + * Corrupt network packages on a random regionserver. + */ +public class CorruptPackagesCommandAction extends TCCommandAction { + private static final Logger LOG = LoggerFactory.getLogger(CorruptPackagesCommandAction.class); + private float ratio; + private long duration; + + /** + * Corrupt network packages on a random regionserver. 
+ * + * @param ratio the ratio of packages corrupted + * @param duration the time this issue persists in milliseconds + * @param timeout the timeout for executing required commands on the region server in milliseconds + * @param network network interface the regionserver uses for communication + */ + public CorruptPackagesCommandAction(float ratio, long duration, long timeout, String network) { + super(timeout, network); + this.ratio = ratio; + this.duration = duration; + } + + protected void localPerform() throws IOException { + LOG.info("Starting to execute CorruptPackagesCommandAction"); + ServerName server = PolicyBasedChaosMonkey.selectRandomItem(getCurrentServers()); + String hostname = server.getHostname(); + + try { + clusterManager.execSudoWithRetries(hostname, timeout, getCommand(ADD)); + Thread.sleep(duration); + } catch (InterruptedException e) { + LOG.debug("Failed to run the command for the full duration", e); + } finally { + clusterManager.execSudoWithRetries(hostname, timeout, getCommand(DELETE)); + } + + LOG.info("Finished to execute CorruptPackagesCommandAction"); + } + + private String getCommand(String operation){ + return String.format("tc qdisc %s dev %s root netem corrupt %s%%", operation, network, + ratio * 100); + } +} diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/DelayPackagesCommandAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/DelayPackagesCommandAction.java new file mode 100644 index 00000000000..e4de0a270a8 --- /dev/null +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/DelayPackagesCommandAction.java @@ -0,0 +1,71 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hbase.chaos.actions; + +import java.io.IOException; + +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Action adds latency to communication on a random regionserver. + */ +public class DelayPackagesCommandAction extends TCCommandAction { + private static final Logger LOG = LoggerFactory.getLogger(DelayPackagesCommandAction.class); + private long delay; + private long duration; + + /** + * Adds latency to communication on a random region server + * + * @param delay the latency will be delay +/-50% in milliseconds + * @param duration the time this issue persists in milliseconds + * @param timeout the timeout for executing required commands on the region server in milliseconds + * @param network network interface the regionserver uses for communication + */ + public DelayPackagesCommandAction(long delay, long duration, long timeout, String network) { + super(timeout, network); + this.delay = delay; + this.duration = duration; + } + + protected void localPerform() throws IOException { + LOG.info("Starting to execute DelayPackagesCommandAction"); + ServerName server = PolicyBasedChaosMonkey.selectRandomItem(getCurrentServers()); + String hostname = server.getHostname(); + + try { + clusterManager.execSudoWithRetries(hostname,
timeout, getCommand(ADD)); + Thread.sleep(duration); + } catch (InterruptedException e) { + LOG.debug("Failed to run the command for the full duration", e); + } finally { + clusterManager.execSudoWithRetries(hostname, timeout, getCommand(DELETE)); + } + + LOG.info("Finished to execute DelayPackagesCommandAction"); + } + + private String getCommand(String operation){ + return String.format("tc qdisc %s dev %s root netem delay %sms %sms", + operation, network, delay, delay/2); + } +} diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/DeleteDataFilesAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/DeleteDataFilesAction.java new file mode 100644 index 00000000000..4919adce490 --- /dev/null +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/DeleteDataFilesAction.java @@ -0,0 +1,66 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hbase.chaos.actions; + +import org.apache.commons.lang3.RandomUtils; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RemoteIterator; +import org.apache.hadoop.hbase.io.hfile.HFile; +import org.apache.hadoop.hbase.util.CommonFSUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Action deletes HFiles with a certain chance. + */ +public class DeleteDataFilesAction extends Action { + private static final Logger LOG = LoggerFactory.getLogger(DeleteDataFilesAction.class); + private float chance; + + /** + * Deletes HFiles with a certain chance + * @param chance chance to delete any given data file (0.5 => 50%) + */ + public DeleteDataFilesAction(float chance) { + this.chance = chance * 100; + } + + @Override + public void perform() throws Exception { + LOG.info("Start deleting data files"); + FileSystem fs = CommonFSUtils.getRootDirFileSystem(getConf()); + Path rootDir = CommonFSUtils.getRootDir(getConf()); + Path defaultDir = rootDir.suffix("/data/default"); + RemoteIterator<LocatedFileStatus> iterator = fs.listFiles(defaultDir, true); + while (iterator.hasNext()){ + LocatedFileStatus status = iterator.next(); + if(!HFile.isHFileFormat(fs, status.getPath())){ + continue; + } + if(RandomUtils.nextFloat(0, 100) > chance){ + continue; + } + fs.delete(status.getPath(), false); + LOG.info("Deleting {}", status.getPath()); + } + LOG.info("Done deleting data files"); + } +} diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/DuplicatePackagesCommandAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/DuplicatePackagesCommandAction.java new file mode 100644 index 00000000000..f3d54f18985 --- /dev/null +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/DuplicatePackagesCommandAction.java @@ -0,0 +1,72 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more
contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hbase.chaos.actions; + +import java.io.IOException; + +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * + * Duplicate network packages on a random regionserver. + */ +public class DuplicatePackagesCommandAction extends TCCommandAction { + private static final Logger LOG = LoggerFactory.getLogger(DuplicatePackagesCommandAction.class); + private float ratio; + private long duration; + + /** + * Duplicate network packages on a random regionserver. 
+ * + * @param ratio the ratio of packages duplicated + * @param duration the time this issue persists in milliseconds + * @param timeout the timeout for executing required commands on the region server in milliseconds + * @param network network interface the regionserver uses for communication + */ + public DuplicatePackagesCommandAction(float ratio, long duration, long timeout, String network) { + super(timeout, network); + this.ratio = ratio; + this.duration = duration; + } + + protected void localPerform() throws IOException { + LOG.info("Starting to execute DuplicatePackagesCommandAction"); + ServerName server = PolicyBasedChaosMonkey.selectRandomItem(getCurrentServers()); + String hostname = server.getHostname(); + + try { + clusterManager.execSudoWithRetries(hostname, timeout, getCommand(ADD)); + Thread.sleep(duration); + } catch (InterruptedException e) { + LOG.debug("Failed to run the command for the full duration", e); + } finally { + clusterManager.execSudoWithRetries(hostname, timeout, getCommand(DELETE)); + } + + LOG.info("Finished to execute DuplicatePackagesCommandAction"); + } + + private String getCommand(String operation){ + return String.format("tc qdisc %s dev %s root netem duplicate %s%%", operation, network, + ratio * 100); + } +} diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/FillDiskCommandAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/FillDiskCommandAction.java new file mode 100644 index 00000000000..b7af31fffa5 --- /dev/null +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/FillDiskCommandAction.java @@ -0,0 +1,83 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hbase.chaos.actions; + +import java.io.IOException; + +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * + * Fill the disk on a random regionserver. + */ +public class FillDiskCommandAction extends SudoCommandAction { + private static final Logger LOG = LoggerFactory.getLogger(FillDiskCommandAction.class); + private long size; + private long duration; + private String path; + + /** + * Fill the disk on a random regionserver. + * Please note that the file will be created regardless of the set duration or timeout. + * So please use timeout and duration big enough to avoid complication caused by retries. 
+ * + * @param size size of the generated file in MB or fill the disk if set to 0 + * @param duration the time this issue persists in milliseconds + * @param path the path to the generated file + * @param timeout the timeout for executing required commands on the region server in milliseconds + */ + public FillDiskCommandAction(long size, long duration, String path, long timeout) { + super(timeout); + this.size = size; + this.duration = duration; + this.path = path; + } + + protected void localPerform() throws IOException { + LOG.info("Starting to execute FillDiskCommandAction"); + ServerName server = PolicyBasedChaosMonkey.selectRandomItem(getCurrentServers()); + String hostname = server.getHostname(); + + try { + clusterManager.execSudoWithRetries(hostname, timeout, getFillCommand()); + Thread.sleep(duration); + } catch (InterruptedException e) { + LOG.debug("Failed to run the command for the full duration", e); + } finally { + clusterManager.execSudoWithRetries(hostname, timeout, getClearCommand()); + } + + LOG.info("Finished to execute FillDiskCommandAction"); + } + + private String getFillCommand(){ + if (size == 0){ + return String.format("dd if=/dev/urandom of=%s/garbage bs=1M iflag=fullblock", path); + } + return String.format("dd if=/dev/urandom of=%s/garbage bs=1M count=%s iflag=fullblock", + path, size); + } + + private String getClearCommand(){ + return String.format("rm -f %s/garbage", path); + } +} diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/LosePackagesCommandAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/LosePackagesCommandAction.java new file mode 100644 index 00000000000..e44cac7ade2 --- /dev/null +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/LosePackagesCommandAction.java @@ -0,0 +1,72 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hbase.chaos.actions; + +import java.io.IOException; + +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * + * Lose network packages on a random regionserver. + */ +public class LosePackagesCommandAction extends TCCommandAction { + private static final Logger LOG = LoggerFactory.getLogger(LosePackagesCommandAction.class); + private float ratio; + private long duration; + + /** + * Lose network packages on a random regionserver. 
+ * + * @param ratio the ratio of packages lost + * @param duration the time this issue persists in milliseconds + * @param timeout the timeout for executing required commands on the region server in milliseconds + * @param network network interface the regionserver uses for communication + */ + public LosePackagesCommandAction(float ratio, long duration, long timeout, String network) { + super(timeout, network); + this.ratio = ratio; + this.duration = duration; + } + + protected void localPerform() throws IOException { + LOG.info("Starting to execute LosePackagesCommandAction"); + ServerName server = PolicyBasedChaosMonkey.selectRandomItem(getCurrentServers()); + String hostname = server.getHostname(); + + try { + clusterManager.execSudoWithRetries(hostname, timeout, getCommand(ADD)); + Thread.sleep(duration); + } catch (InterruptedException e) { + LOG.debug("Failed to run the command for the full duration", e); + } finally { + clusterManager.execSudoWithRetries(hostname, timeout, getCommand(DELETE)); + } + + LOG.info("Finished to execute LosePackagesCommandAction"); + } + + private String getCommand(String operation){ + return String.format("tc qdisc %s dev %s root netem loss %s%%", operation, network, + ratio * 100); + } +} diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/ReorderPackagesCommandAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/ReorderPackagesCommandAction.java new file mode 100644 index 00000000000..c1f196e830e --- /dev/null +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/ReorderPackagesCommandAction.java @@ -0,0 +1,76 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hbase.chaos.actions;
+
+import java.io.IOException;
+
+import org.apache.hadoop.hbase.ServerName;
+import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Reorders network packages on a random regionserver via {@code tc qdisc ... netem reorder}.
+ */
+public class ReorderPackagesCommandAction extends TCCommandAction {
+  private static final Logger LOG = LoggerFactory.getLogger(ReorderPackagesCommandAction.class);
+  private float ratio;
+  private long duration;
+  private long delay;
+
+  /**
+   * Reorder network packages on a random regionserver.
+   *
+   * @param ratio the ratio of packages reordered; it is multiplied by 100 for the tc percentage
+   * @param duration the time this issue persists in milliseconds
+   * @param delay the delay between reordered and non-reordered packages in milliseconds
+   * @param timeout the timeout for executing required commands on the region server in milliseconds
+   * @param network network interface the regionserver uses for communication
+   */
+  public ReorderPackagesCommandAction(float ratio, long duration, long delay, long timeout,
+      String network) {
+    super(timeout, network);
+    this.ratio = ratio;
+    this.duration = duration;
+    this.delay = delay;
+  }
+
+  protected void localPerform() throws IOException {
+    LOG.info("Starting to execute ReorderPackagesCommandAction");
+    ServerName server = PolicyBasedChaosMonkey.selectRandomItem(getCurrentServers());
+    String hostname = server.getHostname();
+    // The finally block removes the netem rule again even when the sleep is interrupted.
+    try {
+      clusterManager.execSudoWithRetries(hostname, timeout, getCommand(ADD));
+      Thread.sleep(duration);
+    } catch (InterruptedException e) {
+      LOG.debug("Failed to run the command for the full duration", e);
+    } finally {
+      clusterManager.execSudoWithRetries(hostname, timeout, getCommand(DELETE));
+    }
+
+    LOG.info("Finished to execute ReorderPackagesCommandAction");
+  }
+
+  private String getCommand(String operation){
+    // Every literal '%' is written as "%%"; a bare trailing '%' makes String.format throw.
+    return String.format("tc qdisc %s dev %s root netem delay %sms reorder %s%% 50%%",
+        operation, network, delay, ratio * 100);
+  }
+}
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/SudoCommandAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/SudoCommandAction.java
new file mode 100644
index 00000000000..6092a5dbbc6
--- /dev/null
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/SudoCommandAction.java
@@ -0,0 +1,70 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.
The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hbase.chaos.actions;
+
+import java.io.IOException;
+
+import org.apache.hadoop.hbase.DistributedHBaseCluster;
+import org.apache.hadoop.hbase.HBaseCluster;
+import org.apache.hadoop.hbase.HBaseClusterManager;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Common parent of chaos actions that run linux commands requiring sudo privileges.
+ * Subclasses implement {@link #localPerform()}.
+ */
+abstract public class SudoCommandAction extends Action {
+  private static final Logger LOG = LoggerFactory.getLogger(SudoCommandAction.class);
+
+  protected long timeout;
+  protected HBaseClusterManager clusterManager;
+
+  public SudoCommandAction(long timeout) {
+    this.timeout = timeout;
+  }
+
+  @Override
+  public void init(ActionContext context) throws IOException {
+    super.init(context);
+    HBaseCluster hbaseCluster = context.getHBaseCluster();
+    // instanceof is false for null, so no separate null checks are needed.
+    if (hbaseCluster instanceof DistributedHBaseCluster) {
+      Object candidate = ((DistributedHBaseCluster) hbaseCluster).getClusterManager();
+      if (candidate instanceof HBaseClusterManager) {
+        clusterManager = (HBaseClusterManager) candidate;
+      }
+    }
+  }
+
+  @Override
+  public void perform() throws Exception {
+    if (clusterManager == null) {
+      LOG.info("Couldn't perform command action, it requires a distributed cluster.");
+      return;
+    }
+    // Nothing is run once the context reports that it is stopping.
+    if (!context.isStopping()) {
+      localPerform();
+    }
+  }
+
+  /** Runs the concrete command; perform() calls it only when a cluster manager is set. */
+  abstract protected void localPerform() throws IOException;
+}
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/TCCommandAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/TCCommandAction.java
new file mode 100644
index 00000000000..9444f876f72
--- /dev/null
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/TCCommandAction.java
@@ -0,0 +1,33 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hbase.chaos.actions;
+
+/**
+ * Parent of the tc command actions; keeps the operation names and the network interface.
+ */
+public abstract class TCCommandAction extends SudoCommandAction {
+  protected static final String DELETE = "del";
+  protected static final String ADD = "add";
+  protected String network;
+
+  public TCCommandAction(long timeout, String network) {
+    super(timeout);
+    this.network = network;
+  }
+}
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/DataIssuesMonkeyFactory.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/DataIssuesMonkeyFactory.java
new file mode 100644
index 00000000000..a06a9779e41
--- /dev/null
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/DataIssuesMonkeyFactory.java
@@ -0,0 +1,72 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hbase.chaos.factories;
+
+import org.apache.hadoop.hbase.chaos.actions.Action;
+import org.apache.hadoop.hbase.chaos.actions.CorruptDataFilesAction;
+import org.apache.hadoop.hbase.chaos.actions.DeleteDataFilesAction;
+import org.apache.hadoop.hbase.chaos.actions.DumpClusterStatusAction;
+import org.apache.hadoop.hbase.chaos.monkies.ChaosMonkey;
+import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
+import org.apache.hadoop.hbase.chaos.policies.PeriodicRandomActionPolicy;
+
+/**
+ * Builds a chaos monkey that deletes and corrupts regionserver data. It needs a user
+ * with passwordless ssh access to the cluster and sudo privileges.
+ * Highly destructive.
+ */
+public class DataIssuesMonkeyFactory extends MonkeyFactory {
+
+  private long action1Period;
+  private long action2Period;
+
+  private float chanceToAct;
+
+  @Override
+  public ChaosMonkey build() {
+    loadProperties();
+
+    // Highly destructive actions to mess things around.
+    Action[] destructive = {
+      new DeleteDataFilesAction(chanceToAct),
+      new CorruptDataFilesAction(chanceToAct)
+    };
+
+    // Action to log more info for debugging
+    Action[] diagnostic = {new DumpClusterStatusAction()};
+
+    PeriodicRandomActionPolicy destructivePolicy =
+      new PeriodicRandomActionPolicy(action1Period, destructive);
+    PeriodicRandomActionPolicy diagnosticPolicy =
+      new PeriodicRandomActionPolicy(action2Period, diagnostic);
+    return new PolicyBasedChaosMonkey(util, destructivePolicy, diagnosticPolicy);
+  }
+
+  private void loadProperties() {
+    String action1 = this.properties.getProperty(MonkeyConstants.PERIODIC_ACTION1_PERIOD,
+      String.valueOf(MonkeyConstants.DEFAULT_PERIODIC_ACTION1_PERIOD));
+    String action2 = this.properties.getProperty(MonkeyConstants.PERIODIC_ACTION2_PERIOD,
+      String.valueOf(MonkeyConstants.DEFAULT_PERIODIC_ACTION2_PERIOD));
+    String chance = this.properties.getProperty(MonkeyConstants.DATA_ISSUE_CHANCE,
+      String.valueOf(MonkeyConstants.DEFAULT_DATA_ISSUE_CHANCE));
+    action1Period = Long.parseLong(action1);
+    action2Period = Long.parseLong(action2);
+    chanceToAct = Float.parseFloat(chance);
+  }
+}
diff --git
a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/DistributedIssuesMonkeyFactory.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/DistributedIssuesMonkeyFactory.java
new file mode 100644
index 00000000000..745f1b9aae8
--- /dev/null
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/DistributedIssuesMonkeyFactory.java
@@ -0,0 +1,121 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hbase.chaos.factories;
+
+import org.apache.hadoop.hbase.chaos.actions.Action;
+import org.apache.hadoop.hbase.chaos.actions.AddCPULoadAction;
+import org.apache.hadoop.hbase.chaos.actions.CorruptPackagesCommandAction;
+import org.apache.hadoop.hbase.chaos.actions.DelayPackagesCommandAction;
+import org.apache.hadoop.hbase.chaos.actions.DumpClusterStatusAction;
+import org.apache.hadoop.hbase.chaos.actions.DuplicatePackagesCommandAction;
+import org.apache.hadoop.hbase.chaos.actions.FillDiskCommandAction;
+import org.apache.hadoop.hbase.chaos.actions.LosePackagesCommandAction;
+import org.apache.hadoop.hbase.chaos.actions.ReorderPackagesCommandAction;
+import org.apache.hadoop.hbase.chaos.monkies.ChaosMonkey;
+import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
+import org.apache.hadoop.hbase.chaos.policies.PeriodicRandomActionPolicy;
+
+/**
+ * Builds a chaos monkey that creates distributed cluster related issues. It needs a user
+ * with passwordless ssh access to the cluster and sudo privileges.
+ */
+public class DistributedIssuesMonkeyFactory extends MonkeyFactory {
+
+  private long action1Period;
+  private long action2Period;
+
+  private long cpuLoadDuration;
+  private long cpuLoadProcesses;
+  private long networkIssueTimeout;
+  private long networkIssueDuration;
+  private float networkIssueRation;
+  private long networkIssueDelay;
+  private String networkIssueInterface;
+  private long fillDiskTimeout;
+  private String fillDiskPath;
+  private long fillDiskFileSize;
+  private long fillDiskIssueduration;
+
+  @Override
+  public ChaosMonkey build() {
+    loadProperties();
+
+    Action[] issueActions = {
+      new AddCPULoadAction(cpuLoadDuration, cpuLoadProcesses, networkIssueTimeout),
+      new CorruptPackagesCommandAction(networkIssueRation, networkIssueDuration,
+        networkIssueTimeout, networkIssueInterface),
+      new DuplicatePackagesCommandAction(networkIssueRation, networkIssueDuration,
+        networkIssueTimeout, networkIssueInterface),
+      new LosePackagesCommandAction(networkIssueRation, networkIssueDuration,
+        networkIssueTimeout, networkIssueInterface),
+      new DelayPackagesCommandAction(networkIssueDelay, networkIssueDuration,
+        networkIssueTimeout, networkIssueInterface),
+      new ReorderPackagesCommandAction(networkIssueRation, networkIssueDuration,
+        networkIssueDelay, networkIssueTimeout, networkIssueInterface),
+      new FillDiskCommandAction(fillDiskFileSize, fillDiskIssueduration, fillDiskPath,
+        fillDiskTimeout)
+    };
+
+    // Action to log more info for debugging
+    Action[] loggingActions = {new DumpClusterStatusAction()};
+
+    return new PolicyBasedChaosMonkey(util,
+      new PeriodicRandomActionPolicy(action1Period, issueActions),
+      new PeriodicRandomActionPolicy(action2Period, loggingActions));
+  }
+
+  private void loadProperties() {
+    action1Period = readLong(MonkeyConstants.PERIODIC_ACTION1_PERIOD,
+      MonkeyConstants.DEFAULT_PERIODIC_ACTION1_PERIOD);
+    action2Period = readLong(MonkeyConstants.PERIODIC_ACTION2_PERIOD,
+      MonkeyConstants.DEFAULT_PERIODIC_ACTION2_PERIOD);
+    cpuLoadDuration = readLong(MonkeyConstants.CPU_LOAD_DURATION,
+      MonkeyConstants.DEFAULT_CPU_LOAD_DURATION);
+    cpuLoadProcesses = readLong(MonkeyConstants.CPU_LOAD_PROCESSES,
+      MonkeyConstants.DEFAULT_CPU_LOAD_PROCESSES);
+    networkIssueTimeout = readLong(MonkeyConstants.NETWORK_ISSUE_COMMAND_TIMEOUT,
+      MonkeyConstants.DEFAULT_NETWORK_ISSUE_COMMAND_TIMEOUT);
+    networkIssueDuration = readLong(MonkeyConstants.NETWORK_ISSUE_DURATION,
+      MonkeyConstants.DEFAULT_NETWORK_ISSUE_DURATION);
+    networkIssueRation = readFloat(MonkeyConstants.NETWORK_ISSUE_RATIO,
+      MonkeyConstants.DEFAULT_NETWORK_ISSUE_RATIO);
+    networkIssueDelay = readLong(MonkeyConstants.NETWORK_ISSUE_DELAY,
+      MonkeyConstants.DEFAULT_NETWORK_ISSUE_DELAY);
+    networkIssueInterface = this.properties.getProperty(
+      MonkeyConstants.NETWORK_ISSUE_INTERFACE, MonkeyConstants.DEFAULT_NETWORK_ISSUE_INTERFACE);
+    fillDiskTimeout = readLong(MonkeyConstants.FILL_DISK_COMMAND_TIMEOUT,
+      MonkeyConstants.DEFAULT_FILL_DISK_COMMAND_TIMEOUT);
+    fillDiskPath = this.properties.getProperty(
+      MonkeyConstants.FILL_DISK_PATH, MonkeyConstants.DEFAULT_FILL_DISK_PATH);
+    fillDiskFileSize = readLong(MonkeyConstants.FILL_DISK_FILE_SIZE,
+      MonkeyConstants.DEFAULT_FILL_DISK_FILE_SIZE);
+    fillDiskIssueduration = readLong(MonkeyConstants.FILL_DISK_ISSUE_DURATION,
+      MonkeyConstants.DEFAULT_FILL_DISK_ISSUE_DURATION);
+  }
+
+  /** Parses the property as a long; the default is handed to getProperty as a string. */
+  private long readLong(String key, long defaultValue) {
+    return Long.parseLong(this.properties.getProperty(key, defaultValue + ""));
+  }
+
+  /** Parses the property as a float; the default is handed to getProperty as a string. */
+  private float readFloat(String key, float defaultValue) {
+    return Float.parseFloat(this.properties.getProperty(key, defaultValue + ""));
+  }
+}
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/MonkeyConstants.java
b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/MonkeyConstants.java index 9051e98ff2c..f4c34b59959 100644 --- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/MonkeyConstants.java +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/MonkeyConstants.java @@ -48,13 +48,26 @@ public interface MonkeyConstants { String GRACEFUL_RESTART_RS_SLEEP_TIME = "graceful.restart.rs.sleep.time"; String ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME = "rolling.batch.suspend.rs.sleep.time"; String ROLLING_BATCH_SUSPEND_RS_RATIO = "rolling.batch.suspend.rs.ratio"; + String CPU_LOAD_DURATION = "cpu.load.duration"; + String CPU_LOAD_PROCESSES = "cpu.load.processes"; + String NETWORK_ISSUE_COMMAND_TIMEOUT = "network.issue.command.timeout"; + String NETWORK_ISSUE_DURATION = "network.issueduration"; + String NETWORK_ISSUE_RATIO = "network.issue.ratio"; + String NETWORK_ISSUE_DELAY = "network.issue.delay"; + String NETWORK_ISSUE_INTERFACE = "network.issue.interface"; + //should be big enough to create the file + String FILL_DISK_COMMAND_TIMEOUT = "fill.disk.command.timeout"; + String FILL_DISK_PATH = "fill.disk.path"; + String FILL_DISK_FILE_SIZE = "fill.disk.file.size"; + String FILL_DISK_ISSUE_DURATION = "fill.disk.issue.duration"; + String DATA_ISSUE_CHANCE = "data.issue.chance"; /** * A Set of prefixes which encompasses all of the configuration properties for the ChaosMonky. 
*/ Set MONKEY_CONFIGURATION_KEY_PREFIXES = new HashSet<>( - Arrays.asList("sdm.", "move.", "restart.", "batch.", "rolling.", "compact.", - "unbalance.", "decrease.")); + Arrays.asList("sdm.", "move.", "restart.", "batch.", "rolling.", "compact.", "unbalance.", + "decrease.", "decrease.", "graceful.", "cpu.", "network.", "fill.", "data.")); long DEFAULT_PERIODIC_ACTION1_PERIOD = 60 * 1000; long DEFAULT_PERIODIC_ACTION2_PERIOD = 90 * 1000; @@ -81,4 +94,16 @@ public interface MonkeyConstants { long DEFAULT_GRACEFUL_RESTART_RS_SLEEP_TIME = 5000; long DEFAULT_ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME = 30 * 1000; float DEFAULT_ROLLING_BATCH_SUSPEND_RS_RATIO = 1.0f; + long DEFAULT_CPU_LOAD_DURATION = 5 * 60 * 1000; + long DEFAULT_CPU_LOAD_PROCESSES = 2; + long DEFAULT_NETWORK_ISSUE_COMMAND_TIMEOUT = 30 * 1000; + long DEFAULT_NETWORK_ISSUE_DURATION = 60 * 1000; + float DEFAULT_NETWORK_ISSUE_RATIO = 0.1f; + long DEFAULT_NETWORK_ISSUE_DELAY = 100; + String DEFAULT_NETWORK_ISSUE_INTERFACE = "eth0"; + long DEFAULT_FILL_DISK_COMMAND_TIMEOUT = 5 * 60 * 1000 + 30 * 1000;//duration + timeout + String DEFAULT_FILL_DISK_PATH = "/tmp"; + long DEFAULT_FILL_DISK_FILE_SIZE = 0; + long DEFAULT_FILL_DISK_ISSUE_DURATION = 5 * 60 * 1000; + float DEFAULT_DATA_ISSUE_CHANCE = 0.01f; } diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/MonkeyFactory.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/MonkeyFactory.java index f4492b3b024..73f69682440 100644 --- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/MonkeyFactory.java +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/MonkeyFactory.java @@ -77,6 +77,8 @@ public abstract class MonkeyFactory { public static final String MOB_NO_KILL = "mobNoKill"; public static final String MOB_SLOW_DETERMINISTIC = "mobSlowDeterministic"; public static final String SERVER_AND_DEPENDENCIES_KILLING = "serverAndDependenciesKilling"; + public static final String DISTRIBUTED_ISSUES = 
"distributedIssues"; + public static final String DATA_ISSUES = "dataIssues"; public static Map FACTORIES = ImmutableMap.builder() .put(CALM, new CalmMonkeyFactory()) @@ -89,6 +91,8 @@ public abstract class MonkeyFactory { .put(MOB_NO_KILL, new MobNoKillMonkeyFactory()) .put(MOB_SLOW_DETERMINISTIC, new MobNoKillMonkeyFactory()) .put(SERVER_AND_DEPENDENCIES_KILLING, new ServerAndDependenciesKillingMonkeyFactory()) + .put(DISTRIBUTED_ISSUES, new DistributedIssuesMonkeyFactory()) + .put(DATA_ISSUES, new DataIssuesMonkeyFactory()) .build(); public static MonkeyFactory getFactory(String factoryName) { diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerAndDependenciesKillingMonkeyFactory.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerAndDependenciesKillingMonkeyFactory.java index 2e763adbfd7..5cb2d7f7b8a 100644 --- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerAndDependenciesKillingMonkeyFactory.java +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerAndDependenciesKillingMonkeyFactory.java @@ -78,10 +78,10 @@ public class ServerAndDependenciesKillingMonkeyFactory extends MonkeyFactory { MonkeyConstants.GRACEFUL_RESTART_RS_SLEEP_TIME, MonkeyConstants.DEFAULT_GRACEFUL_RESTART_RS_SLEEP_TIME + "")); rollingBatchSuspendRSSleepTime = Long.parseLong(this.properties.getProperty( - MonkeyConstants.ROLLING_BATCH_RESTART_RS_SLEEP_TIME, - MonkeyConstants.DEFAULT_ROLLING_BATCH_RESTART_RS_SLEEP_TIME + "")); + MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME, + MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME+ "")); rollingBatchSuspendtRSRatio = Float.parseFloat(this.properties.getProperty( - MonkeyConstants.ROLLING_BATCH_RESTART_RS_RATIO, - MonkeyConstants.DEFAULT_ROLLING_BATCH_RESTART_RS_RATIO + "")); + MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_RATIO, + MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_RATIO + "")); } } diff --git 
a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerKillingMonkeyFactory.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerKillingMonkeyFactory.java index 68d11f9a640..3f2edcc9f8f 100644 --- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerKillingMonkeyFactory.java +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerKillingMonkeyFactory.java @@ -74,10 +74,10 @@ public class ServerKillingMonkeyFactory extends MonkeyFactory { MonkeyConstants.GRACEFUL_RESTART_RS_SLEEP_TIME, MonkeyConstants.DEFAULT_GRACEFUL_RESTART_RS_SLEEP_TIME + "")); rollingBatchSuspendRSSleepTime = Long.parseLong(this.properties.getProperty( - MonkeyConstants.ROLLING_BATCH_RESTART_RS_SLEEP_TIME, - MonkeyConstants.DEFAULT_ROLLING_BATCH_RESTART_RS_SLEEP_TIME + "")); + MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME, + MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME+ "")); rollingBatchSuspendtRSRatio = Float.parseFloat(this.properties.getProperty( - MonkeyConstants.ROLLING_BATCH_RESTART_RS_RATIO, - MonkeyConstants.DEFAULT_ROLLING_BATCH_RESTART_RS_RATIO + "")); + MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_RATIO, + MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_RATIO + "")); } } diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/SlowDeterministicMonkeyFactory.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/SlowDeterministicMonkeyFactory.java index 22c35b96b95..deaf25640c8 100644 --- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/SlowDeterministicMonkeyFactory.java +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/SlowDeterministicMonkeyFactory.java @@ -191,10 +191,10 @@ public class SlowDeterministicMonkeyFactory extends MonkeyFactory { MonkeyConstants.GRACEFUL_RESTART_RS_SLEEP_TIME, MonkeyConstants.DEFAULT_GRACEFUL_RESTART_RS_SLEEP_TIME + "")); rollingBatchSuspendRSSleepTime = 
Long.parseLong(this.properties.getProperty( - MonkeyConstants.ROLLING_BATCH_RESTART_RS_SLEEP_TIME, - MonkeyConstants.DEFAULT_ROLLING_BATCH_RESTART_RS_SLEEP_TIME + "")); + MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME, + MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME+ "")); rollingBatchSuspendtRSRatio = Float.parseFloat(this.properties.getProperty( - MonkeyConstants.ROLLING_BATCH_RESTART_RS_RATIO, - MonkeyConstants.DEFAULT_ROLLING_BATCH_RESTART_RS_RATIO + "")); + MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_RATIO, + MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_RATIO + "")); } } diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/StressAssignmentManagerMonkeyFactory.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/StressAssignmentManagerMonkeyFactory.java index 4e304fbd2a6..b25bef7a334 100644 --- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/StressAssignmentManagerMonkeyFactory.java +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/StressAssignmentManagerMonkeyFactory.java @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - *

+ * * http://www.apache.org/licenses/LICENSE-2.0 - *

+ * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -105,10 +105,10 @@ public class StressAssignmentManagerMonkeyFactory extends MonkeyFactory { MonkeyConstants.GRACEFUL_RESTART_RS_SLEEP_TIME, MonkeyConstants.DEFAULT_GRACEFUL_RESTART_RS_SLEEP_TIME + "")); rollingBatchSuspendRSSleepTime = Long.parseLong(this.properties.getProperty( - MonkeyConstants.ROLLING_BATCH_RESTART_RS_SLEEP_TIME, - MonkeyConstants.DEFAULT_ROLLING_BATCH_RESTART_RS_SLEEP_TIME + "")); + MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME, + MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME+ "")); rollingBatchSuspendtRSRatio = Float.parseFloat(this.properties.getProperty( - MonkeyConstants.ROLLING_BATCH_RESTART_RS_RATIO, - MonkeyConstants.DEFAULT_ROLLING_BATCH_RESTART_RS_RATIO + "")); + MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_RATIO, + MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_RATIO + "")); } }