HBASE-14261 Enhance Chaos Monkey framework by adding zookeeper and datanode fault injections.

Srikanth Srungarapu 2015-09-03 11:49:40 -07:00
parent 90b8a3c894
commit 1717de65a4
13 changed files with 582 additions and 34 deletions

File: ClusterManager.java

@@ -39,6 +39,7 @@ interface ClusterManager extends Configurable {
   HADOOP_DATANODE("datanode"),
   HADOOP_JOBTRACKER("jobtracker"),
   HADOOP_TASKTRACKER("tasktracker"),
+  ZOOKEEPER_SERVER("QuorumPeerMain"),
   HBASE_MASTER("master"),
   HBASE_REGIONSERVER("regionserver");

File: DistributedHBaseCluster.java

@@ -114,16 +114,14 @@ public class DistributedHBaseCluster extends HBaseCluster {
   public void killRegionServer(ServerName serverName) throws IOException {
     LOG.info("Aborting RS: " + serverName.getServerName());
     clusterManager.kill(ServiceType.HBASE_REGIONSERVER,
-      serverName.getHostname(),
-      serverName.getPort());
+      serverName.getHostname(), serverName.getPort());
   }
 
   @Override
   public void stopRegionServer(ServerName serverName) throws IOException {
     LOG.info("Stopping RS: " + serverName.getServerName());
     clusterManager.stop(ServiceType.HBASE_REGIONSERVER,
-      serverName.getHostname(),
-      serverName.getPort());
+      serverName.getHostname(), serverName.getPort());
   }
 
   @Override
@@ -131,20 +129,96 @@ public class DistributedHBaseCluster extends HBaseCluster {
     waitForServiceToStop(ServiceType.HBASE_REGIONSERVER, serverName, timeout);
   }
 
+  @Override
+  public void startZkNode(String hostname, int port) throws IOException {
+    LOG.info("Starting Zookeeper node on: " + hostname);
+    clusterManager.start(ServiceType.ZOOKEEPER_SERVER, hostname, port);
+  }
+
+  @Override
+  public void killZkNode(ServerName serverName) throws IOException {
+    LOG.info("Aborting Zookeeper node on: " + serverName.getServerName());
+    clusterManager.kill(ServiceType.ZOOKEEPER_SERVER,
+      serverName.getHostname(), serverName.getPort());
+  }
+
+  @Override
+  public void stopZkNode(ServerName serverName) throws IOException {
+    LOG.info("Stopping Zookeeper node: " + serverName.getServerName());
+    clusterManager.stop(ServiceType.ZOOKEEPER_SERVER,
+      serverName.getHostname(), serverName.getPort());
+  }
+
+  @Override
+  public void waitForZkNodeToStart(ServerName serverName, long timeout) throws IOException {
+    waitForServiceToStart(ServiceType.ZOOKEEPER_SERVER, serverName, timeout);
+  }
+
+  @Override
+  public void waitForZkNodeToStop(ServerName serverName, long timeout) throws IOException {
+    waitForServiceToStop(ServiceType.ZOOKEEPER_SERVER, serverName, timeout);
+  }
+
+  @Override
+  public void startDataNode(ServerName serverName) throws IOException {
+    LOG.info("Starting data node on: " + serverName.getServerName());
+    clusterManager.start(ServiceType.HADOOP_DATANODE,
+      serverName.getHostname(), serverName.getPort());
+  }
+
+  @Override
+  public void killDataNode(ServerName serverName) throws IOException {
+    LOG.info("Aborting data node on: " + serverName.getServerName());
+    clusterManager.kill(ServiceType.HADOOP_DATANODE,
+      serverName.getHostname(), serverName.getPort());
+  }
+
+  @Override
+  public void stopDataNode(ServerName serverName) throws IOException {
+    LOG.info("Stopping data node on: " + serverName.getServerName());
+    clusterManager.stop(ServiceType.HADOOP_DATANODE,
+      serverName.getHostname(), serverName.getPort());
+  }
+
+  @Override
+  public void waitForDataNodeToStart(ServerName serverName, long timeout) throws IOException {
+    waitForServiceToStart(ServiceType.HADOOP_DATANODE, serverName, timeout);
+  }
+
+  @Override
+  public void waitForDataNodeToStop(ServerName serverName, long timeout) throws IOException {
+    waitForServiceToStop(ServiceType.HADOOP_DATANODE, serverName, timeout);
+  }
+
   private void waitForServiceToStop(ServiceType service, ServerName serverName, long timeout)
       throws IOException {
-    LOG.info("Waiting service:" + service + " to stop: " + serverName.getServerName());
+    LOG.info("Waiting for service: " + service + " to stop: " + serverName.getServerName());
     long start = System.currentTimeMillis();
 
     while ((System.currentTimeMillis() - start) < timeout) {
       if (!clusterManager.isRunning(service, serverName.getHostname(), serverName.getPort())) {
         return;
       }
-      Threads.sleep(1000);
+      Threads.sleep(100);
     }
     throw new IOException("did timeout waiting for service to stop:" + serverName);
   }
 
+  private void waitForServiceToStart(ServiceType service, ServerName serverName, long timeout)
+      throws IOException {
+    LOG.info("Waiting for service: " + service + " to start: " + serverName.getServerName());
+    long start = System.currentTimeMillis();
+
+    while ((System.currentTimeMillis() - start) < timeout) {
+      if (clusterManager.isRunning(service, serverName.getHostname(), serverName.getPort())) {
+        return;
+      }
+      Threads.sleep(100);
+    }
+    throw new IOException("did timeout waiting for service to start:" + serverName);
+  }
+
   @Override
   public MasterService.BlockingInterface getMasterAdminService()
       throws IOException {
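
How a chaos action drives these hooks, as a minimal sketch (the quorum host and timeout values are made-up examples):

    ServerName zk = ServerName.valueOf("zk-1.example.com", 2181, -1); // hypothetical quorum member
    cluster.killZkNode(zk);                               // hard kill through the ClusterManager
    cluster.waitForZkNodeToStop(zk, 60000L);              // polls isRunning() until gone or timeout
    cluster.startZkNode(zk.getHostname(), zk.getPort());
    cluster.waitForZkNodeToStart(zk, 60000L);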

File: HBaseClusterManager.java

@@ -54,9 +54,11 @@ public class HBaseClusterManager extends Configured implements ClusterManager {
   /**
    * The command format that is used to execute the remote command. Arguments:
-   * 1 SSH options, 2 user name , 3 "@" if username is set, 4 host, 5 original command.
+   * 1 SSH options, 2 user name , 3 "@" if username is set, 4 host,
+   * 5 original command, 6 service user.
    */
-  private static final String DEFAULT_TUNNEL_CMD = "/usr/bin/ssh %1$s %2$s%3$s%4$s \"%5$s\"";
+  private static final String DEFAULT_TUNNEL_CMD =
+      "/usr/bin/ssh %1$s %2$s%3$s%4$s \"sudo -u %6$s %5$s\"";
   private String tunnelCmd;
 
   private static final String RETRY_ATTEMPTS_KEY = "hbase.it.clustermanager.retry.attempts";
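
For reference, a sketch of how the new template expands; only the format string comes from this patch, every concrete value below is a made-up example:

    String cmd = String.format(DEFAULT_TUNNEL_CMD,
        "-o ConnectTimeout=10",    // %1$s: SSH options
        "chaosuser",               // %2$s: SSH user name
        "@",                       // %3$s: "@", present only when a user name is set
        "dn-3.example.com",        // %4$s: host
        "ps aux | grep datanode",  // %5$s: original command
        "hdfs");                   // %6$s: service user, from getServiceUser() below
    // -> /usr/bin/ssh -o ConnectTimeout=10 chaosuser@dn-3.example.com "sudo -u hdfs ps aux | grep datanode"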
@@ -93,11 +95,24 @@ public class HBaseClusterManager extends Configured implements ClusterManager {
         .setSleepInterval(conf.getLong(RETRY_SLEEP_INTERVAL_KEY, DEFAULT_RETRY_SLEEP_INTERVAL)));
   }
 
+  private String getServiceUser(ServiceType service) {
+    Configuration conf = getConf();
+    switch (service) {
+      case HADOOP_DATANODE:
+        return conf.get("hbase.it.clustermanager.hadoop.hdfs.user", "hdfs");
+      case ZOOKEEPER_SERVER:
+        return conf.get("hbase.it.clustermanager.zookeeper.user", "zookeeper");
+      default:
+        return conf.get("hbase.it.clustermanager.hbase.user", "hbase");
+    }
+  }
+
   /**
    * Executes commands over SSH
    */
   protected class RemoteShell extends Shell.ShellCommandExecutor {
 
     private String hostname;
+    private String user;
 
     public RemoteShell(String hostname, String[] execString, File dir, Map<String, String> env,
         long timeout) {
@@ -120,11 +135,17 @@ public class HBaseClusterManager extends Configured implements ClusterManager {
       this.hostname = hostname;
     }
 
+    public RemoteShell(String hostname, String user, String[] execString) {
+      super(execString);
+      this.hostname = hostname;
+      this.user = user;
+    }
+
     @Override
     public String[] getExecString() {
       String at = sshUserName.isEmpty() ? "" : "@";
       String remoteCmd = StringUtils.join(super.getExecString(), " ");
-      String cmd = String.format(tunnelCmd, sshOptions, sshUserName, at, hostname, remoteCmd);
+      String cmd = String.format(tunnelCmd, sshOptions, sshUserName, at, hostname, remoteCmd, user);
       LOG.info("Executing full command [" + cmd + "]");
       return new String[] { "/usr/bin/env", "bash", "-c", cmd };
     }
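
Both the tunnel template and getServiceUser() feed getExecString() above. The three service users are overridable; a minimal sketch of setting them explicitly (the keys and defaults come from this patch, the values are examples):

    Configuration conf = HBaseConfiguration.create();
    conf.set("hbase.it.clustermanager.hadoop.hdfs.user", "hdfs");   // default "hdfs"
    conf.set("hbase.it.clustermanager.zookeeper.user", "zkadmin");  // default "zookeeper"
    conf.set("hbase.it.clustermanager.hbase.user", "hbase");        // default "hbase"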
@@ -188,24 +209,95 @@ public class HBaseClusterManager extends Configured implements ClusterManager {
     }
   }
 
+  /**
+   * CommandProvider to manage the service using sbin/hadoop-* scripts.
+   */
+  static class HadoopShellCommandProvider extends CommandProvider {
+    private final String hadoopHome;
+    private final String confDir;
+
+    HadoopShellCommandProvider(Configuration conf) throws IOException {
+      hadoopHome = conf.get("hbase.it.clustermanager.hadoop.home",
+          System.getenv("HADOOP_HOME"));
+      String tmp = conf.get("hbase.it.clustermanager.hadoop.conf.dir",
+          System.getenv("HADOOP_CONF_DIR"));
+      if (hadoopHome == null) {
+        throw new IOException("Hadoop home configuration parameter i.e. " +
+            "'hbase.it.clustermanager.hadoop.home' is not configured properly.");
+      }
+      if (tmp != null) {
+        confDir = String.format("--config %s", tmp);
+      } else {
+        confDir = "";
+      }
+    }
+
+    @Override
+    public String getCommand(ServiceType service, Operation op) {
+      return String.format("%s/sbin/hadoop-daemon.sh %s %s %s", hadoopHome, confDir,
+          op.toString().toLowerCase(), service);
+    }
+  }
+
+  /**
+   * CommandProvider to manage the service using bin/zk* scripts.
+   */
+  static class ZookeeperShellCommandProvider extends CommandProvider {
+    private final String zookeeperHome;
+    private final String confDir;
+
+    ZookeeperShellCommandProvider(Configuration conf) throws IOException {
+      zookeeperHome = conf.get("hbase.it.clustermanager.zookeeper.home",
+          System.getenv("ZOOBINDIR"));
+      String tmp = conf.get("hbase.it.clustermanager.zookeeper.conf.dir",
+          System.getenv("ZOOCFGDIR"));
+      if (zookeeperHome == null) {
+        throw new IOException("Zookeeper home configuration parameter i.e. " +
+            "'hbase.it.clustermanager.zookeeper.home' is not configured properly.");
+      }
+      if (tmp != null) {
+        confDir = String.format("--config %s", tmp);
+      } else {
+        confDir = "";
+      }
+    }
+
+    @Override
+    public String getCommand(ServiceType service, Operation op) {
+      return String.format("%s/bin/zkServer.sh %s", zookeeperHome, op.toString().toLowerCase());
+    }
+
+    @Override
+    protected String findPidCommand(ServiceType service) {
+      return String.format("ps aux | grep %s | grep -v grep | tr -s ' ' | cut -d ' ' -f2",
+          service);
+    }
+  }
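
A sketch of the command strings the two providers format, assuming HADOOP_HOME=/opt/hadoop, HADOOP_CONF_DIR=/etc/hadoop/conf, and ZOOBINDIR=/opt/zookeeper (all three paths are examples):

    String dnStop = String.format("%s/sbin/hadoop-daemon.sh %s %s %s",
        "/opt/hadoop", "--config /etc/hadoop/conf", "stop", "datanode");
    // -> /opt/hadoop/sbin/hadoop-daemon.sh --config /etc/hadoop/conf stop datanode
    String zkRestart = String.format("%s/bin/zkServer.sh %s", "/opt/zookeeper", "restart");
    // -> /opt/zookeeper/bin/zkServer.sh restart (the service and conf dir are not used here)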
   public HBaseClusterManager() {
   }
 
-  protected CommandProvider getCommandProvider(ServiceType service) {
-    //TODO: make it pluggable, or auto-detect the best command provider, should work with
-    //hadoop daemons as well
-    return new HBaseShellCommandProvider(getConf());
+  protected CommandProvider getCommandProvider(ServiceType service) throws IOException {
+    switch (service) {
+      case HADOOP_DATANODE:
+        return new HadoopShellCommandProvider(getConf());
+      case ZOOKEEPER_SERVER:
+        return new ZookeeperShellCommandProvider(getConf());
+      default:
+        return new HBaseShellCommandProvider(getConf());
+    }
   }
 
   /**
    * Execute the given command on the host using SSH
    * @return pair of exit code and command output
    * @throws IOException if something goes wrong.
    */
-  private Pair<Integer, String> exec(String hostname, String... cmd) throws IOException {
+  private Pair<Integer, String> exec(String hostname, ServiceType service, String... cmd)
+      throws IOException {
     LOG.info("Executing remote command: " + StringUtils.join(cmd, " ") + " , hostname:" + hostname);
 
-    RemoteShell shell = new RemoteShell(hostname, cmd);
+    RemoteShell shell = new RemoteShell(hostname, getServiceUser(service), cmd);
     try {
       shell.execute();
     } catch (Shell.ExitCodeException ex) {
@@ -222,12 +314,12 @@ public class HBaseClusterManager extends Configured implements ClusterManager {
     return new Pair<Integer, String>(shell.getExitCode(), shell.getOutput());
   }
 
-  private Pair<Integer, String> execWithRetries(String hostname, String... cmd)
+  private Pair<Integer, String> execWithRetries(String hostname, ServiceType service, String... cmd)
       throws IOException {
     RetryCounter retryCounter = retryCounterFactory.create();
     while (true) {
       try {
-        return exec(hostname, cmd);
+        return exec(hostname, service, cmd);
       } catch (IOException e) {
         retryOrThrow(retryCounter, e, hostname, cmd);
       }
@@ -252,7 +344,7 @@ public class HBaseClusterManager extends Configured implements ClusterManager {
   }
 
   private void exec(String hostname, ServiceType service, Operation op) throws IOException {
-    execWithRetries(hostname, getCommandProvider(service).getCommand(service, op));
+    execWithRetries(hostname, service, getCommandProvider(service).getCommand(service, op));
   }
 
   @Override
@@ -271,13 +363,13 @@ public class HBaseClusterManager extends Configured implements ClusterManager {
   }
 
   public void signal(ServiceType service, String signal, String hostname) throws IOException {
-    execWithRetries(hostname, getCommandProvider(service).signalCommand(service, signal));
+    execWithRetries(hostname, service, getCommandProvider(service).signalCommand(service, signal));
   }
 
   @Override
   public boolean isRunning(ServiceType service, String hostname, int port) throws IOException {
-    String ret = execWithRetries(hostname, getCommandProvider(service).isRunningCommand(service))
-        .getSecond();
+    String ret = execWithRetries(hostname, service,
+      getCommandProvider(service).isRunningCommand(service)).getSecond();
     return ret.length() > 0;
   }

File: RESTApiClusterManager.java

@@ -321,6 +321,7 @@ public class RESTApiClusterManager extends Configured implements ClusterManager
   // The RoleCommand enum is used by the doRoleCommand method to guard against non-existent methods
   // being invoked on a given role.
+  // TODO: Integrate zookeeper and hdfs related failure injections (Ref: HBASE-14261).
   private enum RoleCommand {
     START, STOP, RESTART;

File: Action.java

@@ -27,6 +27,7 @@ import java.util.List;
 import org.apache.commons.lang.math.RandomUtils;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hbase.ClusterStatus;
 import org.apache.hadoop.hbase.HBaseCluster;
 import org.apache.hadoop.hbase.HRegionInfo;
@@ -35,7 +36,6 @@ import org.apache.hadoop.hbase.ServerLoad;
 import org.apache.hadoop.hbase.ServerName;
 import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
 import org.apache.hadoop.hbase.client.Admin;
-import org.apache.hadoop.hbase.client.HBaseAdmin;
 import org.apache.hadoop.hbase.util.Bytes;
 
 /**
@@ -49,6 +49,14 @@ public class Action {
       "hbase.chaosmonkey.action.startmastertimeout";
   public static final String KILL_RS_TIMEOUT_KEY = "hbase.chaosmonkey.action.killrstimeout";
   public static final String START_RS_TIMEOUT_KEY = "hbase.chaosmonkey.action.startrstimeout";
+  public static final String KILL_ZK_NODE_TIMEOUT_KEY =
+      "hbase.chaosmonkey.action.killzknodetimeout";
+  public static final String START_ZK_NODE_TIMEOUT_KEY =
+      "hbase.chaosmonkey.action.startzknodetimeout";
+  public static final String KILL_DATANODE_TIMEOUT_KEY =
+      "hbase.chaosmonkey.action.killdatanodetimeout";
+  public static final String START_DATANODE_TIMEOUT_KEY =
+      "hbase.chaosmonkey.action.startdatanodetimeout";
 
   protected static final Log LOG = LogFactory.getLog(Action.class);
@@ -56,6 +64,10 @@ public class Action {
   protected static final long START_MASTER_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
   protected static final long KILL_RS_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
   protected static final long START_RS_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
+  protected static final long KILL_ZK_NODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
+  protected static final long START_ZK_NODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
+  protected static final long KILL_DATANODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
+  protected static final long START_DATANODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
 
   protected ActionContext context;
   protected HBaseCluster cluster;
@@ -66,6 +78,10 @@ public class Action {
   protected long startMasterTimeout;
   protected long killRsTimeout;
   protected long startRsTimeout;
+  protected long killZkNodeTimeout;
+  protected long startZkNodeTimeout;
+  protected long killDataNodeTimeout;
+  protected long startDataNodeTimeout;
 
   public void init(ActionContext context) throws IOException {
     this.context = context;
@@ -80,6 +96,14 @@ public class Action {
         START_MASTER_TIMEOUT_DEFAULT);
     killRsTimeout = cluster.getConf().getLong(KILL_RS_TIMEOUT_KEY, KILL_RS_TIMEOUT_DEFAULT);
     startRsTimeout = cluster.getConf().getLong(START_RS_TIMEOUT_KEY, START_RS_TIMEOUT_DEFAULT);
+    killZkNodeTimeout = cluster.getConf().getLong(KILL_ZK_NODE_TIMEOUT_KEY,
+        KILL_ZK_NODE_TIMEOUT_DEFAULT);
+    startZkNodeTimeout = cluster.getConf().getLong(START_ZK_NODE_TIMEOUT_KEY,
+        START_ZK_NODE_TIMEOUT_DEFAULT);
+    killDataNodeTimeout = cluster.getConf().getLong(KILL_DATANODE_TIMEOUT_KEY,
+        KILL_DATANODE_TIMEOUT_DEFAULT);
+    startDataNodeTimeout = cluster.getConf().getLong(START_DATANODE_TIMEOUT_KEY,
+        START_DATANODE_TIMEOUT_DEFAULT);
   }
 
   public void perform() throws Exception { }
@@ -135,6 +159,36 @@ public class Action {
         + cluster.getClusterStatus().getServersSize());
   }
 
+  protected void killZKNode(ServerName server) throws IOException {
+    LOG.info("Killing zookeeper node:" + server);
+    cluster.killZkNode(server);
+    cluster.waitForZkNodeToStop(server, killZkNodeTimeout);
+    LOG.info("Killed zookeeper node:" + server + ". Reported num of rs:"
+        + cluster.getClusterStatus().getServersSize());
+  }
+
+  protected void startZKNode(ServerName server) throws IOException {
+    LOG.info("Starting zookeeper node:" + server.getHostname());
+    cluster.startZkNode(server.getHostname(), server.getPort());
+    cluster.waitForZkNodeToStart(server, startZkNodeTimeout);
+    LOG.info("Started zookeeper node:" + server);
+  }
+
+  protected void killDataNode(ServerName server) throws IOException {
+    LOG.info("Killing datanode:" + server);
+    cluster.killDataNode(server);
+    cluster.waitForDataNodeToStop(server, killDataNodeTimeout);
+    LOG.info("Killed datanode:" + server + ". Reported num of rs:"
+        + cluster.getClusterStatus().getServersSize());
+  }
+
+  protected void startDataNode(ServerName server) throws IOException {
+    LOG.info("Starting datanode:" + server.getHostname());
+    cluster.startDataNode(server);
+    cluster.waitForDataNodeToStart(server, startDataNodeTimeout);
+    LOG.info("Started datanode:" + server);
+  }
+
   protected void unbalanceRegions(ClusterStatus clusterStatus,
       List<ServerName> fromServers, List<ServerName> toServers,
       double fractionOfRegions) throws Exception {
@@ -174,6 +228,10 @@ public class Action {
     }
   }
 
+  public Configuration getConf() {
+    return cluster.getConf();
+  }
+
   /**
    * Context for Action's
    */
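
All four new timeouts default to PolicyBasedChaosMonkey.TIMEOUT; a minimal sketch of overriding them through the cluster configuration (the values are examples):

    Configuration conf = cluster.getConf();
    conf.setLong(Action.KILL_ZK_NODE_TIMEOUT_KEY, 60000L);
    conf.setLong(Action.START_ZK_NODE_TIMEOUT_KEY, 120000L);
    conf.setLong(Action.KILL_DATANODE_TIMEOUT_KEY, 60000L);
    conf.setLong(Action.START_DATANODE_TIMEOUT_KEY, 120000L);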

File: RestartActionBaseAction.java

@@ -51,4 +51,18 @@ public class RestartActionBaseAction extends Action {
     sleep(sleepTime);
     startRs(server);
   }
+
+  void restartZKNode(ServerName server, long sleepTime) throws IOException {
+    sleepTime = Math.max(sleepTime, 1000);
+    killZKNode(server);
+    sleep(sleepTime);
+    startZKNode(server);
+  }
+
+  void restartDataNode(ServerName server, long sleepTime) throws IOException {
+    sleepTime = Math.max(sleepTime, 1000);
+    killDataNode(server);
+    sleep(sleepTime);
+    startDataNode(server);
+  }
 }

File: RestartRandomDataNodeAction.java (new file)

@@ -0,0 +1,58 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.chaos.actions;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.hadoop.hdfs.DFSClient;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
/**
* Action that restarts a random datanode.
*/
public class RestartRandomDataNodeAction extends RestartActionBaseAction {
public RestartRandomDataNodeAction(long sleepTime) {
super(sleepTime);
}
@Override
public void perform() throws Exception {
LOG.info("Performing action: Restart random data node");
ServerName server = PolicyBasedChaosMonkey.selectRandomItem(getDataNodes());
restartDataNode(server, sleepTime);
}
public ServerName[] getDataNodes() throws IOException {
DistributedFileSystem fs = (DistributedFileSystem) FSUtils.getRootDir(getConf())
.getFileSystem(getConf());
DFSClient dfsClient = fs.getClient();
List<ServerName> hosts = new LinkedList<ServerName>();
for (DatanodeInfo dataNode: dfsClient.datanodeReport(HdfsConstants.DatanodeReportType.LIVE)) {
hosts.add(ServerName.valueOf(dataNode.getHostName(), -1, -1));
}
return hosts.toArray(new ServerName[hosts.size()]);
}
}

File: RestartRandomZKNodeAction.java (new file)

@@ -0,0 +1,40 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.chaos.actions;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
import org.apache.hadoop.hbase.zookeeper.ZKServerTool;
/**
* Action that restarts a random zookeeper node.
*/
public class RestartRandomZKNodeAction extends RestartActionBaseAction {
public RestartRandomZKNodeAction(long sleepTime) {
super(sleepTime);
}
@Override
public void perform() throws Exception {
LOG.info("Performing action: Restart random zookeeper node");
ServerName server = PolicyBasedChaosMonkey.selectRandomItem(
ZKServerTool.readZKNodes(getConf()));
restartZKNode(server, sleepTime);
}
}

File: MonkeyFactory.java

@@ -70,12 +70,14 @@ public abstract class MonkeyFactory {
   public static final String STRESS_AM = "stressAM";
   public static final String NO_KILL = "noKill";
   public static final String MASTER_KILLING = "masterKilling";
+  public static final String SERVER_AND_DEPENDENCIES_KILLING = "serverAndDependenciesKilling";
 
   public static Map<String, MonkeyFactory> FACTORIES = ImmutableMap.<String,MonkeyFactory>builder()
     .put(CALM, new CalmMonkeyFactory())
     .put(SLOW_DETERMINISTIC, new SlowDeterministicMonkeyFactory())
     .put(UNBALANCE, new UnbalanceMonkeyFactory())
     .put(SERVER_KILLING, new ServerKillingMonkeyFactory())
+    .put(SERVER_AND_DEPENDENCIES_KILLING, new ServerAndDependenciesKillingMonkeyFactory())
     .put(STRESS_AM, new StressAssignmentManagerMonkeyFactory())
     .put(NO_KILL, new NoKillMonkeyFactory())
     .put(MASTER_KILLING, new MasterKillingMonkeyFactory())
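
With the entry above, the new monkey can be selected by name; a sketch, assuming the usual MonkeyFactory.getFactory lookup that the integration tests' -m/--monkey option goes through:

    MonkeyFactory factory = MonkeyFactory.getFactory("serverAndDependenciesKilling");
    // equivalently: MonkeyFactory.getFactory(MonkeyFactory.SERVER_AND_DEPENDENCIES_KILLING)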

File: ServerAndDependenciesKillingMonkeyFactory.java (new file)

@@ -0,0 +1,65 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.chaos.factories;
import org.apache.hadoop.hbase.chaos.actions.Action;
import org.apache.hadoop.hbase.chaos.actions.DumpClusterStatusAction;
import org.apache.hadoop.hbase.chaos.actions.ForceBalancerAction;
import org.apache.hadoop.hbase.chaos.actions.RestartActiveMasterAction;
import org.apache.hadoop.hbase.chaos.actions.RestartRandomDataNodeAction;
import org.apache.hadoop.hbase.chaos.actions.RestartRandomRsExceptMetaAction;
import org.apache.hadoop.hbase.chaos.actions.RestartRandomZKNodeAction;
import org.apache.hadoop.hbase.chaos.actions.RollingBatchRestartRsExceptMetaAction;
import org.apache.hadoop.hbase.chaos.monkies.ChaosMonkey;
import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
import org.apache.hadoop.hbase.chaos.policies.CompositeSequentialPolicy;
import org.apache.hadoop.hbase.chaos.policies.DoActionsOncePolicy;
import org.apache.hadoop.hbase.chaos.policies.PeriodicRandomActionPolicy;
/**
* Creates ChaosMonkeys for doing server restart actions, but not
* flush / compact / snapshot kind of actions.
*/
public class ServerAndDependenciesKillingMonkeyFactory extends MonkeyFactory {
@Override
public ChaosMonkey build() {
// Destructive actions to mess things around. Cannot run batch restart.
Action[] actions1 = new Action[]{
new RestartRandomRsExceptMetaAction(60000),
new RestartActiveMasterAction(5000),
new RollingBatchRestartRsExceptMetaAction(5000, 1.0f, 2), // only allow 2 servers to be dead.
new ForceBalancerAction(),
new RestartRandomDataNodeAction(60000),
new RestartRandomZKNodeAction(60000)
};
// Action to log more info for debugging
Action[] actions2 = new Action[]{
new DumpClusterStatusAction()
};
return new PolicyBasedChaosMonkey(util,
new CompositeSequentialPolicy(
new DoActionsOncePolicy(60 * 1000, actions1),
new PeriodicRandomActionPolicy(60 * 1000, actions1)),
new PeriodicRandomActionPolicy(60 * 1000, actions2));
}
}

File: ZKServerTool.java

@@ -19,9 +19,13 @@
 package org.apache.hadoop.hbase.zookeeper;
 
+import java.util.LinkedList;
+import java.util.List;
 import java.util.Properties;
 import java.util.Map.Entry;
 
+import org.apache.hadoop.hbase.HConstants;
+import org.apache.hadoop.hbase.ServerName;
 import org.apache.hadoop.hbase.classification.InterfaceAudience;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hbase.HBaseConfiguration;
@@ -33,12 +37,10 @@ import org.apache.hadoop.hbase.HBaseInterfaceAudience;
  */
 @InterfaceAudience.LimitedPrivate(HBaseInterfaceAudience.TOOLS)
 public class ZKServerTool {
-  /**
-   * Run the tool.
-   * @param args Command line arguments.
-   */
-  public static void main(String args[]) {
-    Configuration conf = HBaseConfiguration.create();
+  public static ServerName[] readZKNodes(Configuration conf) {
+    List<ServerName> hosts = new LinkedList<ServerName>();
     // Note that we do not simply grab the property
     // HConstants.ZOOKEEPER_QUORUM from the HBaseConfiguration because the
     // user may be using a zoo.cfg file.
@@ -49,8 +51,24 @@ public class ZKServerTool {
       if (key.startsWith("server.")) {
         String[] parts = value.split(":");
         String host = parts[0];
-        System.out.println("ZK host:" + host);
+        int port = HConstants.DEFAULT_ZOOKEPER_CLIENT_PORT;
+        if (parts.length > 1) {
+          port = Integer.parseInt(parts[1]);
+        }
+        hosts.add(ServerName.valueOf(host, port, -1));
       }
     }
+    return hosts.toArray(new ServerName[hosts.size()]);
+  }
+
+  /**
+   * Run the tool.
+   * @param args Command line arguments.
+   */
+  public static void main(String args[]) {
+    for(ServerName server: readZKNodes(HBaseConfiguration.create())) {
+      System.out.println("Zk host: " + server.getHostname());
+    }
+  }
 }
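
A minimal sketch of consuming the refactored helper, which is what RestartRandomZKNodeAction does via PolicyBasedChaosMonkey.selectRandomItem:

    Configuration conf = HBaseConfiguration.create();
    for (ServerName zk : ZKServerTool.readZKNodes(conf)) {
      // the port falls back to HConstants.DEFAULT_ZOOKEPER_CLIENT_PORT when the
      // server.N entry does not specify one
      System.out.println(zk.getHostname() + ":" + zk.getPort());
    }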

File: HBaseCluster.java

@@ -163,6 +163,81 @@ public abstract class HBaseCluster implements Closeable, Configurable {
   public abstract void waitForRegionServerToStop(ServerName serverName, long timeout)
       throws IOException;
 
+  /**
+   * Starts a new zookeeper node on the given hostname or if this is a mini/local cluster,
+   * silently logs warning message.
+   * @param hostname the hostname to start the regionserver on
+   * @throws IOException if something goes wrong
+   */
+  public abstract void startZkNode(String hostname, int port) throws IOException;
+
+  /**
+   * Kills the zookeeper node process if this is a distributed cluster, otherwise,
+   * this causes master to exit doing basic clean up only.
+   * @throws IOException if something goes wrong
+   */
+  public abstract void killZkNode(ServerName serverName) throws IOException;
+
+  /**
+   * Stops the region zookeeper if this is a distributed cluster, otherwise
+   * silently logs warning message.
+   * @throws IOException if something goes wrong
+   */
+  public abstract void stopZkNode(ServerName serverName) throws IOException;
+
+  /**
+   * Wait for the specified zookeeper node to join the cluster
+   * @return whether the operation finished with success
+   * @throws IOException if something goes wrong or timeout occurs
+   */
+  public abstract void waitForZkNodeToStart(ServerName serverName, long timeout)
+      throws IOException;
+
+  /**
+   * Wait for the specified zookeeper node to stop the thread / process.
+   * @return whether the operation finished with success
+   * @throws IOException if something goes wrong or timeout occurs
+   */
+  public abstract void waitForZkNodeToStop(ServerName serverName, long timeout)
+      throws IOException;
+
+  /**
+   * Starts a new datanode on the given hostname or if this is a mini/local cluster,
+   * silently logs warning message.
+   * @throws IOException if something goes wrong
+   */
+  public abstract void startDataNode(ServerName serverName) throws IOException;
+
+  /**
+   * Kills the datanode process if this is a distributed cluster, otherwise,
+   * this causes master to exit doing basic clean up only.
+   * @throws IOException if something goes wrong
+   */
+  public abstract void killDataNode(ServerName serverName) throws IOException;
+
+  /**
+   * Stops the datanode if this is a distributed cluster, otherwise
+   * silently logs warning message.
+   * @throws IOException if something goes wrong
+   */
+  public abstract void stopDataNode(ServerName serverName) throws IOException;
+
+  /**
+   * Wait for the specified datanode to join the cluster
+   * @return whether the operation finished with success
+   * @throws IOException if something goes wrong or timeout occurs
+   */
+  public abstract void waitForDataNodeToStart(ServerName serverName, long timeout)
+      throws IOException;
+
+  /**
+   * Wait for the specified datanode to stop the thread / process.
+   * @return whether the operation finished with success
+   * @throws IOException if something goes wrong or timeout occurs
+   */
+  public abstract void waitForDataNodeToStop(ServerName serverName, long timeout)
+      throws IOException;
+
   /**
    * Starts a new master on the given hostname or if this is a mini/local cluster,
    * starts a master locally.

File: MiniHBaseCluster.java

@@ -260,6 +260,56 @@ public class MiniHBaseCluster extends HBaseCluster {
     waitOnRegionServer(getRegionServerIndex(serverName));
   }
 
+  @Override
+  public void startZkNode(String hostname, int port) throws IOException {
+    LOG.warn("Starting zookeeper nodes on mini cluster is not supported");
+  }
+
+  @Override
+  public void killZkNode(ServerName serverName) throws IOException {
+    LOG.warn("Aborting zookeeper nodes on mini cluster is not supported");
+  }
+
+  @Override
+  public void stopZkNode(ServerName serverName) throws IOException {
+    LOG.warn("Stopping zookeeper nodes on mini cluster is not supported");
+  }
+
+  @Override
+  public void waitForZkNodeToStart(ServerName serverName, long timeout) throws IOException {
+    LOG.warn("Waiting for zookeeper nodes to start on mini cluster is not supported");
+  }
+
+  @Override
+  public void waitForZkNodeToStop(ServerName serverName, long timeout) throws IOException {
+    LOG.warn("Waiting for zookeeper nodes to stop on mini cluster is not supported");
+  }
+
+  @Override
+  public void startDataNode(ServerName serverName) throws IOException {
+    LOG.warn("Starting datanodes on mini cluster is not supported");
+  }
+
+  @Override
+  public void killDataNode(ServerName serverName) throws IOException {
+    LOG.warn("Aborting datanodes on mini cluster is not supported");
+  }
+
+  @Override
+  public void stopDataNode(ServerName serverName) throws IOException {
+    LOG.warn("Stopping datanodes on mini cluster is not supported");
+  }
+
+  @Override
+  public void waitForDataNodeToStart(ServerName serverName, long timeout) throws IOException {
+    LOG.warn("Waiting for datanodes to start on mini cluster is not supported");
+  }
+
+  @Override
+  public void waitForDataNodeToStop(ServerName serverName, long timeout) throws IOException {
+    LOG.warn("Waiting for datanodes to stop on mini cluster is not supported");
+  }
+
   @Override
   public void startMaster(String hostname, int port) throws IOException {
     this.startMaster();