HBASE-14261 Enhance Chaos Monkey framework by adding zookeeper and datanode fault injections.
This commit is contained in:
parent
3341f13e71
commit
e48991970d
|
@ -39,6 +39,7 @@ interface ClusterManager extends Configurable {
|
|||
HADOOP_DATANODE("datanode"),
|
||||
HADOOP_JOBTRACKER("jobtracker"),
|
||||
HADOOP_TASKTRACKER("tasktracker"),
|
||||
ZOOKEEPER_SERVER("QuorumPeerMain"),
|
||||
HBASE_MASTER("master"),
|
||||
HBASE_REGIONSERVER("regionserver");
|
||||
|
||||
|
|
|
@ -114,16 +114,14 @@ public class DistributedHBaseCluster extends HBaseCluster {
|
|||
public void killRegionServer(ServerName serverName) throws IOException {
|
||||
LOG.info("Aborting RS: " + serverName.getServerName());
|
||||
clusterManager.kill(ServiceType.HBASE_REGIONSERVER,
|
||||
serverName.getHostname(),
|
||||
serverName.getPort());
|
||||
serverName.getHostname(), serverName.getPort());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void stopRegionServer(ServerName serverName) throws IOException {
|
||||
LOG.info("Stopping RS: " + serverName.getServerName());
|
||||
clusterManager.stop(ServiceType.HBASE_REGIONSERVER,
|
||||
serverName.getHostname(),
|
||||
serverName.getPort());
|
||||
serverName.getHostname(), serverName.getPort());
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -131,20 +129,96 @@ public class DistributedHBaseCluster extends HBaseCluster {
|
|||
waitForServiceToStop(ServiceType.HBASE_REGIONSERVER, serverName, timeout);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void startZkNode(String hostname, int port) throws IOException {
|
||||
LOG.info("Starting Zookeeper node on: " + hostname);
|
||||
clusterManager.start(ServiceType.ZOOKEEPER_SERVER, hostname, port);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void killZkNode(ServerName serverName) throws IOException {
|
||||
LOG.info("Aborting Zookeeper node on: " + serverName.getServerName());
|
||||
clusterManager.kill(ServiceType.ZOOKEEPER_SERVER,
|
||||
serverName.getHostname(), serverName.getPort());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void stopZkNode(ServerName serverName) throws IOException {
|
||||
LOG.info("Stopping Zookeeper node: " + serverName.getServerName());
|
||||
clusterManager.stop(ServiceType.ZOOKEEPER_SERVER,
|
||||
serverName.getHostname(), serverName.getPort());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void waitForZkNodeToStart(ServerName serverName, long timeout) throws IOException {
|
||||
waitForServiceToStart(ServiceType.ZOOKEEPER_SERVER, serverName, timeout);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void waitForZkNodeToStop(ServerName serverName, long timeout) throws IOException {
|
||||
waitForServiceToStop(ServiceType.ZOOKEEPER_SERVER, serverName, timeout);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void startDataNode(ServerName serverName) throws IOException {
|
||||
LOG.info("Starting data node on: " + serverName.getServerName());
|
||||
clusterManager.start(ServiceType.HADOOP_DATANODE,
|
||||
serverName.getHostname(), serverName.getPort());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void killDataNode(ServerName serverName) throws IOException {
|
||||
LOG.info("Aborting data node on: " + serverName.getServerName());
|
||||
clusterManager.kill(ServiceType.HADOOP_DATANODE,
|
||||
serverName.getHostname(), serverName.getPort());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void stopDataNode(ServerName serverName) throws IOException {
|
||||
LOG.info("Stopping data node on: " + serverName.getServerName());
|
||||
clusterManager.stop(ServiceType.HADOOP_DATANODE,
|
||||
serverName.getHostname(), serverName.getPort());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void waitForDataNodeToStart(ServerName serverName, long timeout) throws IOException {
|
||||
waitForServiceToStart(ServiceType.HADOOP_DATANODE, serverName, timeout);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void waitForDataNodeToStop(ServerName serverName, long timeout) throws IOException {
|
||||
waitForServiceToStop(ServiceType.HADOOP_DATANODE, serverName, timeout);
|
||||
}
|
||||
|
||||
private void waitForServiceToStop(ServiceType service, ServerName serverName, long timeout)
|
||||
throws IOException {
|
||||
LOG.info("Waiting service:" + service + " to stop: " + serverName.getServerName());
|
||||
LOG.info("Waiting for service: " + service + " to stop: " + serverName.getServerName());
|
||||
long start = System.currentTimeMillis();
|
||||
|
||||
while ((System.currentTimeMillis() - start) < timeout) {
|
||||
if (!clusterManager.isRunning(service, serverName.getHostname(), serverName.getPort())) {
|
||||
return;
|
||||
}
|
||||
Threads.sleep(1000);
|
||||
Threads.sleep(100);
|
||||
}
|
||||
throw new IOException("did timeout waiting for service to stop:" + serverName);
|
||||
}
|
||||
|
||||
private void waitForServiceToStart(ServiceType service, ServerName serverName, long timeout)
|
||||
throws IOException {
|
||||
LOG.info("Waiting for service: " + service + " to start: " + serverName.getServerName());
|
||||
long start = System.currentTimeMillis();
|
||||
|
||||
while ((System.currentTimeMillis() - start) < timeout) {
|
||||
if (clusterManager.isRunning(service, serverName.getHostname(), serverName.getPort())) {
|
||||
return;
|
||||
}
|
||||
Threads.sleep(100);
|
||||
}
|
||||
throw new IOException("did timeout waiting for service to start:" + serverName);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public MasterService.BlockingInterface getMasterAdminService()
|
||||
throws IOException {
|
||||
|
|
|
@ -54,9 +54,11 @@ public class HBaseClusterManager extends Configured implements ClusterManager {
|
|||
|
||||
/**
|
||||
* The command format that is used to execute the remote command. Arguments:
|
||||
* 1 SSH options, 2 user name , 3 "@" if username is set, 4 host, 5 original command.
|
||||
* 1 SSH options, 2 user name , 3 "@" if username is set, 4 host,
|
||||
* 5 original command, 6 service user.
|
||||
*/
|
||||
private static final String DEFAULT_TUNNEL_CMD = "/usr/bin/ssh %1$s %2$s%3$s%4$s \"%5$s\"";
|
||||
private static final String DEFAULT_TUNNEL_CMD =
|
||||
"/usr/bin/ssh %1$s %2$s%3$s%4$s \"sudo -u %6$s %5$s\"";
|
||||
private String tunnelCmd;
|
||||
|
||||
private static final String RETRY_ATTEMPTS_KEY = "hbase.it.clustermanager.retry.attempts";
|
||||
|
@ -93,11 +95,24 @@ public class HBaseClusterManager extends Configured implements ClusterManager {
|
|||
.setSleepInterval(conf.getLong(RETRY_SLEEP_INTERVAL_KEY, DEFAULT_RETRY_SLEEP_INTERVAL)));
|
||||
}
|
||||
|
||||
private String getServiceUser(ServiceType service) {
|
||||
Configuration conf = getConf();
|
||||
switch (service) {
|
||||
case HADOOP_DATANODE:
|
||||
return conf.get("hbase.it.clustermanager.hadoop.hdfs.user", "hdfs");
|
||||
case ZOOKEEPER_SERVER:
|
||||
return conf.get("hbase.it.clustermanager.zookeeper.user", "zookeeper");
|
||||
default:
|
||||
return conf.get("hbase.it.clustermanager.hbase.user", "hbase");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Executes commands over SSH
|
||||
*/
|
||||
protected class RemoteShell extends Shell.ShellCommandExecutor {
|
||||
private String hostname;
|
||||
private String user;
|
||||
|
||||
public RemoteShell(String hostname, String[] execString, File dir, Map<String, String> env,
|
||||
long timeout) {
|
||||
|
@ -120,11 +135,17 @@ public class HBaseClusterManager extends Configured implements ClusterManager {
|
|||
this.hostname = hostname;
|
||||
}
|
||||
|
||||
public RemoteShell(String hostname, String user, String[] execString) {
|
||||
super(execString);
|
||||
this.hostname = hostname;
|
||||
this.user = user;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String[] getExecString() {
|
||||
String at = sshUserName.isEmpty() ? "" : "@";
|
||||
String remoteCmd = StringUtils.join(super.getExecString(), " ");
|
||||
String cmd = String.format(tunnelCmd, sshOptions, sshUserName, at, hostname, remoteCmd);
|
||||
String cmd = String.format(tunnelCmd, sshOptions, sshUserName, at, hostname, remoteCmd, user);
|
||||
LOG.info("Executing full command [" + cmd + "]");
|
||||
return new String[] { "/usr/bin/env", "bash", "-c", cmd };
|
||||
}
|
||||
|
@ -188,13 +209,83 @@ public class HBaseClusterManager extends Configured implements ClusterManager {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* CommandProvider to manage the service using sbin/hadoop-* scripts.
|
||||
*/
|
||||
static class HadoopShellCommandProvider extends CommandProvider {
|
||||
private final String hadoopHome;
|
||||
private final String confDir;
|
||||
|
||||
HadoopShellCommandProvider(Configuration conf) throws IOException {
|
||||
hadoopHome = conf.get("hbase.it.clustermanager.hadoop.home",
|
||||
System.getenv("HADOOP_HOME"));
|
||||
String tmp = conf.get("hbase.it.clustermanager.hadoop.conf.dir",
|
||||
System.getenv("HADOOP_CONF_DIR"));
|
||||
if (hadoopHome == null) {
|
||||
throw new IOException("Hadoop home configuration parameter i.e. " +
|
||||
"'hbase.it.clustermanager.hadoop.home' is not configured properly.");
|
||||
}
|
||||
if (tmp != null) {
|
||||
confDir = String.format("--config %s", tmp);
|
||||
} else {
|
||||
confDir = "";
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getCommand(ServiceType service, Operation op) {
|
||||
return String.format("%s/sbin/hadoop-daemon.sh %s %s %s", hadoopHome, confDir,
|
||||
op.toString().toLowerCase(), service);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* CommandProvider to manage the service using bin/zk* scripts.
|
||||
*/
|
||||
static class ZookeeperShellCommandProvider extends CommandProvider {
|
||||
private final String zookeeperHome;
|
||||
private final String confDir;
|
||||
|
||||
ZookeeperShellCommandProvider(Configuration conf) throws IOException {
|
||||
zookeeperHome = conf.get("hbase.it.clustermanager.zookeeper.home",
|
||||
System.getenv("ZOOBINDIR"));
|
||||
String tmp = conf.get("hbase.it.clustermanager.zookeeper.conf.dir",
|
||||
System.getenv("ZOOCFGDIR"));
|
||||
if (zookeeperHome == null) {
|
||||
throw new IOException("Zookeeper home configuration parameter i.e. " +
|
||||
"'hbase.it.clustermanager.zookeeper.home' is not configured properly.");
|
||||
}
|
||||
if (tmp != null) {
|
||||
confDir = String.format("--config %s", tmp);
|
||||
} else {
|
||||
confDir = "";
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getCommand(ServiceType service, Operation op) {
|
||||
return String.format("%s/bin/zkServer.sh %s", zookeeperHome, op.toString().toLowerCase());
|
||||
}
|
||||
|
||||
@Override
|
||||
protected String findPidCommand(ServiceType service) {
|
||||
return String.format("ps aux | grep %s | grep -v grep | tr -s ' ' | cut -d ' ' -f2",
|
||||
service);
|
||||
}
|
||||
}
|
||||
|
||||
public HBaseClusterManager() {
|
||||
}
|
||||
|
||||
protected CommandProvider getCommandProvider(ServiceType service) {
|
||||
//TODO: make it pluggable, or auto-detect the best command provider, should work with
|
||||
//hadoop daemons as well
|
||||
return new HBaseShellCommandProvider(getConf());
|
||||
protected CommandProvider getCommandProvider(ServiceType service) throws IOException {
|
||||
switch (service) {
|
||||
case HADOOP_DATANODE:
|
||||
return new HadoopShellCommandProvider(getConf());
|
||||
case ZOOKEEPER_SERVER:
|
||||
return new ZookeeperShellCommandProvider(getConf());
|
||||
default:
|
||||
return new HBaseShellCommandProvider(getConf());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -202,10 +293,11 @@ public class HBaseClusterManager extends Configured implements ClusterManager {
|
|||
* @return pair of exit code and command output
|
||||
* @throws IOException if something goes wrong.
|
||||
*/
|
||||
private Pair<Integer, String> exec(String hostname, String... cmd) throws IOException {
|
||||
private Pair<Integer, String> exec(String hostname, ServiceType service, String... cmd)
|
||||
throws IOException {
|
||||
LOG.info("Executing remote command: " + StringUtils.join(cmd, " ") + " , hostname:" + hostname);
|
||||
|
||||
RemoteShell shell = new RemoteShell(hostname, cmd);
|
||||
RemoteShell shell = new RemoteShell(hostname, getServiceUser(service), cmd);
|
||||
try {
|
||||
shell.execute();
|
||||
} catch (Shell.ExitCodeException ex) {
|
||||
|
@ -222,12 +314,12 @@ public class HBaseClusterManager extends Configured implements ClusterManager {
|
|||
return new Pair<Integer, String>(shell.getExitCode(), shell.getOutput());
|
||||
}
|
||||
|
||||
private Pair<Integer, String> execWithRetries(String hostname, String... cmd)
|
||||
private Pair<Integer, String> execWithRetries(String hostname, ServiceType service, String... cmd)
|
||||
throws IOException {
|
||||
RetryCounter retryCounter = retryCounterFactory.create();
|
||||
while (true) {
|
||||
try {
|
||||
return exec(hostname, cmd);
|
||||
return exec(hostname, service, cmd);
|
||||
} catch (IOException e) {
|
||||
retryOrThrow(retryCounter, e, hostname, cmd);
|
||||
}
|
||||
|
@ -252,7 +344,7 @@ public class HBaseClusterManager extends Configured implements ClusterManager {
|
|||
}
|
||||
|
||||
private void exec(String hostname, ServiceType service, Operation op) throws IOException {
|
||||
execWithRetries(hostname, getCommandProvider(service).getCommand(service, op));
|
||||
execWithRetries(hostname, service, getCommandProvider(service).getCommand(service, op));
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -271,13 +363,13 @@ public class HBaseClusterManager extends Configured implements ClusterManager {
|
|||
}
|
||||
|
||||
public void signal(ServiceType service, String signal, String hostname) throws IOException {
|
||||
execWithRetries(hostname, getCommandProvider(service).signalCommand(service, signal));
|
||||
execWithRetries(hostname, service, getCommandProvider(service).signalCommand(service, signal));
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isRunning(ServiceType service, String hostname, int port) throws IOException {
|
||||
String ret = execWithRetries(hostname, getCommandProvider(service).isRunningCommand(service))
|
||||
.getSecond();
|
||||
String ret = execWithRetries(hostname, service,
|
||||
getCommandProvider(service).isRunningCommand(service)).getSecond();
|
||||
return ret.length() > 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -321,6 +321,7 @@ public class RESTApiClusterManager extends Configured implements ClusterManager
|
|||
|
||||
// The RoleCommand enum is used by the doRoleCommand method to guard against non-existent methods
|
||||
// being invoked on a given role.
|
||||
// TODO: Integrate zookeeper and hdfs related failure injections (Ref: HBASE-14261).
|
||||
private enum RoleCommand {
|
||||
START, STOP, RESTART;
|
||||
|
||||
|
|
|
@ -27,6 +27,7 @@ import java.util.List;
|
|||
import org.apache.commons.lang.math.RandomUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.hbase.ClusterStatus;
|
||||
import org.apache.hadoop.hbase.HBaseCluster;
|
||||
import org.apache.hadoop.hbase.HRegionInfo;
|
||||
|
@ -35,7 +36,6 @@ import org.apache.hadoop.hbase.ServerLoad;
|
|||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
|
||||
import org.apache.hadoop.hbase.client.Admin;
|
||||
import org.apache.hadoop.hbase.client.HBaseAdmin;
|
||||
import org.apache.hadoop.hbase.util.Bytes;
|
||||
|
||||
/**
|
||||
|
@ -44,11 +44,19 @@ import org.apache.hadoop.hbase.util.Bytes;
|
|||
public class Action {
|
||||
|
||||
public static final String KILL_MASTER_TIMEOUT_KEY =
|
||||
"hbase.chaosmonkey.action.killmastertimeout";
|
||||
"hbase.chaosmonkey.action.killmastertimeout";
|
||||
public static final String START_MASTER_TIMEOUT_KEY =
|
||||
"hbase.chaosmonkey.action.startmastertimeout";
|
||||
"hbase.chaosmonkey.action.startmastertimeout";
|
||||
public static final String KILL_RS_TIMEOUT_KEY = "hbase.chaosmonkey.action.killrstimeout";
|
||||
public static final String START_RS_TIMEOUT_KEY = "hbase.chaosmonkey.action.startrstimeout";
|
||||
public static final String KILL_ZK_NODE_TIMEOUT_KEY =
|
||||
"hbase.chaosmonkey.action.killzknodetimeout";
|
||||
public static final String START_ZK_NODE_TIMEOUT_KEY =
|
||||
"hbase.chaosmonkey.action.startzknodetimeout";
|
||||
public static final String KILL_DATANODE_TIMEOUT_KEY =
|
||||
"hbase.chaosmonkey.action.killdatanodetimeout";
|
||||
public static final String START_DATANODE_TIMEOUT_KEY =
|
||||
"hbase.chaosmonkey.action.startdatanodetimeout";
|
||||
|
||||
protected static final Log LOG = LogFactory.getLog(Action.class);
|
||||
|
||||
|
@ -56,6 +64,10 @@ public class Action {
|
|||
protected static final long START_MASTER_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
|
||||
protected static final long KILL_RS_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
|
||||
protected static final long START_RS_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
|
||||
protected static final long KILL_ZK_NODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
|
||||
protected static final long START_ZK_NODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
|
||||
protected static final long KILL_DATANODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
|
||||
protected static final long START_DATANODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
|
||||
|
||||
protected ActionContext context;
|
||||
protected HBaseCluster cluster;
|
||||
|
@ -66,6 +78,10 @@ public class Action {
|
|||
protected long startMasterTimeout;
|
||||
protected long killRsTimeout;
|
||||
protected long startRsTimeout;
|
||||
protected long killZkNodeTimeout;
|
||||
protected long startZkNodeTimeout;
|
||||
protected long killDataNodeTimeout;
|
||||
protected long startDataNodeTimeout;
|
||||
|
||||
public void init(ActionContext context) throws IOException {
|
||||
this.context = context;
|
||||
|
@ -75,11 +91,19 @@ public class Action {
|
|||
initialServers = regionServers.toArray(new ServerName[regionServers.size()]);
|
||||
|
||||
killMasterTimeout = cluster.getConf().getLong(KILL_MASTER_TIMEOUT_KEY,
|
||||
KILL_MASTER_TIMEOUT_DEFAULT);
|
||||
KILL_MASTER_TIMEOUT_DEFAULT);
|
||||
startMasterTimeout = cluster.getConf().getLong(START_MASTER_TIMEOUT_KEY,
|
||||
START_MASTER_TIMEOUT_DEFAULT);
|
||||
START_MASTER_TIMEOUT_DEFAULT);
|
||||
killRsTimeout = cluster.getConf().getLong(KILL_RS_TIMEOUT_KEY, KILL_RS_TIMEOUT_DEFAULT);
|
||||
startRsTimeout = cluster.getConf().getLong(START_RS_TIMEOUT_KEY, START_RS_TIMEOUT_DEFAULT);
|
||||
killZkNodeTimeout = cluster.getConf().getLong(KILL_ZK_NODE_TIMEOUT_KEY,
|
||||
KILL_ZK_NODE_TIMEOUT_DEFAULT);
|
||||
startZkNodeTimeout = cluster.getConf().getLong(START_ZK_NODE_TIMEOUT_KEY,
|
||||
START_ZK_NODE_TIMEOUT_DEFAULT);
|
||||
killDataNodeTimeout = cluster.getConf().getLong(KILL_DATANODE_TIMEOUT_KEY,
|
||||
KILL_DATANODE_TIMEOUT_DEFAULT);
|
||||
startDataNodeTimeout = cluster.getConf().getLong(START_DATANODE_TIMEOUT_KEY,
|
||||
START_DATANODE_TIMEOUT_DEFAULT);
|
||||
}
|
||||
|
||||
public void perform() throws Exception { }
|
||||
|
@ -132,7 +156,37 @@ public class Action {
|
|||
cluster.startRegionServer(server.getHostname(), server.getPort());
|
||||
cluster.waitForRegionServerToStart(server.getHostname(), server.getPort(), startRsTimeout);
|
||||
LOG.info("Started region server:" + server + ". Reported num of rs:"
|
||||
+ cluster.getClusterStatus().getServersSize());
|
||||
+ cluster.getClusterStatus().getServersSize());
|
||||
}
|
||||
|
||||
protected void killZKNode(ServerName server) throws IOException {
|
||||
LOG.info("Killing zookeeper node:" + server);
|
||||
cluster.killZkNode(server);
|
||||
cluster.waitForZkNodeToStop(server, killZkNodeTimeout);
|
||||
LOG.info("Killed zookeeper node:" + server + ". Reported num of rs:"
|
||||
+ cluster.getClusterStatus().getServersSize());
|
||||
}
|
||||
|
||||
protected void startZKNode(ServerName server) throws IOException {
|
||||
LOG.info("Starting zookeeper node:" + server.getHostname());
|
||||
cluster.startZkNode(server.getHostname(), server.getPort());
|
||||
cluster.waitForZkNodeToStart(server, startZkNodeTimeout);
|
||||
LOG.info("Started zookeeper node:" + server);
|
||||
}
|
||||
|
||||
protected void killDataNode(ServerName server) throws IOException {
|
||||
LOG.info("Killing datanode:" + server);
|
||||
cluster.killDataNode(server);
|
||||
cluster.waitForDataNodeToStop(server, killDataNodeTimeout);
|
||||
LOG.info("Killed datanode:" + server + ". Reported num of rs:"
|
||||
+ cluster.getClusterStatus().getServersSize());
|
||||
}
|
||||
|
||||
protected void startDataNode(ServerName server) throws IOException {
|
||||
LOG.info("Starting datanode:" + server.getHostname());
|
||||
cluster.startDataNode(server);
|
||||
cluster.waitForDataNodeToStart(server, startDataNodeTimeout);
|
||||
LOG.info("Started datanode:" + server);
|
||||
}
|
||||
|
||||
protected void unbalanceRegions(ClusterStatus clusterStatus,
|
||||
|
@ -174,6 +228,10 @@ public class Action {
|
|||
}
|
||||
}
|
||||
|
||||
public Configuration getConf() {
|
||||
return cluster.getConf();
|
||||
}
|
||||
|
||||
/**
|
||||
* Context for Action's
|
||||
*/
|
||||
|
|
|
@ -51,4 +51,18 @@ public class RestartActionBaseAction extends Action {
|
|||
sleep(sleepTime);
|
||||
startRs(server);
|
||||
}
|
||||
|
||||
void restartZKNode(ServerName server, long sleepTime) throws IOException {
|
||||
sleepTime = Math.max(sleepTime, 1000);
|
||||
killZKNode(server);
|
||||
sleep(sleepTime);
|
||||
startZKNode(server);
|
||||
}
|
||||
|
||||
void restartDataNode(ServerName server, long sleepTime) throws IOException {
|
||||
sleepTime = Math.max(sleepTime, 1000);
|
||||
killDataNode(server);
|
||||
sleep(sleepTime);
|
||||
startDataNode(server);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,58 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* <p>
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* <p>
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.hbase.chaos.actions;
|
||||
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
|
||||
import org.apache.hadoop.hbase.util.FSUtils;
|
||||
import org.apache.hadoop.hdfs.DFSClient;
|
||||
import org.apache.hadoop.hdfs.DistributedFileSystem;
|
||||
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
|
||||
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Action that restarts a random datanode.
|
||||
*/
|
||||
public class RestartRandomDataNodeAction extends RestartActionBaseAction {
|
||||
public RestartRandomDataNodeAction(long sleepTime) {
|
||||
super(sleepTime);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void perform() throws Exception {
|
||||
LOG.info("Performing action: Restart random data node");
|
||||
ServerName server = PolicyBasedChaosMonkey.selectRandomItem(getDataNodes());
|
||||
restartDataNode(server, sleepTime);
|
||||
}
|
||||
|
||||
public ServerName[] getDataNodes() throws IOException {
|
||||
DistributedFileSystem fs = (DistributedFileSystem) FSUtils.getRootDir(getConf())
|
||||
.getFileSystem(getConf());
|
||||
DFSClient dfsClient = fs.getClient();
|
||||
List<ServerName> hosts = new LinkedList<ServerName>();
|
||||
for (DatanodeInfo dataNode: dfsClient.datanodeReport(HdfsConstants.DatanodeReportType.LIVE)) {
|
||||
hosts.add(ServerName.valueOf(dataNode.getHostName(), -1, -1));
|
||||
}
|
||||
return hosts.toArray(new ServerName[hosts.size()]);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,40 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* <p>
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* <p>
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.hbase.chaos.actions;
|
||||
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
|
||||
import org.apache.hadoop.hbase.zookeeper.ZKServerTool;
|
||||
|
||||
/**
|
||||
* Action that restarts a random zookeeper node.
|
||||
*/
|
||||
public class RestartRandomZKNodeAction extends RestartActionBaseAction {
|
||||
public RestartRandomZKNodeAction(long sleepTime) {
|
||||
super(sleepTime);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void perform() throws Exception {
|
||||
LOG.info("Performing action: Restart random zookeeper node");
|
||||
ServerName server = PolicyBasedChaosMonkey.selectRandomItem(
|
||||
ZKServerTool.readZKNodes(getConf()));
|
||||
restartZKNode(server, sleepTime);
|
||||
}
|
||||
}
|
|
@ -72,6 +72,7 @@ public abstract class MonkeyFactory {
|
|||
public static final String MASTER_KILLING = "masterKilling";
|
||||
public static final String MOB_NO_KILL = "mobNoKill";
|
||||
public static final String MOB_SLOW_DETERMINISTIC = "mobSlowDeterministic";
|
||||
public static final String SERVER_AND_DEPENDENCIES_KILLING = "serverAndDependenciesKilling";
|
||||
|
||||
public static Map<String, MonkeyFactory> FACTORIES = ImmutableMap.<String,MonkeyFactory>builder()
|
||||
.put(CALM, new CalmMonkeyFactory())
|
||||
|
@ -83,6 +84,7 @@ public abstract class MonkeyFactory {
|
|||
.put(MASTER_KILLING, new MasterKillingMonkeyFactory())
|
||||
.put(MOB_NO_KILL, new MobNoKillMonkeyFactory())
|
||||
.put(MOB_SLOW_DETERMINISTIC, new MobNoKillMonkeyFactory())
|
||||
.put(SERVER_AND_DEPENDENCIES_KILLING, new ServerAndDependenciesKillingMonkeyFactory())
|
||||
.build();
|
||||
|
||||
public static MonkeyFactory getFactory(String factoryName) {
|
||||
|
|
|
@ -0,0 +1,65 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.hbase.chaos.factories;
|
||||
|
||||
import org.apache.hadoop.hbase.chaos.actions.Action;
|
||||
import org.apache.hadoop.hbase.chaos.actions.DumpClusterStatusAction;
|
||||
import org.apache.hadoop.hbase.chaos.actions.ForceBalancerAction;
|
||||
import org.apache.hadoop.hbase.chaos.actions.RestartActiveMasterAction;
|
||||
import org.apache.hadoop.hbase.chaos.actions.RestartRandomDataNodeAction;
|
||||
import org.apache.hadoop.hbase.chaos.actions.RestartRandomRsExceptMetaAction;
|
||||
import org.apache.hadoop.hbase.chaos.actions.RestartRandomZKNodeAction;
|
||||
import org.apache.hadoop.hbase.chaos.actions.RollingBatchRestartRsExceptMetaAction;
|
||||
import org.apache.hadoop.hbase.chaos.monkies.ChaosMonkey;
|
||||
import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
|
||||
import org.apache.hadoop.hbase.chaos.policies.CompositeSequentialPolicy;
|
||||
import org.apache.hadoop.hbase.chaos.policies.DoActionsOncePolicy;
|
||||
import org.apache.hadoop.hbase.chaos.policies.PeriodicRandomActionPolicy;
|
||||
|
||||
/**
|
||||
* Creates ChaosMonkeys for doing server restart actions, but not
|
||||
* flush / compact / snapshot kind of actions.
|
||||
*/
|
||||
public class ServerAndDependenciesKillingMonkeyFactory extends MonkeyFactory {
|
||||
|
||||
@Override
|
||||
public ChaosMonkey build() {
|
||||
|
||||
// Destructive actions to mess things around. Cannot run batch restart.
|
||||
Action[] actions1 = new Action[]{
|
||||
new RestartRandomRsExceptMetaAction(60000),
|
||||
new RestartActiveMasterAction(5000),
|
||||
new RollingBatchRestartRsExceptMetaAction(5000, 1.0f, 2), // only allow 2 servers to be dead.
|
||||
new ForceBalancerAction(),
|
||||
new RestartRandomDataNodeAction(60000),
|
||||
new RestartRandomZKNodeAction(60000)
|
||||
};
|
||||
|
||||
// Action to log more info for debugging
|
||||
Action[] actions2 = new Action[]{
|
||||
new DumpClusterStatusAction()
|
||||
};
|
||||
|
||||
return new PolicyBasedChaosMonkey(util,
|
||||
new CompositeSequentialPolicy(
|
||||
new DoActionsOncePolicy(60 * 1000, actions1),
|
||||
new PeriodicRandomActionPolicy(60 * 1000, actions1)),
|
||||
new PeriodicRandomActionPolicy(60 * 1000, actions2));
|
||||
}
|
||||
}
|
|
@ -23,27 +23,42 @@ import org.apache.hadoop.conf.Configuration;
|
|||
import org.apache.hadoop.hbase.HBaseConfiguration;
|
||||
import org.apache.hadoop.hbase.HBaseInterfaceAudience;
|
||||
import org.apache.hadoop.hbase.HConstants;
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.classification.InterfaceAudience;
|
||||
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Tool for reading ZooKeeper servers from HBase XML configuration and producing
|
||||
* a line-by-line list for use by bash scripts.
|
||||
*/
|
||||
@InterfaceAudience.LimitedPrivate(HBaseInterfaceAudience.TOOLS)
|
||||
public class ZKServerTool {
|
||||
/**
|
||||
* Run the tool.
|
||||
* @param args Command line arguments.
|
||||
*/
|
||||
public static void main(String args[]) {
|
||||
Configuration conf = HBaseConfiguration.create();
|
||||
public static ServerName[] readZKNodes(Configuration conf) {
|
||||
List<ServerName> hosts = new LinkedList<ServerName>();
|
||||
String quorum = conf.get(HConstants.ZOOKEEPER_QUORUM, HConstants.LOCALHOST);
|
||||
|
||||
String[] values = quorum.split(",");
|
||||
for (String value : values) {
|
||||
String[] parts = value.split(":");
|
||||
String host = parts[0];
|
||||
System.out.println("ZK host:" + host);
|
||||
int port = HConstants.DEFAULT_ZOOKEPER_CLIENT_PORT;
|
||||
if (parts.length > 1) {
|
||||
port = Integer.parseInt(parts[1]);
|
||||
}
|
||||
hosts.add(ServerName.valueOf(host, port, -1));
|
||||
}
|
||||
return hosts.toArray(new ServerName[hosts.size()]);
|
||||
}
|
||||
|
||||
/**
|
||||
* Run the tool.
|
||||
* @param args Command line arguments.
|
||||
*/
|
||||
public static void main(String args[]) {
|
||||
for(ServerName server: readZKNodes(HBaseConfiguration.create())) {
|
||||
System.out.println("Zk host: " + server.getHostname());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -163,6 +163,81 @@ public abstract class HBaseCluster implements Closeable, Configurable {
|
|||
public abstract void waitForRegionServerToStop(ServerName serverName, long timeout)
|
||||
throws IOException;
|
||||
|
||||
/**
|
||||
* Starts a new zookeeper node on the given hostname or if this is a mini/local cluster,
|
||||
* silently logs warning message.
|
||||
* @param hostname the hostname to start the regionserver on
|
||||
* @throws IOException if something goes wrong
|
||||
*/
|
||||
public abstract void startZkNode(String hostname, int port) throws IOException;
|
||||
|
||||
/**
|
||||
* Kills the zookeeper node process if this is a distributed cluster, otherwise,
|
||||
* this causes master to exit doing basic clean up only.
|
||||
* @throws IOException if something goes wrong
|
||||
*/
|
||||
public abstract void killZkNode(ServerName serverName) throws IOException;
|
||||
|
||||
/**
|
||||
* Stops the region zookeeper if this is a distributed cluster, otherwise
|
||||
* silently logs warning message.
|
||||
* @throws IOException if something goes wrong
|
||||
*/
|
||||
public abstract void stopZkNode(ServerName serverName) throws IOException;
|
||||
|
||||
/**
|
||||
* Wait for the specified zookeeper node to join the cluster
|
||||
* @return whether the operation finished with success
|
||||
* @throws IOException if something goes wrong or timeout occurs
|
||||
*/
|
||||
public abstract void waitForZkNodeToStart(ServerName serverName, long timeout)
|
||||
throws IOException;
|
||||
|
||||
/**
|
||||
* Wait for the specified zookeeper node to stop the thread / process.
|
||||
* @return whether the operation finished with success
|
||||
* @throws IOException if something goes wrong or timeout occurs
|
||||
*/
|
||||
public abstract void waitForZkNodeToStop(ServerName serverName, long timeout)
|
||||
throws IOException;
|
||||
|
||||
/**
|
||||
* Starts a new datanode on the given hostname or if this is a mini/local cluster,
|
||||
* silently logs warning message.
|
||||
* @throws IOException if something goes wrong
|
||||
*/
|
||||
public abstract void startDataNode(ServerName serverName) throws IOException;
|
||||
|
||||
/**
|
||||
* Kills the datanode process if this is a distributed cluster, otherwise,
|
||||
* this causes master to exit doing basic clean up only.
|
||||
* @throws IOException if something goes wrong
|
||||
*/
|
||||
public abstract void killDataNode(ServerName serverName) throws IOException;
|
||||
|
||||
/**
|
||||
* Stops the datanode if this is a distributed cluster, otherwise
|
||||
* silently logs warning message.
|
||||
* @throws IOException if something goes wrong
|
||||
*/
|
||||
public abstract void stopDataNode(ServerName serverName) throws IOException;
|
||||
|
||||
/**
|
||||
* Wait for the specified datanode to join the cluster
|
||||
* @return whether the operation finished with success
|
||||
* @throws IOException if something goes wrong or timeout occurs
|
||||
*/
|
||||
public abstract void waitForDataNodeToStart(ServerName serverName, long timeout)
|
||||
throws IOException;
|
||||
|
||||
/**
|
||||
* Wait for the specified datanode to stop the thread / process.
|
||||
* @return whether the operation finished with success
|
||||
* @throws IOException if something goes wrong or timeout occurs
|
||||
*/
|
||||
public abstract void waitForDataNodeToStop(ServerName serverName, long timeout)
|
||||
throws IOException;
|
||||
|
||||
/**
|
||||
* Starts a new master on the given hostname or if this is a mini/local cluster,
|
||||
* starts a master locally.
|
||||
|
|
|
@ -260,6 +260,56 @@ public class MiniHBaseCluster extends HBaseCluster {
|
|||
waitOnRegionServer(getRegionServerIndex(serverName));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void startZkNode(String hostname, int port) throws IOException {
|
||||
LOG.warn("Starting zookeeper nodes on mini cluster is not supported");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void killZkNode(ServerName serverName) throws IOException {
|
||||
LOG.warn("Aborting zookeeper nodes on mini cluster is not supported");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void stopZkNode(ServerName serverName) throws IOException {
|
||||
LOG.warn("Stopping zookeeper nodes on mini cluster is not supported");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void waitForZkNodeToStart(ServerName serverName, long timeout) throws IOException {
|
||||
LOG.warn("Waiting for zookeeper nodes to start on mini cluster is not supported");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void waitForZkNodeToStop(ServerName serverName, long timeout) throws IOException {
|
||||
LOG.warn("Waiting for zookeeper nodes to stop on mini cluster is not supported");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void startDataNode(ServerName serverName) throws IOException {
|
||||
LOG.warn("Starting datanodes on mini cluster is not supported");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void killDataNode(ServerName serverName) throws IOException {
|
||||
LOG.warn("Aborting datanodes on mini cluster is not supported");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void stopDataNode(ServerName serverName) throws IOException {
|
||||
LOG.warn("Stopping datanodes on mini cluster is not supported");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void waitForDataNodeToStart(ServerName serverName, long timeout) throws IOException {
|
||||
LOG.warn("Waiting for datanodes to start on mini cluster is not supported");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void waitForDataNodeToStop(ServerName serverName, long timeout) throws IOException {
|
||||
LOG.warn("Waiting for datanodes to stop on mini cluster is not supported");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void startMaster(String hostname, int port) throws IOException {
|
||||
this.startMaster();
|
||||
|
|
Loading…
Reference in New Issue