diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/ClusterManager.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/ClusterManager.java index 2d4627938b9..db5f8d85aa2 100644 --- a/hbase-it/src/test/java/org/apache/hadoop/hbase/ClusterManager.java +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/ClusterManager.java @@ -39,6 +39,7 @@ interface ClusterManager extends Configurable { HADOOP_DATANODE("datanode"), HADOOP_JOBTRACKER("jobtracker"), HADOOP_TASKTRACKER("tasktracker"), + ZOOKEEPER_SERVER("QuorumPeerMain"), HBASE_MASTER("master"), HBASE_REGIONSERVER("regionserver"); diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java index 6e7cd33e615..07ca5ec8d88 100644 --- a/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java @@ -114,16 +114,14 @@ public class DistributedHBaseCluster extends HBaseCluster { public void killRegionServer(ServerName serverName) throws IOException { LOG.info("Aborting RS: " + serverName.getServerName()); clusterManager.kill(ServiceType.HBASE_REGIONSERVER, - serverName.getHostname(), - serverName.getPort()); + serverName.getHostname(), serverName.getPort()); } @Override public void stopRegionServer(ServerName serverName) throws IOException { LOG.info("Stopping RS: " + serverName.getServerName()); clusterManager.stop(ServiceType.HBASE_REGIONSERVER, - serverName.getHostname(), - serverName.getPort()); + serverName.getHostname(), serverName.getPort()); } @Override @@ -131,20 +129,96 @@ public class DistributedHBaseCluster extends HBaseCluster { waitForServiceToStop(ServiceType.HBASE_REGIONSERVER, serverName, timeout); } + @Override + public void startZkNode(String hostname, int port) throws IOException { + LOG.info("Starting Zookeeper node on: " + hostname); + clusterManager.start(ServiceType.ZOOKEEPER_SERVER, hostname, port); + } + + @Override + public void killZkNode(ServerName serverName) throws IOException { + LOG.info("Aborting Zookeeper node on: " + serverName.getServerName()); + clusterManager.kill(ServiceType.ZOOKEEPER_SERVER, + serverName.getHostname(), serverName.getPort()); + } + + @Override + public void stopZkNode(ServerName serverName) throws IOException { + LOG.info("Stopping Zookeeper node: " + serverName.getServerName()); + clusterManager.stop(ServiceType.ZOOKEEPER_SERVER, + serverName.getHostname(), serverName.getPort()); + } + + @Override + public void waitForZkNodeToStart(ServerName serverName, long timeout) throws IOException { + waitForServiceToStart(ServiceType.ZOOKEEPER_SERVER, serverName, timeout); + } + + @Override + public void waitForZkNodeToStop(ServerName serverName, long timeout) throws IOException { + waitForServiceToStop(ServiceType.ZOOKEEPER_SERVER, serverName, timeout); + } + + @Override + public void startDataNode(ServerName serverName) throws IOException { + LOG.info("Starting data node on: " + serverName.getServerName()); + clusterManager.start(ServiceType.HADOOP_DATANODE, + serverName.getHostname(), serverName.getPort()); + } + + @Override + public void killDataNode(ServerName serverName) throws IOException { + LOG.info("Aborting data node on: " + serverName.getServerName()); + clusterManager.kill(ServiceType.HADOOP_DATANODE, + serverName.getHostname(), serverName.getPort()); + } + + @Override + public void stopDataNode(ServerName serverName) throws IOException { + LOG.info("Stopping data node on: " + 
serverName.getServerName()); + clusterManager.stop(ServiceType.HADOOP_DATANODE, + serverName.getHostname(), serverName.getPort()); + } + + @Override + public void waitForDataNodeToStart(ServerName serverName, long timeout) throws IOException { + waitForServiceToStart(ServiceType.HADOOP_DATANODE, serverName, timeout); + } + + @Override + public void waitForDataNodeToStop(ServerName serverName, long timeout) throws IOException { + waitForServiceToStop(ServiceType.HADOOP_DATANODE, serverName, timeout); + } + private void waitForServiceToStop(ServiceType service, ServerName serverName, long timeout) throws IOException { - LOG.info("Waiting service:" + service + " to stop: " + serverName.getServerName()); + LOG.info("Waiting for service: " + service + " to stop: " + serverName.getServerName()); long start = System.currentTimeMillis(); while ((System.currentTimeMillis() - start) < timeout) { if (!clusterManager.isRunning(service, serverName.getHostname(), serverName.getPort())) { return; } - Threads.sleep(1000); + Threads.sleep(100); } throw new IOException("did timeout waiting for service to stop:" + serverName); } + private void waitForServiceToStart(ServiceType service, ServerName serverName, long timeout) + throws IOException { + LOG.info("Waiting for service: " + service + " to start: " + serverName.getServerName()); + long start = System.currentTimeMillis(); + + while ((System.currentTimeMillis() - start) < timeout) { + if (clusterManager.isRunning(service, serverName.getHostname(), serverName.getPort())) { + return; + } + Threads.sleep(100); + } + throw new IOException("did timeout waiting for service to start:" + serverName); + } + + @Override public MasterService.BlockingInterface getMasterAdminService() throws IOException { diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java index 8bdb5d6014a..c49ae44219e 100644 --- a/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java @@ -54,9 +54,11 @@ public class HBaseClusterManager extends Configured implements ClusterManager { /** * The command format that is used to execute the remote command. Arguments: - * 1 SSH options, 2 user name , 3 "@" if username is set, 4 host, 5 original command. + * 1 SSH options, 2 user name , 3 "@" if username is set, 4 host, + * 5 original command, 6 service user. 
*/ - private static final String DEFAULT_TUNNEL_CMD = "/usr/bin/ssh %1$s %2$s%3$s%4$s \"%5$s\""; + private static final String DEFAULT_TUNNEL_CMD = + "/usr/bin/ssh %1$s %2$s%3$s%4$s \"sudo -u %6$s %5$s\""; private String tunnelCmd; private static final String RETRY_ATTEMPTS_KEY = "hbase.it.clustermanager.retry.attempts"; @@ -93,11 +95,24 @@ public class HBaseClusterManager extends Configured implements ClusterManager { .setSleepInterval(conf.getLong(RETRY_SLEEP_INTERVAL_KEY, DEFAULT_RETRY_SLEEP_INTERVAL))); } + private String getServiceUser(ServiceType service) { + Configuration conf = getConf(); + switch (service) { + case HADOOP_DATANODE: + return conf.get("hbase.it.clustermanager.hadoop.hdfs.user", "hdfs"); + case ZOOKEEPER_SERVER: + return conf.get("hbase.it.clustermanager.zookeeper.user", "zookeeper"); + default: + return conf.get("hbase.it.clustermanager.hbase.user", "hbase"); + } + } + /** * Executes commands over SSH */ protected class RemoteShell extends Shell.ShellCommandExecutor { private String hostname; + private String user; public RemoteShell(String hostname, String[] execString, File dir, Map<String, String> env, long timeout) { @@ -120,11 +135,17 @@ public class HBaseClusterManager extends Configured implements ClusterManager { this.hostname = hostname; } + public RemoteShell(String hostname, String user, String[] execString) { + super(execString); + this.hostname = hostname; + this.user = user; + } + @Override public String[] getExecString() { String at = sshUserName.isEmpty() ? "" : "@"; String remoteCmd = StringUtils.join(super.getExecString(), " "); - String cmd = String.format(tunnelCmd, sshOptions, sshUserName, at, hostname, remoteCmd); + String cmd = String.format(tunnelCmd, sshOptions, sshUserName, at, hostname, remoteCmd, user); LOG.info("Executing full command [" + cmd + "]"); return new String[] { "/usr/bin/env", "bash", "-c", cmd }; } @@ -188,13 +209,83 @@ public class HBaseClusterManager extends Configured implements ClusterManager { } } + /** + * CommandProvider to manage the service using sbin/hadoop-* scripts. + */ + static class HadoopShellCommandProvider extends CommandProvider { + private final String hadoopHome; + private final String confDir; + + HadoopShellCommandProvider(Configuration conf) throws IOException { + hadoopHome = conf.get("hbase.it.clustermanager.hadoop.home", + System.getenv("HADOOP_HOME")); + String tmp = conf.get("hbase.it.clustermanager.hadoop.conf.dir", + System.getenv("HADOOP_CONF_DIR")); + if (hadoopHome == null) { + throw new IOException("Hadoop home configuration parameter, i.e. " + + "'hbase.it.clustermanager.hadoop.home', is not configured properly."); + } + if (tmp != null) { + confDir = String.format("--config %s", tmp); + } else { + confDir = ""; + } + } + + @Override + public String getCommand(ServiceType service, Operation op) { + return String.format("%s/sbin/hadoop-daemon.sh %s %s %s", hadoopHome, confDir, + op.toString().toLowerCase(), service); + } + } + + /** + * CommandProvider to manage the service using bin/zk* scripts. + */ + static class ZookeeperShellCommandProvider extends CommandProvider { + private final String zookeeperHome; + private final String confDir; + + ZookeeperShellCommandProvider(Configuration conf) throws IOException { + zookeeperHome = conf.get("hbase.it.clustermanager.zookeeper.home", + System.getenv("ZOOBINDIR")); + String tmp = conf.get("hbase.it.clustermanager.zookeeper.conf.dir", + System.getenv("ZOOCFGDIR")); + if (zookeeperHome == null) { + throw new IOException("Zookeeper home configuration parameter, i.e. "
" + + "'hbase.it.clustermanager.zookeeper.home', is not configured properly."); + } + if (tmp != null) { + confDir = String.format("--config %s", tmp); + } else { + confDir = ""; + } + } + + @Override + public String getCommand(ServiceType service, Operation op) { + return String.format("%s/bin/zkServer.sh %s", zookeeperHome, op.toString().toLowerCase()); + } + + @Override + protected String findPidCommand(ServiceType service) { + return String.format("ps aux | grep %s | grep -v grep | tr -s ' ' | cut -d ' ' -f2", + service); + } + } + public HBaseClusterManager() { } - protected CommandProvider getCommandProvider(ServiceType service) { - //TODO: make it pluggable, or auto-detect the best command provider, should work with - //hadoop daemons as well - return new HBaseShellCommandProvider(getConf()); + protected CommandProvider getCommandProvider(ServiceType service) throws IOException { + switch (service) { + case HADOOP_DATANODE: + return new HadoopShellCommandProvider(getConf()); + case ZOOKEEPER_SERVER: + return new ZookeeperShellCommandProvider(getConf()); + default: + return new HBaseShellCommandProvider(getConf()); + } } /** @@ -202,10 +293,11 @@ public class HBaseClusterManager extends Configured implements ClusterManager { * @return pair of exit code and command output * @throws IOException if something goes wrong. */ - private Pair<Integer, String> exec(String hostname, String... cmd) throws IOException { + private Pair<Integer, String> exec(String hostname, ServiceType service, String... cmd) + throws IOException { LOG.info("Executing remote command: " + StringUtils.join(cmd, " ") + " , hostname:" + hostname); - RemoteShell shell = new RemoteShell(hostname, cmd); + RemoteShell shell = new RemoteShell(hostname, getServiceUser(service), cmd); try { shell.execute(); } catch (Shell.ExitCodeException ex) { @@ -222,12 +314,12 @@ public class HBaseClusterManager extends Configured implements ClusterManager { return new Pair<Integer, String>(shell.getExitCode(), shell.getOutput()); } - private Pair<Integer, String> execWithRetries(String hostname, String... cmd) + private Pair<Integer, String> execWithRetries(String hostname, ServiceType service, String... cmd)
throws IOException { RetryCounter retryCounter = retryCounterFactory.create(); while (true) { try { - return exec(hostname, cmd); + return exec(hostname, service, cmd); } catch (IOException e) { retryOrThrow(retryCounter, e, hostname, cmd); } @@ -252,7 +344,7 @@ public class HBaseClusterManager extends Configured implements ClusterManager { } private void exec(String hostname, ServiceType service, Operation op) throws IOException { - execWithRetries(hostname, getCommandProvider(service).getCommand(service, op)); + execWithRetries(hostname, service, getCommandProvider(service).getCommand(service, op)); } @Override @@ -271,13 +363,13 @@ public class HBaseClusterManager extends Configured implements ClusterManager { } public void signal(ServiceType service, String signal, String hostname) throws IOException { - execWithRetries(hostname, getCommandProvider(service).signalCommand(service, signal)); + execWithRetries(hostname, service, getCommandProvider(service).signalCommand(service, signal)); } @Override public boolean isRunning(ServiceType service, String hostname, int port) throws IOException { - String ret = execWithRetries(hostname, getCommandProvider(service).isRunningCommand(service)) - .getSecond(); + String ret = execWithRetries(hostname, service, + getCommandProvider(service).isRunningCommand(service)).getSecond(); return ret.length() > 0; }
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/RESTApiClusterManager.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/RESTApiClusterManager.java index 9ea126a0af9..717de1764be 100644 --- a/hbase-it/src/test/java/org/apache/hadoop/hbase/RESTApiClusterManager.java +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/RESTApiClusterManager.java @@ -321,6 +321,7 @@ public class RESTApiClusterManager extends Configured implements ClusterManager // The RoleCommand enum is used by the doRoleCommand method to guard against non-existent methods // being invoked on a given role. + // TODO: Integrate zookeeper and hdfs related failure injections (Ref: HBASE-14261).
private enum RoleCommand { START, STOP, RESTART; diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java index a3afccdd97b..d632ce5bb83 100644 --- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java @@ -27,6 +27,7 @@ import java.util.List; import org.apache.commons.lang.math.RandomUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.ClusterStatus; import org.apache.hadoop.hbase.HBaseCluster; import org.apache.hadoop.hbase.HRegionInfo; @@ -35,7 +36,6 @@ import org.apache.hadoop.hbase.ServerLoad; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey; import org.apache.hadoop.hbase.client.Admin; -import org.apache.hadoop.hbase.client.HBaseAdmin; import org.apache.hadoop.hbase.util.Bytes; /** @@ -44,11 +44,19 @@ import org.apache.hadoop.hbase.util.Bytes; public class Action { public static final String KILL_MASTER_TIMEOUT_KEY = - "hbase.chaosmonkey.action.killmastertimeout"; + "hbase.chaosmonkey.action.killmastertimeout"; public static final String START_MASTER_TIMEOUT_KEY = - "hbase.chaosmonkey.action.startmastertimeout"; + "hbase.chaosmonkey.action.startmastertimeout"; public static final String KILL_RS_TIMEOUT_KEY = "hbase.chaosmonkey.action.killrstimeout"; public static final String START_RS_TIMEOUT_KEY = "hbase.chaosmonkey.action.startrstimeout"; + public static final String KILL_ZK_NODE_TIMEOUT_KEY = + "hbase.chaosmonkey.action.killzknodetimeout"; + public static final String START_ZK_NODE_TIMEOUT_KEY = + "hbase.chaosmonkey.action.startzknodetimeout"; + public static final String KILL_DATANODE_TIMEOUT_KEY = + "hbase.chaosmonkey.action.killdatanodetimeout"; + public static final String START_DATANODE_TIMEOUT_KEY = + "hbase.chaosmonkey.action.startdatanodetimeout"; protected static final Log LOG = LogFactory.getLog(Action.class); @@ -56,6 +64,10 @@ public class Action { protected static final long START_MASTER_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT; protected static final long KILL_RS_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT; protected static final long START_RS_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT; + protected static final long KILL_ZK_NODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT; + protected static final long START_ZK_NODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT; + protected static final long KILL_DATANODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT; + protected static final long START_DATANODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT; protected ActionContext context; protected HBaseCluster cluster; @@ -66,6 +78,10 @@ public class Action { protected long startMasterTimeout; protected long killRsTimeout; protected long startRsTimeout; + protected long killZkNodeTimeout; + protected long startZkNodeTimeout; + protected long killDataNodeTimeout; + protected long startDataNodeTimeout; public void init(ActionContext context) throws IOException { this.context = context; @@ -75,11 +91,19 @@ public class Action { initialServers = regionServers.toArray(new ServerName[regionServers.size()]); killMasterTimeout = cluster.getConf().getLong(KILL_MASTER_TIMEOUT_KEY, - KILL_MASTER_TIMEOUT_DEFAULT); + KILL_MASTER_TIMEOUT_DEFAULT); startMasterTimeout = 
cluster.getConf().getLong(START_MASTER_TIMEOUT_KEY, - START_MASTER_TIMEOUT_DEFAULT); + START_MASTER_TIMEOUT_DEFAULT); killRsTimeout = cluster.getConf().getLong(KILL_RS_TIMEOUT_KEY, KILL_RS_TIMEOUT_DEFAULT); startRsTimeout = cluster.getConf().getLong(START_RS_TIMEOUT_KEY, START_RS_TIMEOUT_DEFAULT); + killZkNodeTimeout = cluster.getConf().getLong(KILL_ZK_NODE_TIMEOUT_KEY, + KILL_ZK_NODE_TIMEOUT_DEFAULT); + startZkNodeTimeout = cluster.getConf().getLong(START_ZK_NODE_TIMEOUT_KEY, + START_ZK_NODE_TIMEOUT_DEFAULT); + killDataNodeTimeout = cluster.getConf().getLong(KILL_DATANODE_TIMEOUT_KEY, + KILL_DATANODE_TIMEOUT_DEFAULT); + startDataNodeTimeout = cluster.getConf().getLong(START_DATANODE_TIMEOUT_KEY, + START_DATANODE_TIMEOUT_DEFAULT); } public void perform() throws Exception { } @@ -132,7 +156,37 @@ public class Action { cluster.startRegionServer(server.getHostname(), server.getPort()); cluster.waitForRegionServerToStart(server.getHostname(), server.getPort(), startRsTimeout); LOG.info("Started region server:" + server + ". Reported num of rs:" - + cluster.getClusterStatus().getServersSize()); + + cluster.getClusterStatus().getServersSize()); + } + + protected void killZKNode(ServerName server) throws IOException { + LOG.info("Killing zookeeper node:" + server); + cluster.killZkNode(server); + cluster.waitForZkNodeToStop(server, killZkNodeTimeout); + LOG.info("Killed zookeeper node:" + server + ". Reported num of rs:" + + cluster.getClusterStatus().getServersSize()); + } + + protected void startZKNode(ServerName server) throws IOException { + LOG.info("Starting zookeeper node:" + server.getHostname()); + cluster.startZkNode(server.getHostname(), server.getPort()); + cluster.waitForZkNodeToStart(server, startZkNodeTimeout); + LOG.info("Started zookeeper node:" + server); + } + + protected void killDataNode(ServerName server) throws IOException { + LOG.info("Killing datanode:" + server); + cluster.killDataNode(server); + cluster.waitForDataNodeToStop(server, killDataNodeTimeout); + LOG.info("Killed datanode:" + server + ". 
Reported num of rs:" + + cluster.getClusterStatus().getServersSize()); + } + + protected void startDataNode(ServerName server) throws IOException { + LOG.info("Starting datanode:" + server.getHostname()); + cluster.startDataNode(server); + cluster.waitForDataNodeToStart(server, startDataNodeTimeout); + LOG.info("Started datanode:" + server); } protected void unbalanceRegions(ClusterStatus clusterStatus, @@ -174,6 +228,10 @@ public class Action { } } + public Configuration getConf() { + return cluster.getConf(); + } + /** * Context for Action's */ diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActionBaseAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActionBaseAction.java index 8795352fa60..3f209da945f 100644 --- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActionBaseAction.java +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActionBaseAction.java @@ -51,4 +51,18 @@ public class RestartActionBaseAction extends Action { sleep(sleepTime); startRs(server); } + + void restartZKNode(ServerName server, long sleepTime) throws IOException { + sleepTime = Math.max(sleepTime, 1000); + killZKNode(server); + sleep(sleepTime); + startZKNode(server); + } + + void restartDataNode(ServerName server, long sleepTime) throws IOException { + sleepTime = Math.max(sleepTime, 1000); + killDataNode(server); + sleep(sleepTime); + startDataNode(server); + } } diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartRandomDataNodeAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartRandomDataNodeAction.java new file mode 100644 index 00000000000..7299e79ed16 --- /dev/null +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartRandomDataNodeAction.java @@ -0,0 +1,58 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hbase.chaos.actions; + +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey; +import org.apache.hadoop.hbase.util.FSUtils; +import org.apache.hadoop.hdfs.DFSClient; +import org.apache.hadoop.hdfs.DistributedFileSystem; +import org.apache.hadoop.hdfs.protocol.DatanodeInfo; +import org.apache.hadoop.hdfs.protocol.HdfsConstants; + +import java.io.IOException; +import java.util.LinkedList; +import java.util.List; + +/** + * Action that restarts a random datanode. + */ +public class RestartRandomDataNodeAction extends RestartActionBaseAction { + public RestartRandomDataNodeAction(long sleepTime) { + super(sleepTime); + } + + @Override + public void perform() throws Exception { + LOG.info("Performing action: Restart random data node"); + ServerName server = PolicyBasedChaosMonkey.selectRandomItem(getDataNodes()); + restartDataNode(server, sleepTime); + } + + public ServerName[] getDataNodes() throws IOException { + DistributedFileSystem fs = (DistributedFileSystem) FSUtils.getRootDir(getConf()) + .getFileSystem(getConf()); + DFSClient dfsClient = fs.getClient(); + List<ServerName> hosts = new LinkedList<ServerName>(); + for (DatanodeInfo dataNode: dfsClient.datanodeReport(HdfsConstants.DatanodeReportType.LIVE)) { + hosts.add(ServerName.valueOf(dataNode.getHostName(), -1, -1)); + } + return hosts.toArray(new ServerName[hosts.size()]); + } +}
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartRandomZKNodeAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartRandomZKNodeAction.java new file mode 100644 index 00000000000..6043acde1ba --- /dev/null +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartRandomZKNodeAction.java @@ -0,0 +1,40 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hbase.chaos.actions; + +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey; +import org.apache.hadoop.hbase.zookeeper.ZKServerTool; + +/** + * Action that restarts a random zookeeper node. + */ +public class RestartRandomZKNodeAction extends RestartActionBaseAction { + public RestartRandomZKNodeAction(long sleepTime) { + super(sleepTime); + } + + @Override + public void perform() throws Exception { + LOG.info("Performing action: Restart random zookeeper node"); + ServerName server = PolicyBasedChaosMonkey.selectRandomItem( + ZKServerTool.readZKNodes(getConf())); + restartZKNode(server, sleepTime); + } +} \ No newline at end of file
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/MonkeyFactory.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/MonkeyFactory.java index 2f652512eb7..0cf33184c5e 100644 --- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/MonkeyFactory.java +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/MonkeyFactory.java @@ -70,12 +70,14 @@ public abstract class MonkeyFactory { public static final String STRESS_AM = "stressAM"; public static final String NO_KILL = "noKill"; public static final String MASTER_KILLING = "masterKilling"; + public static final String SERVER_AND_DEPENDENCIES_KILLING = "serverAndDependenciesKilling"; public static Map<String, MonkeyFactory> FACTORIES = ImmutableMap.<String, MonkeyFactory>builder() .put(CALM, new CalmMonkeyFactory()) .put(SLOW_DETERMINISTIC, new SlowDeterministicMonkeyFactory()) .put(UNBALANCE, new UnbalanceMonkeyFactory()) .put(SERVER_KILLING, new ServerKillingMonkeyFactory()) + .put(SERVER_AND_DEPENDENCIES_KILLING, new ServerAndDependenciesKillingMonkeyFactory()) .put(STRESS_AM, new StressAssignmentManagerMonkeyFactory()) .put(NO_KILL, new NoKillMonkeyFactory()) .put(MASTER_KILLING, new MasterKillingMonkeyFactory())
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerAndDependenciesKillingMonkeyFactory.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerAndDependenciesKillingMonkeyFactory.java new file mode 100644 index 00000000000..4faa786bcd2 --- /dev/null +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerAndDependenciesKillingMonkeyFactory.java @@ -0,0 +1,65 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +package org.apache.hadoop.hbase.chaos.factories; + +import org.apache.hadoop.hbase.chaos.actions.Action; +import org.apache.hadoop.hbase.chaos.actions.DumpClusterStatusAction; +import org.apache.hadoop.hbase.chaos.actions.ForceBalancerAction; +import org.apache.hadoop.hbase.chaos.actions.RestartActiveMasterAction; +import org.apache.hadoop.hbase.chaos.actions.RestartRandomDataNodeAction; +import org.apache.hadoop.hbase.chaos.actions.RestartRandomRsExceptMetaAction; +import org.apache.hadoop.hbase.chaos.actions.RestartRandomZKNodeAction; +import org.apache.hadoop.hbase.chaos.actions.RollingBatchRestartRsExceptMetaAction; +import org.apache.hadoop.hbase.chaos.monkies.ChaosMonkey; +import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey; +import org.apache.hadoop.hbase.chaos.policies.CompositeSequentialPolicy; +import org.apache.hadoop.hbase.chaos.policies.DoActionsOncePolicy; +import org.apache.hadoop.hbase.chaos.policies.PeriodicRandomActionPolicy; + +/** + * Creates ChaosMonkeys for doing server restart actions, including zookeeper node and + * datanode restarts, but not flush / compact / snapshot kind of actions. + */ +public class ServerAndDependenciesKillingMonkeyFactory extends MonkeyFactory { + + @Override + public ChaosMonkey build() { + + // Destructive actions to mess things around. Cannot run batch restart. + Action[] actions1 = new Action[]{ + new RestartRandomRsExceptMetaAction(60000), + new RestartActiveMasterAction(5000), + new RollingBatchRestartRsExceptMetaAction(5000, 1.0f, 2), // only allow 2 servers to be dead. + new ForceBalancerAction(), + new RestartRandomDataNodeAction(60000), + new RestartRandomZKNodeAction(60000) + }; + + // Action to log more info for debugging + Action[] actions2 = new Action[]{ + new DumpClusterStatusAction() + }; + + return new PolicyBasedChaosMonkey(util, + new CompositeSequentialPolicy( + new DoActionsOncePolicy(60 * 1000, actions1), + new PeriodicRandomActionPolicy(60 * 1000, actions1)), + new PeriodicRandomActionPolicy(60 * 1000, actions2)); + } +}
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKServerTool.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKServerTool.java index 73483da70aa..f16c020e423 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKServerTool.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKServerTool.java @@ -19,9 +19,13 @@ package org.apache.hadoop.hbase.zookeeper; +import java.util.LinkedList; +import java.util.List; import java.util.Properties; import java.util.Map.Entry; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.classification.InterfaceAudience; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.HBaseConfiguration; @@ -33,12 +37,10 @@ import org.apache.hadoop.hbase.HBaseInterfaceAudience; */ @InterfaceAudience.LimitedPrivate(HBaseInterfaceAudience.TOOLS) public class ZKServerTool { - /** - * Run the tool. - * @param args Command line arguments. - */ - public static void main(String args[]) { - Configuration conf = HBaseConfiguration.create(); + + public static ServerName[] readZKNodes(Configuration conf) { + List<ServerName> hosts = new LinkedList<ServerName>(); + + // Note that we do not simply grab the property // HConstants.ZOOKEEPER_QUORUM from the HBaseConfiguration because the // user may be using a zoo.cfg file.
@@ -49,8 +51,24 @@ public class ZKServerTool { if (key.startsWith("server.")) { String[] parts = value.split(":"); String host = parts[0]; - System.out.println("ZK host:" + host); + + int port = HConstants.DEFAULT_ZOOKEPER_CLIENT_PORT; + if (parts.length > 1) { + port = Integer.parseInt(parts[1]); + } + hosts.add(ServerName.valueOf(host, port, -1)); } } + return hosts.toArray(new ServerName[hosts.size()]); + } + + /** + * Run the tool. + * @param args Command line arguments. + */ + public static void main(String args[]) { + for (ServerName server: readZKNodes(HBaseConfiguration.create())) { + System.out.println("Zk host: " + server.getHostname()); + } } }
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java index e6f181be75b..ce1da03947e 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java @@ -163,6 +163,77 @@ public abstract class HBaseCluster implements Closeable, Configurable { public abstract void waitForRegionServerToStop(ServerName serverName, long timeout) throws IOException; + /** + * Starts a new zookeeper node on the given hostname or if this is a mini/local cluster, + * silently logs warning message. + * @param hostname the hostname to start the zookeeper node on + * @throws IOException if something goes wrong + */ + public abstract void startZkNode(String hostname, int port) throws IOException; + + /** + * Kills the zookeeper node process if this is a distributed cluster, otherwise, + * silently logs warning message. + * @throws IOException if something goes wrong + */ + public abstract void killZkNode(ServerName serverName) throws IOException; + + /** + * Stops the zookeeper node if this is a distributed cluster, otherwise + * silently logs warning message. + * @throws IOException if something goes wrong + */ + public abstract void stopZkNode(ServerName serverName) throws IOException; + + /** + * Wait for the specified zookeeper node to join the cluster + * @throws IOException if something goes wrong or timeout occurs + */ + public abstract void waitForZkNodeToStart(ServerName serverName, long timeout) + throws IOException; + + /** + * Wait for the specified zookeeper node to stop the thread / process. + * @throws IOException if something goes wrong or timeout occurs + */ + public abstract void waitForZkNodeToStop(ServerName serverName, long timeout) + throws IOException; + + /** + * Starts a new datanode on the given hostname or if this is a mini/local cluster, + * silently logs warning message. + * @throws IOException if something goes wrong + */ + public abstract void startDataNode(ServerName serverName) throws IOException; + + /** + * Kills the datanode process if this is a distributed cluster, otherwise, + * silently logs warning message. + * @throws IOException if something goes wrong + */ + public abstract void killDataNode(ServerName serverName) throws IOException; + + /** + * Stops the datanode if this is a distributed cluster, otherwise + * silently logs warning message.
+ * @throws IOException if something goes wrong + */ + public abstract void stopDataNode(ServerName serverName) throws IOException; + + /** + * Wait for the specified datanode to join the cluster + * @throws IOException if something goes wrong or timeout occurs + */ + public abstract void waitForDataNodeToStart(ServerName serverName, long timeout) + throws IOException; + + /** + * Wait for the specified datanode to stop the thread / process. + * @throws IOException if something goes wrong or timeout occurs + */ + public abstract void waitForDataNodeToStop(ServerName serverName, long timeout) + throws IOException; + /** * Starts a new master on the given hostname or if this is a mini/local cluster, * starts a master locally.
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java index 38d92d33977..c2d273bbac9 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java @@ -260,6 +260,56 @@ public class MiniHBaseCluster extends HBaseCluster { waitOnRegionServer(getRegionServerIndex(serverName)); } + @Override + public void startZkNode(String hostname, int port) throws IOException { + LOG.warn("Starting zookeeper nodes on mini cluster is not supported"); + } + + @Override + public void killZkNode(ServerName serverName) throws IOException { + LOG.warn("Aborting zookeeper nodes on mini cluster is not supported"); + } + + @Override + public void stopZkNode(ServerName serverName) throws IOException { + LOG.warn("Stopping zookeeper nodes on mini cluster is not supported"); + } + + @Override + public void waitForZkNodeToStart(ServerName serverName, long timeout) throws IOException { + LOG.warn("Waiting for zookeeper nodes to start on mini cluster is not supported"); + } + + @Override + public void waitForZkNodeToStop(ServerName serverName, long timeout) throws IOException { + LOG.warn("Waiting for zookeeper nodes to stop on mini cluster is not supported"); + } + + @Override + public void startDataNode(ServerName serverName) throws IOException { + LOG.warn("Starting datanodes on mini cluster is not supported"); + } + + @Override + public void killDataNode(ServerName serverName) throws IOException { + LOG.warn("Aborting datanodes on mini cluster is not supported"); + } + + @Override + public void stopDataNode(ServerName serverName) throws IOException { + LOG.warn("Stopping datanodes on mini cluster is not supported"); + } + + @Override + public void waitForDataNodeToStart(ServerName serverName, long timeout) throws IOException { + LOG.warn("Waiting for datanodes to start on mini cluster is not supported"); + } + + @Override + public void waitForDataNodeToStop(ServerName serverName, long timeout) throws IOException { + LOG.warn("Waiting for datanodes to stop on mini cluster is not supported"); + } + @Override public void startMaster(String hostname, int port) throws IOException { this.startMaster();
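
Note for reviewers: a few self-contained sketches follow. This first one traces how the patched DEFAULT_TUNNEL_CMD expands once RemoteShell.getExecString() fills the new %6$s service-user slot. The SSH options, login user, and host below are made-up example values, not defaults shipped with this patch.

```java
public class TunnelCmdDemo {
  public static void main(String[] args) {
    // Format string copied from the patched HBaseClusterManager.
    String tunnelCmd = "/usr/bin/ssh %1$s %2$s%3$s%4$s \"sudo -u %6$s %5$s\"";
    String sshOptions = "-o StrictHostKeyChecking=no";  // hypothetical
    String sshUserName = "ops";                         // hypothetical
    String at = sshUserName.isEmpty() ? "" : "@";
    String hostname = "dn1.example.com";                // hypothetical
    String remoteCmd = "/usr/lib/hadoop/sbin/hadoop-daemon.sh stop datanode";
    String serviceUser = "hdfs";  // getServiceUser(HADOOP_DATANODE) default
    System.out.println(
        String.format(tunnelCmd, sshOptions, sshUserName, at, hostname, remoteCmd, serviceUser));
    // Prints: /usr/bin/ssh -o StrictHostKeyChecking=no ops@dn1.example.com
    //         "sudo -u hdfs /usr/lib/hadoop/sbin/hadoop-daemon.sh stop datanode"
  }
}
```

One operational consequence worth checking on real clusters: the SSH login user now needs passwordless sudo to the per-service users (hdfs, zookeeper, and hbase by default, overridable via the three hbase.it.clustermanager.*.user keys).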
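Second, the quorum parsing in the refactored ZKServerTool.readZKNodes(). Below is a standalone replica of the loop, run against a hand-built Properties object instead of the zoo.cfg/quorum properties (the server.N values are hypothetical). Note that for a host:peerPort:leaderPort value, parts[1] is the quorum peer port rather than the client port; the client-port default only applies when no port is given. The chaos actions only dereference the hostname, so this is informational.

```java
import java.util.LinkedList;
import java.util.List;
import java.util.Map.Entry;
import java.util.Properties;

public class ReadZkNodesDemo {
  public static void main(String[] args) {
    Properties zkProps = new Properties();
    zkProps.setProperty("server.0", "zk1.example.com:2888:3888"); // hypothetical
    zkProps.setProperty("server.1", "zk2.example.com");           // hypothetical
    zkProps.setProperty("clientPort", "2181");                    // skipped by the loop

    List<String> hosts = new LinkedList<String>();
    for (Entry<Object, Object> entry : zkProps.entrySet()) {
      String key = entry.getKey().toString().trim();
      String value = entry.getValue().toString().trim();
      if (key.startsWith("server.")) {
        String[] parts = value.split(":");
        // 2181 stands in for HConstants.DEFAULT_ZOOKEPER_CLIENT_PORT here.
        int port = parts.length > 1 ? Integer.parseInt(parts[1]) : 2181;
        hosts.add(parts[0] + ":" + port);
      }
    }
    // Order may vary; e.g. [zk1.example.com:2888, zk2.example.com:2181]
    System.out.println(hosts);
  }
}
```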
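Finally, a sketch of wiring the new monkey up by its registered name, roughly how an integration-test driver would. MonkeyFactory.FACTORIES and the constant come from this diff; initializeCluster(), setUtil(), and the ChaosMonkey start/stop/waitForStop calls are from the surrounding chaos-monkey API as I recall it, so treat those as assumptions to verify. Since zookeeper and datanode kills degrade to warnings on a mini cluster (see the MiniHBaseCluster overrides above), the new actions are only meaningful against a distributed cluster.

```java
import org.apache.hadoop.hbase.IntegrationTestingUtility;
import org.apache.hadoop.hbase.chaos.factories.MonkeyFactory;
import org.apache.hadoop.hbase.chaos.monkies.ChaosMonkey;

public class ServerAndDependenciesKillingDemo {
  public static void main(String[] args) throws Exception {
    IntegrationTestingUtility util = new IntegrationTestingUtility();
    util.initializeCluster(3);  // assumes a distributed cluster is configured
    ChaosMonkey monkey = MonkeyFactory.FACTORIES
        .get(MonkeyFactory.SERVER_AND_DEPENDENCIES_KILLING)
        .setUtil(util)
        .build();
    monkey.start();
    Thread.sleep(5 * 60 * 1000);  // let the destructive policies run for a while
    monkey.stop("test done");
    monkey.waitForStop();
  }
}
```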