diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java index ce9ca70efac..b477f76d133 100644 --- a/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java @@ -190,6 +190,37 @@ public class DistributedHBaseCluster extends HBaseCluster { waitForServiceToStop(ServiceType.HADOOP_DATANODE, serverName, timeout); } + @Override + public void startNameNode(ServerName serverName) throws IOException { + LOG.info("Starting name node on: " + serverName.getServerName()); + clusterManager.start(ServiceType.HADOOP_NAMENODE, serverName.getHostname(), + serverName.getPort()); + } + + @Override + public void killNameNode(ServerName serverName) throws IOException { + LOG.info("Aborting name node on: " + serverName.getServerName()); + clusterManager.kill(ServiceType.HADOOP_NAMENODE, serverName.getHostname(), + serverName.getPort()); + } + + @Override + public void stopNameNode(ServerName serverName) throws IOException { + LOG.info("Stopping name node on: " + serverName.getServerName()); + clusterManager.stop(ServiceType.HADOOP_NAMENODE, serverName.getHostname(), + serverName.getPort()); + } + + @Override + public void waitForNameNodeToStart(ServerName serverName, long timeout) throws IOException { + waitForServiceToStart(ServiceType.HADOOP_NAMENODE, serverName, timeout); + } + + @Override + public void waitForNameNodeToStop(ServerName serverName, long timeout) throws IOException { + waitForServiceToStop(ServiceType.HADOOP_NAMENODE, serverName, timeout); + } + private void waitForServiceToStop(ServiceType service, ServerName serverName, long timeout) throws IOException { LOG.info("Waiting for service: " + service + " to stop: " + serverName.getServerName()); diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java 
b/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java index a3cd73bdf77..509940a4dd4 100644 --- a/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java @@ -101,6 +101,7 @@ public class HBaseClusterManager extends Configured implements ClusterManager { Configuration conf = getConf(); switch (service) { case HADOOP_DATANODE: + case HADOOP_NAMENODE: return conf.get("hbase.it.clustermanager.hadoop.hdfs.user", "hdfs"); case ZOOKEEPER_SERVER: return conf.get("hbase.it.clustermanager.zookeeper.user", "zookeeper"); @@ -282,6 +283,7 @@ public class HBaseClusterManager extends Configured implements ClusterManager { protected CommandProvider getCommandProvider(ServiceType service) throws IOException { switch (service) { case HADOOP_DATANODE: + case HADOOP_NAMENODE: return new HadoopShellCommandProvider(getConf()); case ZOOKEEPER_SERVER: return new ZookeeperShellCommandProvider(getConf()); diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java index fe140e29a76..9ceec69e5b3 100644 --- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java @@ -57,6 +57,10 @@ public class Action { "hbase.chaosmonkey.action.killdatanodetimeout"; public static final String START_DATANODE_TIMEOUT_KEY = "hbase.chaosmonkey.action.startdatanodetimeout"; + public static final String KILL_NAMENODE_TIMEOUT_KEY = + "hbase.chaosmonkey.action.killnamenodetimeout"; + public static final String START_NAMENODE_TIMEOUT_KEY = + "hbase.chaosmonkey.action.startnamenodetimeout"; protected static final Log LOG = LogFactory.getLog(Action.class); @@ -68,6 +72,8 @@ public class Action { protected static final long START_ZK_NODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT; protected static final long 
KILL_DATANODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT; protected static final long START_DATANODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT; + protected static final long KILL_NAMENODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT; + protected static final long START_NAMENODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT; protected ActionContext context; protected HBaseCluster cluster; @@ -82,6 +88,8 @@ public class Action { protected long startZkNodeTimeout; protected long killDataNodeTimeout; protected long startDataNodeTimeout; + protected long killNameNodeTimeout; + protected long startNameNodeTimeout; public void init(ActionContext context) throws IOException { this.context = context; @@ -104,6 +112,10 @@ public class Action { KILL_DATANODE_TIMEOUT_DEFAULT); startDataNodeTimeout = cluster.getConf().getLong(START_DATANODE_TIMEOUT_KEY, START_DATANODE_TIMEOUT_DEFAULT); + killNameNodeTimeout = + cluster.getConf().getLong(KILL_NAMENODE_TIMEOUT_KEY, KILL_NAMENODE_TIMEOUT_DEFAULT); + startNameNodeTimeout = + cluster.getConf().getLong(START_NAMENODE_TIMEOUT_KEY, START_NAMENODE_TIMEOUT_DEFAULT); } public void perform() throws Exception { } @@ -189,6 +201,21 @@ public class Action { LOG.info("Started datanode:" + server); } + protected void killNameNode(ServerName server) throws IOException { + LOG.info("Killing namenode :-" + server.getHostname()); + cluster.killNameNode(server); + cluster.waitForNameNodeToStop(server, killNameNodeTimeout); + LOG.info("Killed namenode:" + server + ". 
Reported num of rs:" + + cluster.getClusterStatus().getServersSize()); + } + + protected void startNameNode(ServerName server) throws IOException { + LOG.info("Starting Namenode :-" + server.getHostname()); + cluster.startNameNode(server); + cluster.waitForNameNodeToStart(server, startNameNodeTimeout); + LOG.info("Started namenode:" + server); + } + protected void unbalanceRegions(ClusterStatus clusterStatus, List fromServers, List toServers, double fractionOfRegions) throws Exception { diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActionBaseAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActionBaseAction.java index 63286cb1f9a..22d7e2618e3 100644 --- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActionBaseAction.java +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActionBaseAction.java @@ -82,4 +82,15 @@ public class RestartActionBaseAction extends Action { sleep(sleepTime); startDataNode(server); } + + void restartNameNode(ServerName server, long sleepTime) throws IOException { + sleepTime = Math.max(sleepTime, 1000); + // Don't try the kill if we're stopping + if (context.isStopping()) { + return; + } + killNameNode(server); + sleep(sleepTime); + startNameNode(server); + } } diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActiveNameNodeAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActiveNameNodeAction.java new file mode 100644 index 00000000000..710ac147786 --- /dev/null +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActiveNameNodeAction.java @@ -0,0 +1,89 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hbase.chaos.actions; + +import java.util.List; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.util.FSUtils; +import org.apache.hadoop.hbase.zookeeper.RecoverableZooKeeper; +import org.apache.hadoop.hbase.zookeeper.ZKUtil; +import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher; +import org.apache.hadoop.hdfs.DFSUtil; +import org.apache.hadoop.hdfs.HAUtil; +import org.apache.hadoop.hdfs.server.namenode.ha.proto.HAZKInfoProtos.ActiveNodeInfo; + +/** + * Action that tries to restart the active namenode. 
+ */ +public class RestartActiveNameNodeAction extends RestartActionBaseAction { + + // Value taken from org.apache.hadoop.ha.ActiveStandbyElector.java, variable :- LOCK_FILENAME + private static final String ACTIVE_NN_LOCK_NAME = "ActiveStandbyElectorLock"; + + // Value taken from org.apache.hadoop.ha.ZKFailoverController.java + // variable :- ZK_PARENT_ZNODE_DEFAULT and ZK_PARENT_ZNODE_KEY + private static final String ZK_PARENT_ZNODE_DEFAULT = "/hadoop-ha"; + private static final String ZK_PARENT_ZNODE_KEY = "ha.zookeeper.parent-znode"; + + public RestartActiveNameNodeAction(long sleepTime) { + super(sleepTime); + } + + @Override + public void perform() throws Exception { + LOG.info("Performing action: Restart active namenode"); + Configuration conf = FSUtils.getRootDir(getConf()).getFileSystem(getConf()).getConf(); + String nameServiceID = DFSUtil.getNamenodeNameServiceId(conf); + if (!HAUtil.isHAEnabled(conf, nameServiceID)) { + throw new Exception("HA for namenode is not enabled"); + } + ZooKeeperWatcher zkw = null; + RecoverableZooKeeper rzk = null; + String activeNamenode = null; + String hadoopHAZkNode = conf.get(ZK_PARENT_ZNODE_KEY, ZK_PARENT_ZNODE_DEFAULT); + try { + zkw = new ZooKeeperWatcher(conf, "get-active-namenode", null); + rzk = zkw.getRecoverableZooKeeper(); + String hadoopHAZkNodePath = ZKUtil.joinZNode(hadoopHAZkNode, nameServiceID); + List subChildern = ZKUtil.listChildrenNoWatch(zkw, hadoopHAZkNodePath); + for (String eachEntry : subChildern) { + if (eachEntry.contains(ACTIVE_NN_LOCK_NAME)) { + byte[] data = + rzk.getData(ZKUtil.joinZNode(hadoopHAZkNodePath, ACTIVE_NN_LOCK_NAME), false, + null); + ActiveNodeInfo proto = ActiveNodeInfo.parseFrom(data); + activeNamenode = proto.getHostname(); + } + } + } finally { + if (zkw != null) { + zkw.close(); + } + } + if (activeNamenode == null) { + throw new Exception("No active Name node found in zookeeper under " + hadoopHAZkNode); + } + LOG.info("Found active namenode host:" + activeNamenode); + 
ServerName activeNNHost = ServerName.valueOf(activeNamenode, -1, -1); + LOG.info("Restarting Active NameNode :" + activeNamenode); + restartNameNode(activeNNHost, sleepTime); + } +} diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java index ce1da03947e..4432811363a 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java @@ -239,8 +239,44 @@ public abstract class HBaseCluster implements Closeable, Configurable { throws IOException; /** - * Starts a new master on the given hostname or if this is a mini/local cluster, - * starts a master locally. + * Starts a new namenode on the given hostname or if this is a mini/local cluster, silently logs + * warning message. + * @throws IOException if something goes wrong + */ + public abstract void startNameNode(ServerName serverName) throws IOException; + + /** + * Kills the namenode process if this is a distributed cluster, otherwise silently logs a + * warning message. + * @throws IOException if something goes wrong + */ + public abstract void killNameNode(ServerName serverName) throws IOException; + + /** + * Stops the namenode if this is a distributed cluster, otherwise silently logs warning message. 
+ * @throws IOException if something goes wrong + */ + public abstract void stopNameNode(ServerName serverName) throws IOException; + + /** + * Wait for the specified namenode to join the cluster + * @param timeout maximum time to wait for the namenode to start + * @throws IOException if something goes wrong or timeout occurs + */ + public abstract void waitForNameNodeToStart(ServerName serverName, long timeout) + throws IOException; + + /** + * Wait for the specified namenode to stop + * @param timeout maximum time to wait for the namenode to stop + * @throws IOException if something goes wrong or timeout occurs + */ + public abstract void waitForNameNodeToStop(ServerName serverName, long timeout) + throws IOException; + + /** + * Starts a new master on the given hostname or if this is a mini/local cluster, starts a master + * locally. * @param hostname the hostname to start the master on * @return whether the operation finished with success * @throws IOException if something goes wrong diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java index 299e7b98721..7792be37af2 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java @@ -314,6 +314,31 @@ public class MiniHBaseCluster extends HBaseCluster { LOG.warn("Waiting for datanodes to stop on mini cluster is not supported"); } + @Override + public void startNameNode(ServerName serverName) throws IOException { + LOG.warn("Starting namenodes on mini cluster is not supported"); + } + + @Override + public void killNameNode(ServerName serverName) throws IOException { + LOG.warn("Aborting namenodes on mini cluster is not supported"); + } + + @Override + public void stopNameNode(ServerName serverName) throws IOException { + LOG.warn("Stopping namenodes on mini cluster is not supported"); + } + + @Override + public void 
waitForNameNodeToStart(ServerName serverName, long timeout) throws IOException { + LOG.warn("Waiting for namenodes to start on mini cluster is not supported"); + } + + @Override + public void waitForNameNodeToStop(ServerName serverName, long timeout) throws IOException { + LOG.warn("Waiting for namenodes to stop on mini cluster is not supported"); + } + @Override public void startMaster(String hostname, int port) throws IOException { this.startMaster();