From b3e41c9525f0f8537b87bb7bf923cf74c31ee585 Mon Sep 17 00:00:00 2001 From: Monani Mihir Date: Tue, 31 Jul 2018 18:44:45 +0530 Subject: [PATCH] HBASE-19036 Add action in Chaos Monkey to restart Active Namenode Signed-off-by: tedyu --- .../hadoop/hbase/DistributedHBaseCluster.java | 33 ++++++- .../hadoop/hbase/HBaseClusterManager.java | 2 + .../hadoop/hbase/chaos/actions/Action.java | 28 ++++++ .../actions/RestartActionBaseAction.java | 12 +++ .../actions/RestartActiveNameNodeAction.java | 90 +++++++++++++++++++ .../org/apache/hadoop/hbase/HBaseCluster.java | 37 ++++++++ .../apache/hadoop/hbase/MiniHBaseCluster.java | 26 ++++++ 7 files changed, 227 insertions(+), 1 deletion(-) create mode 100644 hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActiveNameNodeAction.java diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java index 943f2a665ef..5ec9e254677 100644 --- a/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java @@ -25,6 +25,7 @@ import java.util.List; import java.util.Objects; import java.util.Set; import java.util.TreeSet; + import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.ClusterManager.ServiceType; import org.apache.hadoop.hbase.client.Admin; @@ -35,7 +36,6 @@ import org.apache.hadoop.hbase.client.RegionLocator; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.Threads; import org.apache.yetus.audience.InterfaceAudience; - import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos; import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.ServerInfo; @@ -204,6 +204,37 @@ public class DistributedHBaseCluster extends HBaseCluster { waitForServiceToStop(ServiceType.HADOOP_DATANODE, 
serverName, timeout); } + @Override + public void startNameNode(ServerName serverName) throws IOException { + LOG.info("Starting name node on: " + serverName.getServerName()); + clusterManager.start(ServiceType.HADOOP_NAMENODE, serverName.getHostname(), + serverName.getPort()); + } + + @Override + public void killNameNode(ServerName serverName) throws IOException { + LOG.info("Aborting name node on: " + serverName.getServerName()); + clusterManager.kill(ServiceType.HADOOP_NAMENODE, serverName.getHostname(), + serverName.getPort()); + } + + @Override + public void stopNameNode(ServerName serverName) throws IOException { + LOG.info("Stopping name node on: " + serverName.getServerName()); + clusterManager.stop(ServiceType.HADOOP_NAMENODE, serverName.getHostname(), + serverName.getPort()); + } + + @Override + public void waitForNameNodeToStart(ServerName serverName, long timeout) throws IOException { + waitForServiceToStart(ServiceType.HADOOP_NAMENODE, serverName, timeout); + } + + @Override + public void waitForNameNodeToStop(ServerName serverName, long timeout) throws IOException { + waitForServiceToStop(ServiceType.HADOOP_NAMENODE, serverName, timeout); + } + private void waitForServiceToStop(ServiceType service, ServerName serverName, long timeout) throws IOException { LOG.info("Waiting for service: " + service + " to stop: " + serverName.getServerName()); diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java index 884ddadd1ae..f7c2fc65280 100644 --- a/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java @@ -101,6 +101,7 @@ public class HBaseClusterManager extends Configured implements ClusterManager { Configuration conf = getConf(); switch (service) { case HADOOP_DATANODE: + case HADOOP_NAMENODE: return conf.get("hbase.it.clustermanager.hadoop.hdfs.user", "hdfs"); 
case ZOOKEEPER_SERVER: return conf.get("hbase.it.clustermanager.zookeeper.user", "zookeeper"); @@ -282,6 +283,7 @@ public class HBaseClusterManager extends Configured implements ClusterManager { protected CommandProvider getCommandProvider(ServiceType service) throws IOException { switch (service) { case HADOOP_DATANODE: + case HADOOP_NAMENODE: return new HadoopShellCommandProvider(getConf()); case ZOOKEEPER_SERVER: return new ZookeeperShellCommandProvider(getConf()); diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java index 2b2c1b8e121..350e18a4544 100644 --- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java @@ -26,6 +26,7 @@ import java.util.List; import java.util.Map; import java.util.function.BiConsumer; import java.util.function.Consumer; + import org.apache.commons.lang3.RandomUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.ClusterMetrics; @@ -65,6 +66,10 @@ public class Action { "hbase.chaosmonkey.action.killdatanodetimeout"; public static final String START_DATANODE_TIMEOUT_KEY = "hbase.chaosmonkey.action.startdatanodetimeout"; + public static final String KILL_NAMENODE_TIMEOUT_KEY = + "hbase.chaosmonkey.action.killnamenodetimeout"; + public static final String START_NAMENODE_TIMEOUT_KEY = + "hbase.chaosmonkey.action.startnamenodetimeout"; protected static final Logger LOG = LoggerFactory.getLogger(Action.class); @@ -76,6 +81,8 @@ public class Action { protected static final long START_ZK_NODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT; protected static final long KILL_DATANODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT; protected static final long START_DATANODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT; + protected static final long KILL_NAMENODE_TIMEOUT_DEFAULT = 
PolicyBasedChaosMonkey.TIMEOUT; + protected static final long START_NAMENODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT; protected ActionContext context; protected HBaseCluster cluster; @@ -90,6 +97,8 @@ public class Action { protected long startZkNodeTimeout; protected long killDataNodeTimeout; protected long startDataNodeTimeout; + protected long killNameNodeTimeout; + protected long startNameNodeTimeout; public void init(ActionContext context) throws IOException { this.context = context; @@ -112,6 +121,11 @@ public class Action { KILL_DATANODE_TIMEOUT_DEFAULT); startDataNodeTimeout = cluster.getConf().getLong(START_DATANODE_TIMEOUT_KEY, START_DATANODE_TIMEOUT_DEFAULT); + killNameNodeTimeout = + cluster.getConf().getLong(KILL_NAMENODE_TIMEOUT_KEY, KILL_NAMENODE_TIMEOUT_DEFAULT); + startNameNodeTimeout = + cluster.getConf().getLong(START_NAMENODE_TIMEOUT_KEY, START_NAMENODE_TIMEOUT_DEFAULT); + } public void perform() throws Exception { } @@ -197,6 +211,20 @@ public class Action { LOG.info("Started datanode " + server); } + protected void killNameNode(ServerName server) throws IOException { + LOG.info("Killing namenode :-" + server.getHostname()); + cluster.killNameNode(server); + cluster.waitForNameNodeToStop(server, killNameNodeTimeout); + LOG.info("Killed namenode:" + server + ". 
Reported num of rs:" + + cluster.getClusterMetrics().getLiveServerMetrics().size()); + } + + protected void startNameNode(ServerName server) throws IOException { + LOG.info("Starting Namenode :-" + server.getHostname()); + cluster.startNameNode(server); + cluster.waitForNameNodeToStart(server, startNameNodeTimeout); + LOG.info("Started namenode:" + server); + } protected void unbalanceRegions(ClusterMetrics clusterStatus, List fromServers, List toServers, double fractionOfRegions) throws Exception { diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActionBaseAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActionBaseAction.java index 63286cb1f9a..6e589aeaa2d 100644 --- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActionBaseAction.java +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActionBaseAction.java @@ -82,4 +82,16 @@ public class RestartActionBaseAction extends Action { sleep(sleepTime); startDataNode(server); } + + void restartNameNode(ServerName server, long sleepTime) throws IOException { + sleepTime = Math.max(sleepTime, 1000); + // Don't try the kill if we're stopping + if (context.isStopping()) { + return; + } + killNameNode(server); + sleep(sleepTime); + startNameNode(server); + } + } diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActiveNameNodeAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActiveNameNodeAction.java new file mode 100644 index 00000000000..645743a7faf --- /dev/null +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActiveNameNodeAction.java @@ -0,0 +1,90 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hbase.chaos.actions; + +import java.util.List; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.util.FSUtils; +import org.apache.hadoop.hbase.zookeeper.RecoverableZooKeeper; +import org.apache.hadoop.hbase.zookeeper.ZKUtil; +import org.apache.hadoop.hbase.zookeeper.ZKWatcher; +import org.apache.hadoop.hbase.zookeeper.ZNodePaths; +import org.apache.hadoop.hdfs.DFSUtil; +import org.apache.hadoop.hdfs.HAUtil; +import org.apache.hadoop.hdfs.server.namenode.ha.proto.HAZKInfoProtos.ActiveNodeInfo; + +/** + * Action that tries to restart the active namenode. 
+ */ +public class RestartActiveNameNodeAction extends RestartActionBaseAction { + + // Value taken from org.apache.hadoop.ha.ActiveStandbyElector.java, variable :- LOCK_FILENAME + private static final String ACTIVE_NN_LOCK_NAME = "ActiveStandbyElectorLock"; + + // Value taken from org.apache.hadoop.ha.ZKFailoverController.java + // variable :- ZK_PARENT_ZNODE_DEFAULT and ZK_PARENT_ZNODE_KEY + private static final String ZK_PARENT_ZNODE_DEFAULT = "/hadoop-ha"; + private static final String ZK_PARENT_ZNODE_KEY = "ha.zookeeper.parent-znode"; + + public RestartActiveNameNodeAction(long sleepTime) { + super(sleepTime); + } + + @Override + public void perform() throws Exception { + LOG.info("Performing action: Restart active namenode"); + Configuration conf = FSUtils.getRootDir(getConf()).getFileSystem(getConf()).getConf(); + String nameServiceID = DFSUtil.getNamenodeNameServiceId(conf); + if (!HAUtil.isHAEnabled(conf, nameServiceID)) { + throw new Exception("HA for namenode is not enabled"); + } + ZKWatcher zkw = null; + RecoverableZooKeeper rzk = null; + String activeNamenode = null; + String hadoopHAZkNode = conf.get(ZK_PARENT_ZNODE_KEY, ZK_PARENT_ZNODE_DEFAULT); + try { + zkw = new ZKWatcher(conf, "get-active-namenode", null); + rzk = zkw.getRecoverableZooKeeper(); + String hadoopHAZkNodePath = ZNodePaths.joinZNode(hadoopHAZkNode, nameServiceID); + List subChildern = ZKUtil.listChildrenNoWatch(zkw, hadoopHAZkNodePath); + for (String eachEntry : subChildern) { + if (eachEntry.contains(ACTIVE_NN_LOCK_NAME)) { + byte[] data = + rzk.getData(ZNodePaths.joinZNode(hadoopHAZkNodePath, ACTIVE_NN_LOCK_NAME), false, + null); + ActiveNodeInfo proto = ActiveNodeInfo.parseFrom(data); + activeNamenode = proto.getHostname(); + } + } + } finally { + if (zkw != null) { + zkw.close(); + } + } + if (activeNamenode == null) { + throw new Exception("No active Name node found in zookeeper under " + hadoopHAZkNode); + } + LOG.info("Found active namenode host:" + activeNamenode); + 
ServerName activeNNHost = ServerName.valueOf(activeNamenode, -1, -1); + LOG.info("Restarting Active NameNode :" + activeNamenode); + restartNameNode(activeNNHost, sleepTime); + } +} diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java index 59a00591614..a1b474deb1e 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java @@ -19,6 +19,7 @@ package org.apache.hadoop.hbase; import java.io.Closeable; import java.io.IOException; + import org.apache.hadoop.conf.Configurable; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.util.Threads; @@ -247,6 +248,42 @@ public abstract class HBaseCluster implements Closeable, Configurable { public abstract void waitForDataNodeToStop(ServerName serverName, long timeout) throws IOException; + /** + * Starts a new namenode on the given hostname or if this is a mini/local cluster, silently logs + * warning message. + * @throws IOException if something goes wrong + */ + public abstract void startNameNode(ServerName serverName) throws IOException; + + /** + * Kills the namenode process if this is a distributed cluster, otherwise silently logs a + * warning message. + * @throws IOException if something goes wrong + */ + public abstract void killNameNode(ServerName serverName) throws IOException; + + /** + * Stops the namenode if this is a distributed cluster, otherwise silently logs warning message. 
+ * @throws IOException if something goes wrong + */ + public abstract void stopNameNode(ServerName serverName) throws IOException; + + /** + * Wait for the specified namenode to join the cluster + * @param timeout how long to wait before failing with an IOException + * @throws IOException if something goes wrong or timeout occurs + */ + public abstract void waitForNameNodeToStart(ServerName serverName, long timeout) + throws IOException; + + /** + * Wait for the specified namenode to stop + * @param timeout how long to wait before failing with an IOException + * @throws IOException if something goes wrong or timeout occurs + */ + public abstract void waitForNameNodeToStop(ServerName serverName, long timeout) + throws IOException; + /** * Starts a new master on the given hostname or if this is a mini/local cluster, * starts a master locally. diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java index 5eb72188e90..473eb74abf3 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java @@ -24,6 +24,7 @@ import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Set; + import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.hbase.master.HMaster; @@ -348,6 +349,31 @@ public class MiniHBaseCluster extends HBaseCluster { LOG.warn("Waiting for datanodes to stop on mini cluster is not supported"); } + @Override + public void startNameNode(ServerName serverName) throws IOException { + LOG.warn("Starting namenodes on mini cluster is not supported"); + } + + @Override + public void killNameNode(ServerName serverName) throws IOException { + LOG.warn("Aborting namenodes on mini cluster is not supported"); + } + + @Override + public void stopNameNode(ServerName serverName) throws IOException { + 
LOG.warn("Stopping namenodes on mini cluster is not supported"); + } + + @Override + public void waitForNameNodeToStart(ServerName serverName, long timeout) throws IOException { + LOG.warn("Waiting for namenodes to start on mini cluster is not supported"); + } + + @Override + public void waitForNameNodeToStop(ServerName serverName, long timeout) throws IOException { + LOG.warn("Waiting for namenodes to stop on mini cluster is not supported"); + } + @Override public void startMaster(String hostname, int port) throws IOException { this.startMaster();