HBASE-19036 Add action in Chaos Monkey to restart Active Namenode

Signed-off-by: tedyu <yuzhihong@gmail.com>
This commit is contained in:
Monani Mihir 2018-07-31 18:40:24 +05:30 committed by tedyu
parent a15c445743
commit 0298c06b4f
7 changed files with 223 additions and 2 deletions

View File

@ -190,6 +190,37 @@ public class DistributedHBaseCluster extends HBaseCluster {
waitForServiceToStop(ServiceType.HADOOP_DATANODE, serverName, timeout); waitForServiceToStop(ServiceType.HADOOP_DATANODE, serverName, timeout);
} }
/**
 * Starts the HDFS namenode process on the given server by delegating to the cluster manager.
 * @param serverName server (host/port) the namenode should be started on
 * @throws IOException if the underlying cluster manager fails to start the service
 */
@Override
public void startNameNode(ServerName serverName) throws IOException {
  LOG.info("Starting name node on: " + serverName.getServerName());
  final String host = serverName.getHostname();
  final int port = serverName.getPort();
  clusterManager.start(ServiceType.HADOOP_NAMENODE, host, port);
}
/**
 * Forcibly kills the HDFS namenode process on the given server via the cluster manager.
 * @param serverName server (host/port) hosting the namenode to kill
 * @throws IOException if the underlying cluster manager fails to kill the service
 */
@Override
public void killNameNode(ServerName serverName) throws IOException {
  LOG.info("Aborting name node on: " + serverName.getServerName());
  final String host = serverName.getHostname();
  final int port = serverName.getPort();
  clusterManager.kill(ServiceType.HADOOP_NAMENODE, host, port);
}
/**
 * Gracefully stops the HDFS namenode process on the given server via the cluster manager.
 * @param serverName server (host/port) hosting the namenode to stop
 * @throws IOException if the underlying cluster manager fails to stop the service
 */
@Override
public void stopNameNode(ServerName serverName) throws IOException {
  LOG.info("Stopping name node on: " + serverName.getServerName());
  final String host = serverName.getHostname();
  final int port = serverName.getPort();
  clusterManager.stop(ServiceType.HADOOP_NAMENODE, host, port);
}
/**
 * Blocks until the namenode service on the given server is reported started, or the
 * timeout (ms) elapses. Delegates to the generic per-service wait helper.
 */
@Override
public void waitForNameNodeToStart(ServerName serverName, long timeout) throws IOException {
waitForServiceToStart(ServiceType.HADOOP_NAMENODE, serverName, timeout);
}
/**
 * Blocks until the namenode service on the given server is reported stopped, or the
 * timeout (ms) elapses. Delegates to the generic per-service wait helper.
 */
@Override
public void waitForNameNodeToStop(ServerName serverName, long timeout) throws IOException {
waitForServiceToStop(ServiceType.HADOOP_NAMENODE, serverName, timeout);
}
private void waitForServiceToStop(ServiceType service, ServerName serverName, long timeout) private void waitForServiceToStop(ServiceType service, ServerName serverName, long timeout)
throws IOException { throws IOException {
LOG.info("Waiting for service: " + service + " to stop: " + serverName.getServerName()); LOG.info("Waiting for service: " + service + " to stop: " + serverName.getServerName());

View File

@ -101,6 +101,7 @@ public class HBaseClusterManager extends Configured implements ClusterManager {
Configuration conf = getConf(); Configuration conf = getConf();
switch (service) { switch (service) {
case HADOOP_DATANODE: case HADOOP_DATANODE:
case HADOOP_NAMENODE:
return conf.get("hbase.it.clustermanager.hadoop.hdfs.user", "hdfs"); return conf.get("hbase.it.clustermanager.hadoop.hdfs.user", "hdfs");
case ZOOKEEPER_SERVER: case ZOOKEEPER_SERVER:
return conf.get("hbase.it.clustermanager.zookeeper.user", "zookeeper"); return conf.get("hbase.it.clustermanager.zookeeper.user", "zookeeper");
@ -282,6 +283,7 @@ public class HBaseClusterManager extends Configured implements ClusterManager {
protected CommandProvider getCommandProvider(ServiceType service) throws IOException { protected CommandProvider getCommandProvider(ServiceType service) throws IOException {
switch (service) { switch (service) {
case HADOOP_DATANODE: case HADOOP_DATANODE:
case HADOOP_NAMENODE:
return new HadoopShellCommandProvider(getConf()); return new HadoopShellCommandProvider(getConf());
case ZOOKEEPER_SERVER: case ZOOKEEPER_SERVER:
return new ZookeeperShellCommandProvider(getConf()); return new ZookeeperShellCommandProvider(getConf());

View File

@ -57,6 +57,10 @@ public class Action {
"hbase.chaosmonkey.action.killdatanodetimeout"; "hbase.chaosmonkey.action.killdatanodetimeout";
public static final String START_DATANODE_TIMEOUT_KEY = public static final String START_DATANODE_TIMEOUT_KEY =
"hbase.chaosmonkey.action.startdatanodetimeout"; "hbase.chaosmonkey.action.startdatanodetimeout";
public static final String KILL_NAMENODE_TIMEOUT_KEY =
"hbase.chaosmonkey.action.killnamenodetimeout";
public static final String START_NAMENODE_TIMEOUT_KEY =
"hbase.chaosmonkey.action.startnamenodetimeout";
protected static final Log LOG = LogFactory.getLog(Action.class); protected static final Log LOG = LogFactory.getLog(Action.class);
@ -68,6 +72,8 @@ public class Action {
protected static final long START_ZK_NODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT; protected static final long START_ZK_NODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
protected static final long KILL_DATANODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT; protected static final long KILL_DATANODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
protected static final long START_DATANODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT; protected static final long START_DATANODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
protected static final long KILL_NAMENODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
protected static final long START_NAMENODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
protected ActionContext context; protected ActionContext context;
protected HBaseCluster cluster; protected HBaseCluster cluster;
@ -82,6 +88,8 @@ public class Action {
protected long startZkNodeTimeout; protected long startZkNodeTimeout;
protected long killDataNodeTimeout; protected long killDataNodeTimeout;
protected long startDataNodeTimeout; protected long startDataNodeTimeout;
protected long killNameNodeTimeout;
protected long startNameNodeTimeout;
public void init(ActionContext context) throws IOException { public void init(ActionContext context) throws IOException {
this.context = context; this.context = context;
@ -104,6 +112,10 @@ public class Action {
KILL_DATANODE_TIMEOUT_DEFAULT); KILL_DATANODE_TIMEOUT_DEFAULT);
startDataNodeTimeout = cluster.getConf().getLong(START_DATANODE_TIMEOUT_KEY, startDataNodeTimeout = cluster.getConf().getLong(START_DATANODE_TIMEOUT_KEY,
START_DATANODE_TIMEOUT_DEFAULT); START_DATANODE_TIMEOUT_DEFAULT);
killNameNodeTimeout =
cluster.getConf().getLong(KILL_NAMENODE_TIMEOUT_KEY, KILL_NAMENODE_TIMEOUT_DEFAULT);
startNameNodeTimeout =
cluster.getConf().getLong(START_NAMENODE_TIMEOUT_KEY, START_NAMENODE_TIMEOUT_DEFAULT);
} }
public void perform() throws Exception { } public void perform() throws Exception { }
@ -189,6 +201,21 @@ public class Action {
LOG.info("Started datanode:" + server); LOG.info("Started datanode:" + server);
} }
/**
 * Kills the namenode on the given server and waits (up to killNameNodeTimeout ms)
 * for it to be reported stopped, then logs the surviving regionserver count.
 * @param server server hosting the namenode to kill
 * @throws IOException if the kill or the wait fails
 */
protected void killNameNode(ServerName server) throws IOException {
  LOG.info("Killing namenode :-" + server.getHostname());
  cluster.killNameNode(server);
  cluster.waitForNameNodeToStop(server, killNameNodeTimeout);
  final int liveRegionServers = cluster.getClusterStatus().getServersSize();
  LOG.info("Killed namenode:" + server + ". Reported num of rs:" + liveRegionServers);
}
/**
 * Starts the namenode on the given server and waits (up to startNameNodeTimeout ms)
 * for it to be reported running.
 * @param server server the namenode should be started on
 * @throws IOException if the start or the wait fails
 */
protected void startNameNode(ServerName server) throws IOException {
  LOG.info("Starting Namenode :-" + server.getHostname());
  cluster.startNameNode(server);
  cluster.waitForNameNodeToStart(server, startNameNodeTimeout);
  LOG.info("Started namenode:" + server);
}
protected void unbalanceRegions(ClusterStatus clusterStatus, protected void unbalanceRegions(ClusterStatus clusterStatus,
List<ServerName> fromServers, List<ServerName> toServers, List<ServerName> fromServers, List<ServerName> toServers,
double fractionOfRegions) throws Exception { double fractionOfRegions) throws Exception {

View File

@ -82,4 +82,15 @@ public class RestartActionBaseAction extends Action {
sleep(sleepTime); sleep(sleepTime);
startDataNode(server); startDataNode(server);
} }
/**
 * Restarts the namenode on the given server: kill it, pause for sleepTime ms
 * (clamped to at least one second), then start it again. Does nothing if the
 * chaos monkey is already shutting down.
 * @param server server hosting the namenode to restart
 * @param sleepTime requested pause between kill and restart, in milliseconds
 * @throws IOException if any step of the restart fails
 */
void restartNameNode(ServerName server, long sleepTime) throws IOException {
  // Don't try the kill if we're stopping
  if (context.isStopping()) {
    return;
  }
  final long pauseMillis = Math.max(sleepTime, 1000);
  killNameNode(server);
  sleep(pauseMillis);
  startNameNode(server);
}
} }

View File

@ -0,0 +1,89 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.chaos.actions;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.hadoop.hbase.zookeeper.RecoverableZooKeeper;
import org.apache.hadoop.hbase.zookeeper.ZKUtil;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.HAUtil;
import org.apache.hadoop.hdfs.server.namenode.ha.proto.HAZKInfoProtos.ActiveNodeInfo;
/**
* Action that tries to restart the active namenode.
*/
public class RestartActiveNameNodeAction extends RestartActionBaseAction {
// Value taken from org.apache.hadoop.ha.ActiveStandbyElector.java, variable :- LOCK_FILENAME
private static final String ACTIVE_NN_LOCK_NAME = "ActiveStandbyElectorLock";
// Value taken from org.apache.hadoop.ha.ZKFailoverController.java
// variable :- ZK_PARENT_ZNODE_DEFAULT and ZK_PARENT_ZNODE_KEY
private static final String ZK_PARENT_ZNODE_DEFAULT = "/hadoop-ha";
private static final String ZK_PARENT_ZNODE_KEY = "ha.zookeeper.parent-znode";
public RestartActiveNameNodeAction(long sleepTime) {
super(sleepTime);
}
@Override
public void perform() throws Exception {
LOG.info("Performing action: Restart active namenode");
Configuration conf = FSUtils.getRootDir(getConf()).getFileSystem(getConf()).getConf();
String nameServiceID = DFSUtil.getNamenodeNameServiceId(conf);
if (!HAUtil.isHAEnabled(conf, nameServiceID)) {
throw new Exception("HA for namenode is not enabled");
}
ZooKeeperWatcher zkw = null;
RecoverableZooKeeper rzk = null;
String activeNamenode = null;
String hadoopHAZkNode = conf.get(ZK_PARENT_ZNODE_KEY, ZK_PARENT_ZNODE_DEFAULT);
try {
zkw = new ZooKeeperWatcher(conf, "get-active-namenode", null);
rzk = zkw.getRecoverableZooKeeper();
String hadoopHAZkNodePath = ZKUtil.joinZNode(hadoopHAZkNode, nameServiceID);
List<String> subChildern = ZKUtil.listChildrenNoWatch(zkw, hadoopHAZkNodePath);
for (String eachEntry : subChildern) {
if (eachEntry.contains(ACTIVE_NN_LOCK_NAME)) {
byte[] data =
rzk.getData(ZKUtil.joinZNode(hadoopHAZkNodePath, ACTIVE_NN_LOCK_NAME), false,
null);
ActiveNodeInfo proto = ActiveNodeInfo.parseFrom(data);
activeNamenode = proto.getHostname();
}
}
} finally {
if (zkw != null) {
zkw.close();
}
}
if (activeNamenode == null) {
throw new Exception("No active Name node found in zookeeper under " + hadoopHAZkNode);
}
LOG.info("Found active namenode host:" + activeNamenode);
ServerName activeNNHost = ServerName.valueOf(activeNamenode, -1, -1);
LOG.info("Restarting Active NameNode :" + activeNamenode);
restartNameNode(activeNNHost, sleepTime);
}
}

View File

@ -239,8 +239,44 @@ public abstract class HBaseCluster implements Closeable, Configurable {
throws IOException; throws IOException;
/** /**
* Starts a new master on the given hostname or if this is a mini/local cluster, * Starts a new namenode on the given hostname or if this is a mini/local cluster, silently logs
* starts a master locally. * warning message.
* @throws IOException if something goes wrong
*/
public abstract void startNameNode(ServerName serverName) throws IOException;
/**
 * Kills the namenode process if this is a distributed cluster; on a mini/local cluster this
 * operation is not supported and only a warning is logged.
 * @param serverName the server hosting the namenode to kill
 * @throws IOException if something goes wrong
 */
public abstract void killNameNode(ServerName serverName) throws IOException;
/**
 * Stops the namenode if this is a distributed cluster, otherwise silently logs warning message.
 * @param serverName the server hosting the namenode to stop
 * @throws IOException if something goes wrong
 */
public abstract void stopNameNode(ServerName serverName) throws IOException;
/**
 * Wait for the specified namenode to join the cluster
 * @param serverName the server hosting the namenode to wait on
 * @param timeout maximum time to wait, in milliseconds
 * @throws IOException if something goes wrong or timeout occurs
 */
public abstract void waitForNameNodeToStart(ServerName serverName, long timeout)
throws IOException;
/**
 * Wait for the specified namenode to stop
 * @param serverName the server hosting the namenode to wait on
 * @param timeout maximum time to wait, in milliseconds
 * @throws IOException if something goes wrong or timeout occurs
 */
public abstract void waitForNameNodeToStop(ServerName serverName, long timeout)
throws IOException;
/**
* Starts a new master on the given hostname or if this is a mini/local cluster, starts a master
* locally.
* @param hostname the hostname to start the master on * @param hostname the hostname to start the master on
* @return whether the operation finished with success * @return whether the operation finished with success
* @throws IOException if something goes wrong * @throws IOException if something goes wrong

View File

@ -314,6 +314,31 @@ public class MiniHBaseCluster extends HBaseCluster {
LOG.warn("Waiting for datanodes to stop on mini cluster is not supported"); LOG.warn("Waiting for datanodes to stop on mini cluster is not supported");
} }
/**
 * Not supported on a mini cluster: logs a warning and returns without starting anything.
 */
@Override
public void startNameNode(ServerName serverName) throws IOException {
LOG.warn("Starting namenodes on mini cluster is not supported");
}
/**
 * Not supported on a mini cluster: logs a warning and returns without killing anything.
 */
@Override
public void killNameNode(ServerName serverName) throws IOException {
LOG.warn("Aborting namenodes on mini cluster is not supported");
}
/**
 * Not supported on a mini cluster: logs a warning and returns without stopping anything.
 */
@Override
public void stopNameNode(ServerName serverName) throws IOException {
LOG.warn("Stopping namenodes on mini cluster is not supported");
}
/**
 * Not supported on a mini cluster: logs a warning and returns immediately without waiting.
 */
@Override
public void waitForNameNodeToStart(ServerName serverName, long timeout) throws IOException {
LOG.warn("Waiting for namenodes to start on mini cluster is not supported");
}
/**
 * Not supported on a mini cluster: logs a warning and returns immediately without waiting.
 */
@Override
public void waitForNameNodeToStop(ServerName serverName, long timeout) throws IOException {
LOG.warn("Waiting for namenodes to stop on mini cluster is not supported");
}
@Override @Override
public void startMaster(String hostname, int port) throws IOException { public void startMaster(String hostname, int port) throws IOException {
this.startMaster(); this.startMaster();