HBASE-19036 Add action in Chaos Monkey to restart Active Namenode
Signed-off-by: tedyu <yuzhihong@gmail.com>
commit 0298c06b4f
parent a15c445743
@@ -190,6 +190,37 @@ public class DistributedHBaseCluster extends HBaseCluster {
     waitForServiceToStop(ServiceType.HADOOP_DATANODE, serverName, timeout);
   }
 
+  @Override
+  public void startNameNode(ServerName serverName) throws IOException {
+    LOG.info("Starting name node on: " + serverName.getServerName());
+    clusterManager.start(ServiceType.HADOOP_NAMENODE, serverName.getHostname(),
+      serverName.getPort());
+  }
+
+  @Override
+  public void killNameNode(ServerName serverName) throws IOException {
+    LOG.info("Aborting name node on: " + serverName.getServerName());
+    clusterManager.kill(ServiceType.HADOOP_NAMENODE, serverName.getHostname(),
+      serverName.getPort());
+  }
+
+  @Override
+  public void stopNameNode(ServerName serverName) throws IOException {
+    LOG.info("Stopping name node on: " + serverName.getServerName());
+    clusterManager.stop(ServiceType.HADOOP_NAMENODE, serverName.getHostname(),
+      serverName.getPort());
+  }
+
+  @Override
+  public void waitForNameNodeToStart(ServerName serverName, long timeout) throws IOException {
+    waitForServiceToStart(ServiceType.HADOOP_NAMENODE, serverName, timeout);
+  }
+
+  @Override
+  public void waitForNameNodeToStop(ServerName serverName, long timeout) throws IOException {
+    waitForServiceToStop(ServiceType.HADOOP_NAMENODE, serverName, timeout);
+  }
+
   private void waitForServiceToStop(ServiceType service, ServerName serverName, long timeout)
       throws IOException {
     LOG.info("Waiting for service: " + service + " to stop: " + serverName.getServerName());
@@ -101,6 +101,7 @@ public class HBaseClusterManager extends Configured implements ClusterManager {
     Configuration conf = getConf();
     switch (service) {
       case HADOOP_DATANODE:
+      case HADOOP_NAMENODE:
         return conf.get("hbase.it.clustermanager.hadoop.hdfs.user", "hdfs");
       case ZOOKEEPER_SERVER:
         return conf.get("hbase.it.clustermanager.zookeeper.user", "zookeeper");
@@ -282,6 +283,7 @@ public class HBaseClusterManager extends Configured implements ClusterManager {
   protected CommandProvider getCommandProvider(ServiceType service) throws IOException {
     switch (service) {
       case HADOOP_DATANODE:
+      case HADOOP_NAMENODE:
         return new HadoopShellCommandProvider(getConf());
       case ZOOKEEPER_SERVER:
         return new ZookeeperShellCommandProvider(getConf());
@@ -57,6 +57,10 @@ public class Action {
       "hbase.chaosmonkey.action.killdatanodetimeout";
   public static final String START_DATANODE_TIMEOUT_KEY =
       "hbase.chaosmonkey.action.startdatanodetimeout";
+  public static final String KILL_NAMENODE_TIMEOUT_KEY =
+      "hbase.chaosmonkey.action.killnamenodetimeout";
+  public static final String START_NAMENODE_TIMEOUT_KEY =
+      "hbase.chaosmonkey.action.startnamenodetimeout";
 
   protected static final Log LOG = LogFactory.getLog(Action.class);
 
@@ -68,6 +72,8 @@ public class Action {
   protected static final long START_ZK_NODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
   protected static final long KILL_DATANODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
   protected static final long START_DATANODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
+  protected static final long KILL_NAMENODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
+  protected static final long START_NAMENODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
 
   protected ActionContext context;
   protected HBaseCluster cluster;
@@ -82,6 +88,8 @@ public class Action {
   protected long startZkNodeTimeout;
   protected long killDataNodeTimeout;
   protected long startDataNodeTimeout;
+  protected long killNameNodeTimeout;
+  protected long startNameNodeTimeout;
 
   public void init(ActionContext context) throws IOException {
     this.context = context;
@@ -104,6 +112,10 @@ public class Action {
       KILL_DATANODE_TIMEOUT_DEFAULT);
     startDataNodeTimeout = cluster.getConf().getLong(START_DATANODE_TIMEOUT_KEY,
       START_DATANODE_TIMEOUT_DEFAULT);
+    killNameNodeTimeout =
+        cluster.getConf().getLong(KILL_NAMENODE_TIMEOUT_KEY, KILL_NAMENODE_TIMEOUT_DEFAULT);
+    startNameNodeTimeout =
+        cluster.getConf().getLong(START_NAMENODE_TIMEOUT_KEY, START_NAMENODE_TIMEOUT_DEFAULT);
   }
 
   public void perform() throws Exception { }
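The new namenode timeouts default to PolicyBasedChaosMonkey.TIMEOUT and, like the existing datanode timeouts, are read from the cluster Configuration in init(). A minimal illustrative snippet of overriding them programmatically, assuming a standard HBaseConfiguration; the 5-minute values are arbitrary examples, not defaults from this commit:

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;

public class NameNodeChaosTimeoutsExample {
  public static void main(String[] args) {
    // Example only: override the chaos-monkey namenode timeouts (milliseconds).
    Configuration conf = HBaseConfiguration.create();
    conf.setLong("hbase.chaosmonkey.action.killnamenodetimeout", 5 * 60 * 1000L);
    conf.setLong("hbase.chaosmonkey.action.startnamenodetimeout", 5 * 60 * 1000L);

    // Action.init(...) picks these up via cluster.getConf().getLong(KEY, DEFAULT).
    System.out.println(conf.getLong("hbase.chaosmonkey.action.killnamenodetimeout", -1L));
  }
}
```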
@@ -189,6 +201,21 @@ public class Action {
     LOG.info("Started datanode:" + server);
   }
 
+  protected void killNameNode(ServerName server) throws IOException {
+    LOG.info("Killing namenode :-" + server.getHostname());
+    cluster.killNameNode(server);
+    cluster.waitForNameNodeToStop(server, killNameNodeTimeout);
+    LOG.info("Killed namenode:" + server + ". Reported num of rs:"
+        + cluster.getClusterStatus().getServersSize());
+  }
+
+  protected void startNameNode(ServerName server) throws IOException {
+    LOG.info("Starting Namenode :-" + server.getHostname());
+    cluster.startNameNode(server);
+    cluster.waitForNameNodeToStart(server, startNameNodeTimeout);
+    LOG.info("Started namenode:" + server);
+  }
+
   protected void unbalanceRegions(ClusterStatus clusterStatus,
       List<ServerName> fromServers, List<ServerName> toServers,
       double fractionOfRegions) throws Exception {
@@ -82,4 +82,15 @@ public class RestartActionBaseAction extends Action {
     sleep(sleepTime);
     startDataNode(server);
   }
+
+  void restartNameNode(ServerName server, long sleepTime) throws IOException {
+    sleepTime = Math.max(sleepTime, 1000);
+    // Don't try the kill if we're stopping
+    if (context.isStopping()) {
+      return;
+    }
+    killNameNode(server);
+    sleep(sleepTime);
+    startNameNode(server);
+  }
 }
@@ -0,0 +1,89 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hbase.chaos.actions;
+
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.ServerName;
+import org.apache.hadoop.hbase.util.FSUtils;
+import org.apache.hadoop.hbase.zookeeper.RecoverableZooKeeper;
+import org.apache.hadoop.hbase.zookeeper.ZKUtil;
+import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
+import org.apache.hadoop.hdfs.DFSUtil;
+import org.apache.hadoop.hdfs.HAUtil;
+import org.apache.hadoop.hdfs.server.namenode.ha.proto.HAZKInfoProtos.ActiveNodeInfo;
+
+/**
+ * Action that tries to restart the active namenode.
+ */
+public class RestartActiveNameNodeAction extends RestartActionBaseAction {
+
+  // Value taken from org.apache.hadoop.ha.ActiveStandbyElector.java, variable :- LOCK_FILENAME
+  private static final String ACTIVE_NN_LOCK_NAME = "ActiveStandbyElectorLock";
+
+  // Value taken from org.apache.hadoop.ha.ZKFailoverController.java
+  // variable :- ZK_PARENT_ZNODE_DEFAULT and ZK_PARENT_ZNODE_KEY
+  private static final String ZK_PARENT_ZNODE_DEFAULT = "/hadoop-ha";
+  private static final String ZK_PARENT_ZNODE_KEY = "ha.zookeeper.parent-znode";
+
+  public RestartActiveNameNodeAction(long sleepTime) {
+    super(sleepTime);
+  }
+
+  @Override
+  public void perform() throws Exception {
+    LOG.info("Performing action: Restart active namenode");
+    Configuration conf = FSUtils.getRootDir(getConf()).getFileSystem(getConf()).getConf();
+    String nameServiceID = DFSUtil.getNamenodeNameServiceId(conf);
+    if (!HAUtil.isHAEnabled(conf, nameServiceID)) {
+      throw new Exception("HA for namenode is not enabled");
+    }
+    ZooKeeperWatcher zkw = null;
+    RecoverableZooKeeper rzk = null;
+    String activeNamenode = null;
+    String hadoopHAZkNode = conf.get(ZK_PARENT_ZNODE_KEY, ZK_PARENT_ZNODE_DEFAULT);
+    try {
+      zkw = new ZooKeeperWatcher(conf, "get-active-namenode", null);
+      rzk = zkw.getRecoverableZooKeeper();
+      String hadoopHAZkNodePath = ZKUtil.joinZNode(hadoopHAZkNode, nameServiceID);
+      List<String> subChildern = ZKUtil.listChildrenNoWatch(zkw, hadoopHAZkNodePath);
+      for (String eachEntry : subChildern) {
+        if (eachEntry.contains(ACTIVE_NN_LOCK_NAME)) {
+          byte[] data =
+              rzk.getData(ZKUtil.joinZNode(hadoopHAZkNodePath, ACTIVE_NN_LOCK_NAME), false,
+                null);
+          ActiveNodeInfo proto = ActiveNodeInfo.parseFrom(data);
+          activeNamenode = proto.getHostname();
+        }
+      }
+    } finally {
+      if (zkw != null) {
+        zkw.close();
+      }
+    }
+    if (activeNamenode == null) {
+      throw new Exception("No active Name node found in zookeeper under " + hadoopHAZkNode);
+    }
+    LOG.info("Found active namenode host:" + activeNamenode);
+    ServerName activeNNHost = ServerName.valueOf(activeNamenode, -1, -1);
+    LOG.info("Restarting Active NameNode :" + activeNamenode);
+    restartNameNode(activeNNHost, sleepTime);
+  }
+}
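A minimal sketch of how this new action could be driven by a chaos monkey. Only RestartActiveNameNodeAction comes from this commit; the surrounding classes (IntegrationTestingUtility, PeriodicRandomActionPolicy, PolicyBasedChaosMonkey) and their signatures are assumed from the existing chaos framework, and the period/sleep values are arbitrary:

```java
// Hypothetical wiring sketch; assumes an HA-enabled HDFS and a real distributed cluster,
// since the mini-cluster overrides below only log warnings.
import org.apache.hadoop.hbase.IntegrationTestingUtility;
import org.apache.hadoop.hbase.chaos.actions.Action;
import org.apache.hadoop.hbase.chaos.actions.RestartActiveNameNodeAction;
import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
import org.apache.hadoop.hbase.chaos.policies.PeriodicRandomActionPolicy;

public class RestartActiveNameNodeMonkeyExample {
  public static void main(String[] args) throws Exception {
    IntegrationTestingUtility util = new IntegrationTestingUtility();
    util.initializeCluster(1);

    // Restart the active namenode roughly every 10 minutes,
    // sleeping 60 seconds between the kill and the restart.
    Action restartActiveNN = new RestartActiveNameNodeAction(60_000L);
    PolicyBasedChaosMonkey monkey = new PolicyBasedChaosMonkey(util,
        new PeriodicRandomActionPolicy(600_000L, restartActiveNN));

    monkey.start();
    // ... run the workload under test ...
    monkey.stop("test done");
    monkey.waitForStop();
  }
}
```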
@@ -239,8 +239,44 @@ public abstract class HBaseCluster implements Closeable, Configurable {
       throws IOException;
 
   /**
-   * Starts a new master on the given hostname or if this is a mini/local cluster,
-   * starts a master locally.
+   * Starts a new namenode on the given hostname or if this is a mini/local cluster, silently logs
+   * warning message.
+   * @throws IOException if something goes wrong
+   */
+  public abstract void startNameNode(ServerName serverName) throws IOException;
+
+  /**
+   * Kills the namenode process if this is a distributed cluster, otherwise, this causes master to
+   * exit doing basic clean up only.
+   * @throws IOException if something goes wrong
+   */
+  public abstract void killNameNode(ServerName serverName) throws IOException;
+
+  /**
+   * Stops the namenode if this is a distributed cluster, otherwise silently logs warning message.
+   * @throws IOException if something goes wrong
+   */
+  public abstract void stopNameNode(ServerName serverName) throws IOException;
+
+  /**
+   * Wait for the specified namenode to join the cluster
+   * @return whether the operation finished with success
+   * @throws IOException if something goes wrong or timeout occurs
+   */
+  public abstract void waitForNameNodeToStart(ServerName serverName, long timeout)
+      throws IOException;
+
+  /**
+   * Wait for the specified namenode to stop
+   * @return whether the operation finished with success
+   * @throws IOException if something goes wrong or timeout occurs
+   */
+  public abstract void waitForNameNodeToStop(ServerName serverName, long timeout)
+      throws IOException;
+
+  /**
+   * Starts a new master on the given hostname or if this is a mini/local cluster, starts a master
+   * locally.
    * @param hostname the hostname to start the master on
    * @return whether the operation finished with success
    * @throws IOException if something goes wrong
@@ -314,6 +314,31 @@ public class MiniHBaseCluster extends HBaseCluster {
     LOG.warn("Waiting for datanodes to stop on mini cluster is not supported");
   }
 
+  @Override
+  public void startNameNode(ServerName serverName) throws IOException {
+    LOG.warn("Starting namenodes on mini cluster is not supported");
+  }
+
+  @Override
+  public void killNameNode(ServerName serverName) throws IOException {
+    LOG.warn("Aborting namenodes on mini cluster is not supported");
+  }
+
+  @Override
+  public void stopNameNode(ServerName serverName) throws IOException {
+    LOG.warn("Stopping namenodes on mini cluster is not supported");
+  }
+
+  @Override
+  public void waitForNameNodeToStart(ServerName serverName, long timeout) throws IOException {
+    LOG.warn("Waiting for namenodes to start on mini cluster is not supported");
+  }
+
+  @Override
+  public void waitForNameNodeToStop(ServerName serverName, long timeout) throws IOException {
+    LOG.warn("Waiting for namenodes to stop on mini cluster is not supported");
+  }
+
   @Override
   public void startMaster(String hostname, int port) throws IOException {
     this.startMaster();