HBASE-19036 Add action in Chaos Monkey to restart Active Namenode

Signed-off-by: tedyu <yuzhihong@gmail.com>
This commit is contained in:
Monani Mihir 2018-07-31 18:44:45 +05:30 committed by tedyu
parent 690d29bae7
commit 06a92a3d20
7 changed files with 227 additions and 1 deletions

View File

@ -25,6 +25,7 @@ import java.util.List;
import java.util.Objects;
import java.util.Set;
import java.util.TreeSet;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.ClusterManager.ServiceType;
import org.apache.hadoop.hbase.client.Admin;
@ -35,7 +36,6 @@ import org.apache.hadoop.hbase.client.RegionLocator;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Threads;
import org.apache.yetus.audience.InterfaceAudience;
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos;
import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.ServerInfo;
@ -204,6 +204,37 @@ public class DistributedHBaseCluster extends HBaseCluster {
waitForServiceToStop(ServiceType.HADOOP_DATANODE, serverName, timeout);
}
/**
 * Starts the name node service on the host and port carried by {@code serverName} by delegating
 * to the cluster manager.
 * @param serverName identifies the host/port of the name node to start
 * @throws IOException propagated from the cluster manager
 */
@Override
public void startNameNode(ServerName serverName) throws IOException {
  LOG.info("Starting name node on: " + serverName.getServerName());
  final String host = serverName.getHostname();
  final int port = serverName.getPort();
  clusterManager.start(ServiceType.HADOOP_NAMENODE, host, port);
}
/**
 * Kills the name node service on the host and port carried by {@code serverName} by delegating
 * to the cluster manager.
 * @param serverName identifies the host/port of the name node to kill
 * @throws IOException propagated from the cluster manager
 */
@Override
public void killNameNode(ServerName serverName) throws IOException {
  LOG.info("Aborting name node on: " + serverName.getServerName());
  final String host = serverName.getHostname();
  final int port = serverName.getPort();
  clusterManager.kill(ServiceType.HADOOP_NAMENODE, host, port);
}
/**
 * Stops the name node service on the host and port carried by {@code serverName} by delegating
 * to the cluster manager.
 * @param serverName identifies the host/port of the name node to stop
 * @throws IOException propagated from the cluster manager
 */
@Override
public void stopNameNode(ServerName serverName) throws IOException {
  LOG.info("Stopping name node on: " + serverName.getServerName());
  final String host = serverName.getHostname();
  final int port = serverName.getPort();
  clusterManager.stop(ServiceType.HADOOP_NAMENODE, host, port);
}
/**
 * Delegates to {@code waitForServiceToStart} for the {@code HADOOP_NAMENODE} service type.
 * @param serverName the server whose name node is awaited
 * @param timeout passed through unchanged to the generic service wait
 * @throws IOException propagated from the generic service wait
 */
@Override
public void waitForNameNodeToStart(ServerName serverName, long timeout) throws IOException {
  final ServiceType nameNodeService = ServiceType.HADOOP_NAMENODE;
  waitForServiceToStart(nameNodeService, serverName, timeout);
}
/**
 * Delegates to {@code waitForServiceToStop} for the {@code HADOOP_NAMENODE} service type.
 * @param serverName the server whose name node is awaited
 * @param timeout passed through unchanged to the generic service wait
 * @throws IOException propagated from the generic service wait
 */
@Override
public void waitForNameNodeToStop(ServerName serverName, long timeout) throws IOException {
  final ServiceType nameNodeService = ServiceType.HADOOP_NAMENODE;
  waitForServiceToStop(nameNodeService, serverName, timeout);
}
private void waitForServiceToStop(ServiceType service, ServerName serverName, long timeout)
throws IOException {
LOG.info("Waiting for service: " + service + " to stop: " + serverName.getServerName());

View File

@ -101,6 +101,7 @@ public class HBaseClusterManager extends Configured implements ClusterManager {
Configuration conf = getConf();
switch (service) {
case HADOOP_DATANODE:
case HADOOP_NAMENODE:
return conf.get("hbase.it.clustermanager.hadoop.hdfs.user", "hdfs");
case ZOOKEEPER_SERVER:
return conf.get("hbase.it.clustermanager.zookeeper.user", "zookeeper");
@ -282,6 +283,7 @@ public class HBaseClusterManager extends Configured implements ClusterManager {
protected CommandProvider getCommandProvider(ServiceType service) throws IOException {
switch (service) {
case HADOOP_DATANODE:
case HADOOP_NAMENODE:
return new HadoopShellCommandProvider(getConf());
case ZOOKEEPER_SERVER:
return new ZookeeperShellCommandProvider(getConf());

View File

@ -26,6 +26,7 @@ import java.util.List;
import java.util.Map;
import java.util.function.BiConsumer;
import java.util.function.Consumer;
import org.apache.commons.lang3.RandomUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.ClusterMetrics;
@ -65,6 +66,10 @@ public class Action {
"hbase.chaosmonkey.action.killdatanodetimeout";
public static final String START_DATANODE_TIMEOUT_KEY =
"hbase.chaosmonkey.action.startdatanodetimeout";
/**
 * Configuration key whose value is read from the cluster configuration into
 * {@code killNameNodeTimeout}, the timeout used when waiting for a killed namenode to stop.
 */
public static final String KILL_NAMENODE_TIMEOUT_KEY =
"hbase.chaosmonkey.action.killnamenodetimeout";
/**
 * Configuration key whose value is read from the cluster configuration into
 * {@code startNameNodeTimeout}, the timeout used when waiting for a namenode to start.
 */
public static final String START_NAMENODE_TIMEOUT_KEY =
"hbase.chaosmonkey.action.startnamenodetimeout";
protected static final Logger LOG = LoggerFactory.getLogger(Action.class);
@ -76,6 +81,8 @@ public class Action {
protected static final long START_ZK_NODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
protected static final long KILL_DATANODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
protected static final long START_DATANODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
// Fallback values used when the corresponding *_NAMENODE_TIMEOUT_KEY is not configured; both
// reuse the shared PolicyBasedChaosMonkey.TIMEOUT.
protected static final long KILL_NAMENODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
protected static final long START_NAMENODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
protected ActionContext context;
protected HBaseCluster cluster;
@ -90,6 +97,8 @@ public class Action {
protected long startZkNodeTimeout;
protected long killDataNodeTimeout;
protected long startDataNodeTimeout;
// Timeout handed to cluster.waitForNameNodeToStop() after a namenode has been killed.
protected long killNameNodeTimeout;
// Timeout handed to cluster.waitForNameNodeToStart() after a namenode start was requested.
protected long startNameNodeTimeout;
public void init(ActionContext context) throws IOException {
this.context = context;
@ -112,6 +121,11 @@ public class Action {
KILL_DATANODE_TIMEOUT_DEFAULT);
startDataNodeTimeout = cluster.getConf().getLong(START_DATANODE_TIMEOUT_KEY,
START_DATANODE_TIMEOUT_DEFAULT);
killNameNodeTimeout =
cluster.getConf().getLong(KILL_NAMENODE_TIMEOUT_KEY, KILL_NAMENODE_TIMEOUT_DEFAULT);
startNameNodeTimeout =
cluster.getConf().getLong(START_NAMENODE_TIMEOUT_KEY, START_NAMENODE_TIMEOUT_DEFAULT);
}
/**
 * Runs this action. The base implementation does nothing; subclasses such as
 * RestartActiveNameNodeAction override it with the actual behaviour.
 * @throws Exception if the action fails
 */
public void perform() throws Exception { }
@ -197,6 +211,20 @@ public class Action {
LOG.info("Started datanode " + server);
}
/**
 * Kills the namenode on the given server, waits (up to {@code killNameNodeTimeout}) for it to
 * stop, and then logs the number of live region servers reported by the cluster.
 * @param server the server hosting the namenode to kill
 * @throws IOException if the kill, the wait or the metrics lookup fails
 */
protected void killNameNode(ServerName server) throws IOException {
  final String host = server.getHostname();
  LOG.info("Killing namenode :-" + host);

  cluster.killNameNode(server);
  cluster.waitForNameNodeToStop(server, killNameNodeTimeout);

  // Metrics are fetched only after the wait has returned.
  final int liveRegionServers = cluster.getClusterMetrics().getLiveServerMetrics().size();
  LOG.info("Killed namenode:" + server + ". Reported num of rs:" + liveRegionServers);
}
/**
 * Starts the namenode on the given server and waits (up to {@code startNameNodeTimeout}) for it
 * to come up.
 * @param server the server hosting the namenode to start
 * @throws IOException if the start or the wait fails
 */
protected void startNameNode(ServerName server) throws IOException {
  final String host = server.getHostname();
  LOG.info("Starting Namenode :-" + host);

  cluster.startNameNode(server);
  cluster.waitForNameNodeToStart(server, startNameNodeTimeout);

  LOG.info("Started namenode:" + server);
}
protected void unbalanceRegions(ClusterMetrics clusterStatus,
List<ServerName> fromServers, List<ServerName> toServers,
double fractionOfRegions) throws Exception {

View File

@ -82,4 +82,16 @@ public class RestartActionBaseAction extends Action {
sleep(sleepTime);
startDataNode(server);
}
/**
 * Kills the namenode on {@code server}, pauses, and starts it again. Nothing happens when the
 * chaos monkey is already stopping.
 * @param server the server hosting the namenode to restart
 * @param sleepTime requested pause between kill and start; values below 1000 are raised to 1000
 * @throws IOException if killing or starting the namenode fails
 */
void restartNameNode(ServerName server, long sleepTime) throws IOException {
  // Don't try the kill if we're stopping
  if (context.isStopping()) {
    return;
  }

  // Never pause for less than 1000 between the kill and the start.
  final long pause = Math.max(sleepTime, 1000);

  killNameNode(server);
  sleep(pause);
  startNameNode(server);
}
}

View File

@ -0,0 +1,90 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.chaos.actions;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.hadoop.hbase.zookeeper.RecoverableZooKeeper;
import org.apache.hadoop.hbase.zookeeper.ZKUtil;
import org.apache.hadoop.hbase.zookeeper.ZKWatcher;
import org.apache.hadoop.hbase.zookeeper.ZNodePaths;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.HAUtil;
import org.apache.hadoop.hdfs.server.namenode.ha.proto.HAZKInfoProtos.ActiveNodeInfo;
/**
 * Action that tries to restart the active namenode.
 * <p>
 * The active namenode is looked up in ZooKeeper from the HDFS HA election lock znode
 * ({@code <parent-znode>/<nameservice-id>/ActiveStandbyElectorLock}), so the action fails when
 * namenode HA is not enabled or no lock znode can be found.
 */
public class RestartActiveNameNodeAction extends RestartActionBaseAction {

  // Value taken from org.apache.hadoop.ha.ActiveStandbyElector.java, variable :- LOCK_FILENAME
  private static final String ACTIVE_NN_LOCK_NAME = "ActiveStandbyElectorLock";

  // Value taken from org.apache.hadoop.ha.ZKFailoverController.java
  // variable :- ZK_PARENT_ZNODE_DEFAULT and ZK_PARENT_ZNODE_KEY
  private static final String ZK_PARENT_ZNODE_DEFAULT = "/hadoop-ha";
  private static final String ZK_PARENT_ZNODE_KEY = "ha.zookeeper.parent-znode";

  /**
   * @param sleepTime pause between killing and restarting the namenode, passed to the base class
   */
  public RestartActiveNameNodeAction(long sleepTime) {
    super(sleepTime);
  }

  /**
   * Finds the currently active namenode through ZooKeeper and restarts it.
   * @throws Exception if namenode HA is not enabled, if no active namenode is registered in
   *           ZooKeeper, or if the ZooKeeper lookup or the restart itself fails
   */
  @Override
  public void perform() throws Exception {
    LOG.info("Performing action: Restart active namenode");
    Configuration conf = FSUtils.getRootDir(getConf()).getFileSystem(getConf()).getConf();
    String nameServiceID = DFSUtil.getNamenodeNameServiceId(conf);
    if (!HAUtil.isHAEnabled(conf, nameServiceID)) {
      throw new Exception("HA for namenode is not enabled");
    }

    String hadoopHAZkNode = conf.get(ZK_PARENT_ZNODE_KEY, ZK_PARENT_ZNODE_DEFAULT);
    String activeNamenode = findActiveNameNodeHost(conf, nameServiceID, hadoopHAZkNode);
    if (activeNamenode == null) {
      throw new Exception("No active Name node found in zookeeper under " + hadoopHAZkNode);
    }

    LOG.info("Found active namenode host:" + activeNamenode);
    ServerName activeNNHost = ServerName.valueOf(activeNamenode, -1, -1);
    LOG.info("Restarting Active NameNode :" + activeNamenode);
    restartNameNode(activeNNHost, sleepTime);
  }

  /**
   * Reads the hostname of the active namenode from the HA election lock znode.
   * @param conf configuration used to connect to ZooKeeper
   * @param nameServiceID HDFS nameservice id; names the child znode under the HA parent znode
   * @param hadoopHAZkNode parent znode under which HDFS HA state is stored
   * @return the hostname stored in the lock znode, or {@code null} when the nameservice znode
   *         does not exist or has no lock child
   * @throws Exception if connecting to ZooKeeper, reading the znode or parsing its data fails
   */
  private static String findActiveNameNodeHost(Configuration conf, String nameServiceID,
      String hadoopHAZkNode) throws Exception {
    String hadoopHAZkNodePath = ZNodePaths.joinZNode(hadoopHAZkNode, nameServiceID);
    // try-with-resources closes the watcher on every exit path, including exceptions.
    try (ZKWatcher zkw = new ZKWatcher(conf, "get-active-namenode", null)) {
      List<String> subChildren = ZKUtil.listChildrenNoWatch(zkw, hadoopHAZkNodePath);
      if (subChildren == null) {
        // listChildrenNoWatch returns null when the znode is missing; report "not found"
        // to the caller instead of failing with a NullPointerException in the loop below.
        return null;
      }
      RecoverableZooKeeper rzk = zkw.getRecoverableZooKeeper();
      for (String eachEntry : subChildren) {
        if (eachEntry.contains(ACTIVE_NN_LOCK_NAME)) {
          byte[] data =
              rzk.getData(ZNodePaths.joinZNode(hadoopHAZkNodePath, ACTIVE_NN_LOCK_NAME), false,
                null);
          // The same fixed lock path is read for any match, so the first hit is sufficient.
          return ActiveNodeInfo.parseFrom(data).getHostname();
        }
      }
    }
    return null;
  }
}

View File

@ -19,6 +19,7 @@ package org.apache.hadoop.hbase;
import java.io.Closeable;
import java.io.IOException;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.util.Threads;
@ -247,6 +248,42 @@ public abstract class HBaseCluster implements Closeable, Configurable {
public abstract void waitForDataNodeToStop(ServerName serverName, long timeout)
throws IOException;
/**
 * Starts the namenode on the given server if this is a distributed cluster; on a mini/local
 * cluster the operation is not supported and only a warning is logged.
 * @param serverName the server hosting the namenode to start
 * @throws IOException if something goes wrong
 */
public abstract void startNameNode(ServerName serverName) throws IOException;
/**
 * Kills the namenode process if this is a distributed cluster; on a mini/local cluster the
 * operation is not supported and only a warning is logged.
 * @param serverName the server hosting the namenode to kill
 * @throws IOException if something goes wrong
 */
public abstract void killNameNode(ServerName serverName) throws IOException;
/**
 * Stops the namenode if this is a distributed cluster, otherwise silently logs warning message.
 * @param serverName the server hosting the namenode to stop
 * @throws IOException if something goes wrong
 */
public abstract void stopNameNode(ServerName serverName) throws IOException;
/**
 * Wait for the specified namenode to start. On a mini/local cluster the wait is not supported
 * and only a warning is logged.
 * @param serverName the server hosting the namenode to wait for
 * @param timeout how long to wait before giving up (NOTE(review): unit is not visible here,
 *          presumably milliseconds; confirm against the implementations)
 * @throws IOException if something goes wrong or timeout occurs
 */
public abstract void waitForNameNodeToStart(ServerName serverName, long timeout)
throws IOException;
/**
 * Wait for the specified namenode to stop. On a mini/local cluster the wait is not supported
 * and only a warning is logged.
 * @param serverName the server hosting the namenode to wait for
 * @param timeout how long to wait before giving up (NOTE(review): unit is not visible here,
 *          presumably milliseconds; confirm against the implementations)
 * @throws IOException if something goes wrong or timeout occurs
 */
public abstract void waitForNameNodeToStop(ServerName serverName, long timeout)
throws IOException;
/**
* Starts a new master on the given hostname or if this is a mini/local cluster,
* starts a master locally.

View File

@ -24,6 +24,7 @@ import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hbase.master.HMaster;
@ -348,6 +349,31 @@ public class MiniHBaseCluster extends HBaseCluster {
LOG.warn("Waiting for datanodes to stop on mini cluster is not supported");
}
/**
 * Unsupported on a mini cluster: logs a warning and does nothing else.
 */
@Override
public void startNameNode(ServerName serverName) throws IOException {
  final String unsupported = "Starting namenodes on mini cluster is not supported";
  LOG.warn(unsupported);
}
/**
 * Unsupported on a mini cluster: logs a warning and does nothing else.
 */
@Override
public void killNameNode(ServerName serverName) throws IOException {
  final String unsupported = "Aborting namenodes on mini cluster is not supported";
  LOG.warn(unsupported);
}
/**
 * Unsupported on a mini cluster: logs a warning and does nothing else.
 */
@Override
public void stopNameNode(ServerName serverName) throws IOException {
  final String unsupported = "Stopping namenodes on mini cluster is not supported";
  LOG.warn(unsupported);
}
/**
 * Unsupported on a mini cluster: logs a warning and returns immediately without waiting.
 */
@Override
public void waitForNameNodeToStart(ServerName serverName, long timeout) throws IOException {
  final String unsupported = "Waiting for namenodes to start on mini cluster is not supported";
  LOG.warn(unsupported);
}
/**
 * Unsupported on a mini cluster: logs a warning and returns immediately without waiting.
 */
@Override
public void waitForNameNodeToStop(ServerName serverName, long timeout) throws IOException {
  final String unsupported = "Waiting for namenodes to stop on mini cluster is not supported";
  LOG.warn(unsupported);
}
@Override
public void startMaster(String hostname, int port) throws IOException {
this.startMaster();