HBASE-19036 Add action in Chaos Monkey to restart Active Namenode

Signed-off-by: tedyu <yuzhihong@gmail.com>
This commit is contained in:
Monani Mihir 2018-07-31 18:44:45 +05:30 committed by tedyu
parent 78164efcf4
commit b3e41c9525
7 changed files with 227 additions and 1 deletions

View File

@ -25,6 +25,7 @@ import java.util.List;
import java.util.Objects; import java.util.Objects;
import java.util.Set; import java.util.Set;
import java.util.TreeSet; import java.util.TreeSet;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.ClusterManager.ServiceType; import org.apache.hadoop.hbase.ClusterManager.ServiceType;
import org.apache.hadoop.hbase.client.Admin; import org.apache.hadoop.hbase.client.Admin;
@ -35,7 +36,6 @@ import org.apache.hadoop.hbase.client.RegionLocator;
import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Threads; import org.apache.hadoop.hbase.util.Threads;
import org.apache.yetus.audience.InterfaceAudience; import org.apache.yetus.audience.InterfaceAudience;
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos; import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos;
import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.ServerInfo; import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.ServerInfo;
@ -204,6 +204,37 @@ public class DistributedHBaseCluster extends HBaseCluster {
waitForServiceToStop(ServiceType.HADOOP_DATANODE, serverName, timeout); waitForServiceToStop(ServiceType.HADOOP_DATANODE, serverName, timeout);
} }
/**
 * Asks the cluster manager to start the HADOOP_NAMENODE service on the host and port carried
 * by the given server name.
 * @param serverName identifies the namenode host/port to start
 * @throws IOException propagated from the cluster manager
 */
@Override
public void startNameNode(ServerName serverName) throws IOException {
  LOG.info("Starting name node on: " + serverName.getServerName());
  final String host = serverName.getHostname();
  final int port = serverName.getPort();
  clusterManager.start(ServiceType.HADOOP_NAMENODE, host, port);
}
/**
 * Asks the cluster manager to kill the HADOOP_NAMENODE service on the host and port carried
 * by the given server name.
 * @param serverName identifies the namenode host/port to kill
 * @throws IOException propagated from the cluster manager
 */
@Override
public void killNameNode(ServerName serverName) throws IOException {
  LOG.info("Aborting name node on: " + serverName.getServerName());
  final String host = serverName.getHostname();
  final int port = serverName.getPort();
  clusterManager.kill(ServiceType.HADOOP_NAMENODE, host, port);
}
/**
 * Asks the cluster manager to stop the HADOOP_NAMENODE service on the host and port carried
 * by the given server name.
 * @param serverName identifies the namenode host/port to stop
 * @throws IOException propagated from the cluster manager
 */
@Override
public void stopNameNode(ServerName serverName) throws IOException {
  LOG.info("Stopping name node on: " + serverName.getServerName());
  final String host = serverName.getHostname();
  final int port = serverName.getPort();
  clusterManager.stop(ServiceType.HADOOP_NAMENODE, host, port);
}
/**
 * Delegates to the generic service-start wait, fixed to the HADOOP_NAMENODE service type.
 * @param serverName the namenode to wait for
 * @param timeout passed through unchanged to the generic wait helper
 * @throws IOException propagated from the generic wait helper
 */
@Override
public void waitForNameNodeToStart(ServerName serverName, long timeout) throws IOException {
  final ServiceType nameNodeService = ServiceType.HADOOP_NAMENODE;
  waitForServiceToStart(nameNodeService, serverName, timeout);
}
/**
 * Delegates to the generic service-stop wait, fixed to the HADOOP_NAMENODE service type.
 * @param serverName the namenode to wait for
 * @param timeout passed through unchanged to the generic wait helper
 * @throws IOException propagated from the generic wait helper
 */
@Override
public void waitForNameNodeToStop(ServerName serverName, long timeout) throws IOException {
  final ServiceType nameNodeService = ServiceType.HADOOP_NAMENODE;
  waitForServiceToStop(nameNodeService, serverName, timeout);
}
private void waitForServiceToStop(ServiceType service, ServerName serverName, long timeout) private void waitForServiceToStop(ServiceType service, ServerName serverName, long timeout)
throws IOException { throws IOException {
LOG.info("Waiting for service: " + service + " to stop: " + serverName.getServerName()); LOG.info("Waiting for service: " + service + " to stop: " + serverName.getServerName());

View File

@ -101,6 +101,7 @@ public class HBaseClusterManager extends Configured implements ClusterManager {
Configuration conf = getConf(); Configuration conf = getConf();
switch (service) { switch (service) {
case HADOOP_DATANODE: case HADOOP_DATANODE:
case HADOOP_NAMENODE:
return conf.get("hbase.it.clustermanager.hadoop.hdfs.user", "hdfs"); return conf.get("hbase.it.clustermanager.hadoop.hdfs.user", "hdfs");
case ZOOKEEPER_SERVER: case ZOOKEEPER_SERVER:
return conf.get("hbase.it.clustermanager.zookeeper.user", "zookeeper"); return conf.get("hbase.it.clustermanager.zookeeper.user", "zookeeper");
@ -282,6 +283,7 @@ public class HBaseClusterManager extends Configured implements ClusterManager {
protected CommandProvider getCommandProvider(ServiceType service) throws IOException { protected CommandProvider getCommandProvider(ServiceType service) throws IOException {
switch (service) { switch (service) {
case HADOOP_DATANODE: case HADOOP_DATANODE:
case HADOOP_NAMENODE:
return new HadoopShellCommandProvider(getConf()); return new HadoopShellCommandProvider(getConf());
case ZOOKEEPER_SERVER: case ZOOKEEPER_SERVER:
return new ZookeeperShellCommandProvider(getConf()); return new ZookeeperShellCommandProvider(getConf());

View File

@ -26,6 +26,7 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.function.BiConsumer; import java.util.function.BiConsumer;
import java.util.function.Consumer; import java.util.function.Consumer;
import org.apache.commons.lang3.RandomUtils; import org.apache.commons.lang3.RandomUtils;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.ClusterMetrics; import org.apache.hadoop.hbase.ClusterMetrics;
@ -65,6 +66,10 @@ public class Action {
"hbase.chaosmonkey.action.killdatanodetimeout"; "hbase.chaosmonkey.action.killdatanodetimeout";
public static final String START_DATANODE_TIMEOUT_KEY = public static final String START_DATANODE_TIMEOUT_KEY =
"hbase.chaosmonkey.action.startdatanodetimeout"; "hbase.chaosmonkey.action.startdatanodetimeout";
public static final String KILL_NAMENODE_TIMEOUT_KEY =
"hbase.chaosmonkey.action.killnamenodetimeout";
public static final String START_NAMENODE_TIMEOUT_KEY =
"hbase.chaosmonkey.action.startnamenodetimeout";
protected static final Logger LOG = LoggerFactory.getLogger(Action.class); protected static final Logger LOG = LoggerFactory.getLogger(Action.class);
@ -76,6 +81,8 @@ public class Action {
protected static final long START_ZK_NODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT; protected static final long START_ZK_NODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
protected static final long KILL_DATANODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT; protected static final long KILL_DATANODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
protected static final long START_DATANODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT; protected static final long START_DATANODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
protected static final long KILL_NAMENODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
protected static final long START_NAMENODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
protected ActionContext context; protected ActionContext context;
protected HBaseCluster cluster; protected HBaseCluster cluster;
@ -90,6 +97,8 @@ public class Action {
protected long startZkNodeTimeout; protected long startZkNodeTimeout;
protected long killDataNodeTimeout; protected long killDataNodeTimeout;
protected long startDataNodeTimeout; protected long startDataNodeTimeout;
protected long killNameNodeTimeout;
protected long startNameNodeTimeout;
public void init(ActionContext context) throws IOException { public void init(ActionContext context) throws IOException {
this.context = context; this.context = context;
@ -112,6 +121,11 @@ public class Action {
KILL_DATANODE_TIMEOUT_DEFAULT); KILL_DATANODE_TIMEOUT_DEFAULT);
startDataNodeTimeout = cluster.getConf().getLong(START_DATANODE_TIMEOUT_KEY, startDataNodeTimeout = cluster.getConf().getLong(START_DATANODE_TIMEOUT_KEY,
START_DATANODE_TIMEOUT_DEFAULT); START_DATANODE_TIMEOUT_DEFAULT);
killNameNodeTimeout =
cluster.getConf().getLong(KILL_NAMENODE_TIMEOUT_KEY, KILL_NAMENODE_TIMEOUT_DEFAULT);
startNameNodeTimeout =
cluster.getConf().getLong(START_NAMENODE_TIMEOUT_KEY, START_NAMENODE_TIMEOUT_DEFAULT);
} }
public void perform() throws Exception { } public void perform() throws Exception { }
@ -197,6 +211,20 @@ public class Action {
LOG.info("Started datanode " + server); LOG.info("Started datanode " + server);
} }
/**
 * Kills the given namenode through the cluster, waits for it to stop using
 * {@code killNameNodeTimeout}, and then logs how many live servers the cluster reports.
 * @param server the namenode to kill
 * @throws IOException if the kill, the wait, or the cluster metrics lookup fails
 */
protected void killNameNode(ServerName server) throws IOException {
  final String host = server.getHostname();
  LOG.info("Killing namenode :-" + host);

  cluster.killNameNode(server);
  cluster.waitForNameNodeToStop(server, killNameNodeTimeout);

  final String summary = "Killed namenode:" + server + ". Reported num of rs:"
    + cluster.getClusterMetrics().getLiveServerMetrics().size();
  LOG.info(summary);
}
/**
 * Starts the given namenode through the cluster and waits for it to come up using
 * {@code startNameNodeTimeout}.
 * @param server the namenode to start
 * @throws IOException if the start or the wait fails
 */
protected void startNameNode(ServerName server) throws IOException {
  final String host = server.getHostname();
  LOG.info("Starting Namenode :-" + host);

  cluster.startNameNode(server);
  cluster.waitForNameNodeToStart(server, startNameNodeTimeout);

  final String done = "Started namenode:" + server;
  LOG.info(done);
}
protected void unbalanceRegions(ClusterMetrics clusterStatus, protected void unbalanceRegions(ClusterMetrics clusterStatus,
List<ServerName> fromServers, List<ServerName> toServers, List<ServerName> fromServers, List<ServerName> toServers,
double fractionOfRegions) throws Exception { double fractionOfRegions) throws Exception {

View File

@ -82,4 +82,16 @@ public class RestartActionBaseAction extends Action {
sleep(sleepTime); sleep(sleepTime);
startDataNode(server); startDataNode(server);
} }
/**
 * Kills the given namenode, sleeps, and starts it again. Does nothing when the action context
 * reports that it is stopping.
 * @param server the namenode to restart
 * @param sleepTime pause between kill and start in milliseconds; values below 1000 are raised
 *          to 1000
 * @throws IOException if killing or starting the namenode fails
 */
void restartNameNode(ServerName server, long sleepTime) throws IOException {
  // Enforce a floor of one second between the kill and the start.
  final long pauseMillis = Math.max(sleepTime, 1000L);

  // Don't try the kill if we're stopping
  if (!context.isStopping()) {
    killNameNode(server);
    sleep(pauseMillis);
    startNameNode(server);
  }
}
} }

View File

@ -0,0 +1,90 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.chaos.actions;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.hadoop.hbase.zookeeper.RecoverableZooKeeper;
import org.apache.hadoop.hbase.zookeeper.ZKUtil;
import org.apache.hadoop.hbase.zookeeper.ZKWatcher;
import org.apache.hadoop.hbase.zookeeper.ZNodePaths;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.HAUtil;
import org.apache.hadoop.hdfs.server.namenode.ha.proto.HAZKInfoProtos.ActiveNodeInfo;
/**
 * Action that tries to restart the active namenode.
 * <p>
 * The active namenode host is read from the {@code ActiveStandbyElectorLock} znode found under
 * {@code <ha parent znode>/<nameservice id>} in ZooKeeper. The action fails with an exception
 * when namenode HA is not enabled or when no active namenode can be found.
 */
public class RestartActiveNameNodeAction extends RestartActionBaseAction {

  // Value taken from org.apache.hadoop.ha.ActiveStandbyElector.java, variable :- LOCK_FILENAME
  private static final String ACTIVE_NN_LOCK_NAME = "ActiveStandbyElectorLock";

  // Value taken from org.apache.hadoop.ha.ZKFailoverController.java
  // variable :- ZK_PARENT_ZNODE_DEFAULT and ZK_PARENT_ZNODE_KEY
  private static final String ZK_PARENT_ZNODE_DEFAULT = "/hadoop-ha";
  private static final String ZK_PARENT_ZNODE_KEY = "ha.zookeeper.parent-znode";

  /**
   * @param sleepTime pause handed to the base action; it is later used as the delay between
   *          killing and starting the namenode
   */
  public RestartActiveNameNodeAction(long sleepTime) {
    super(sleepTime);
  }

  /**
   * Looks up the active namenode in ZooKeeper and restarts it.
   * @throws Exception if namenode HA is not enabled, if no active namenode is registered in
   *           ZooKeeper, or if the lookup or the restart itself fails
   */
  @Override
  public void perform() throws Exception {
    LOG.info("Performing action: Restart active namenode");
    Configuration conf = FSUtils.getRootDir(getConf()).getFileSystem(getConf()).getConf();
    String nameServiceID = DFSUtil.getNamenodeNameServiceId(conf);
    if (!HAUtil.isHAEnabled(conf, nameServiceID)) {
      throw new Exception("HA for namenode is not enabled");
    }

    String hadoopHAZkNode = conf.get(ZK_PARENT_ZNODE_KEY, ZK_PARENT_ZNODE_DEFAULT);
    String activeNamenode = findActiveNameNodeHost(conf, hadoopHAZkNode, nameServiceID);
    if (activeNamenode == null) {
      throw new Exception("No active Name node found in zookeeper under " + hadoopHAZkNode);
    }

    LOG.info("Found active namenode host:" + activeNamenode);
    // Only the hostname is known from the lock znode, so port and start code are placeholders.
    ServerName activeNNHost = ServerName.valueOf(activeNamenode, -1, -1);
    LOG.info("Restarting Active NameNode :" + activeNamenode);
    restartNameNode(activeNNHost, sleepTime);
  }

  /**
   * Reads the hostname of the active namenode from the elector lock znode.
   * @param conf configuration used to connect to ZooKeeper
   * @param hadoopHAZkNode parent znode of the Hadoop HA state
   * @param nameServiceID nameservice whose active namenode is wanted
   * @return the active namenode hostname, or {@code null} if the HA znode does not exist or has
   *         no elector lock child
   * @throws Exception if talking to ZooKeeper or parsing the lock data fails
   */
  private static String findActiveNameNodeHost(Configuration conf, String hadoopHAZkNode,
      String nameServiceID) throws Exception {
    // try-with-resources closes the watcher on every exit path, including exceptions.
    try (ZKWatcher zkw = new ZKWatcher(conf, "get-active-namenode", null)) {
      RecoverableZooKeeper rzk = zkw.getRecoverableZooKeeper();
      String hadoopHAZkNodePath = ZNodePaths.joinZNode(hadoopHAZkNode, nameServiceID);
      List<String> subChildren = ZKUtil.listChildrenNoWatch(zkw, hadoopHAZkNodePath);
      if (subChildren == null) {
        // The HA znode is missing; let the caller report "no active namenode" instead of an NPE.
        return null;
      }
      for (String eachEntry : subChildren) {
        if (eachEntry.contains(ACTIVE_NN_LOCK_NAME)) {
          byte[] data = rzk.getData(
            ZNodePaths.joinZNode(hadoopHAZkNodePath, ACTIVE_NN_LOCK_NAME), false, null);
          // The same lock znode is read no matter which child matched, so stop at the first.
          return ActiveNodeInfo.parseFrom(data).getHostname();
        }
      }
      return null;
    }
  }
}

View File

@ -19,6 +19,7 @@ package org.apache.hadoop.hbase;
import java.io.Closeable; import java.io.Closeable;
import java.io.IOException; import java.io.IOException;
import org.apache.hadoop.conf.Configurable; import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.util.Threads; import org.apache.hadoop.hbase.util.Threads;
@ -247,6 +248,42 @@ public abstract class HBaseCluster implements Closeable, Configurable {
public abstract void waitForDataNodeToStop(ServerName serverName, long timeout) public abstract void waitForDataNodeToStop(ServerName serverName, long timeout)
throws IOException; throws IOException;
/**
 * Starts a new namenode on the given hostname or if this is a mini/local cluster, silently logs
 * warning message.
 * @param serverName identifies the namenode to start
 * @throws IOException if something goes wrong
 */
public abstract void startNameNode(ServerName serverName) throws IOException;
/**
 * Kills the namenode process if this is a distributed cluster, otherwise silently logs a
 * warning message (namenodes are not controlled on a mini cluster).
 * @param serverName identifies the namenode to kill
 * @throws IOException if something goes wrong
 */
public abstract void killNameNode(ServerName serverName) throws IOException;
/**
 * Stops the namenode if this is a distributed cluster, otherwise silently logs warning message.
 * @param serverName identifies the namenode to stop
 * @throws IOException if something goes wrong
 */
public abstract void stopNameNode(ServerName serverName) throws IOException;
/**
 * Wait for the specified namenode to join the cluster. Nothing is returned; failure is
 * reported through the exception.
 * @param serverName identifies the namenode to wait for
 * @param timeout maximum time to wait (unit not stated here; presumably milliseconds, confirm
 *          against implementations)
 * @throws IOException if something goes wrong or timeout occurs
 */
public abstract void waitForNameNodeToStart(ServerName serverName, long timeout)
throws IOException;
/**
 * Wait for the specified namenode to stop. Nothing is returned; failure is reported through
 * the exception.
 * @param serverName identifies the namenode to wait for
 * @param timeout maximum time to wait (unit not stated here; presumably milliseconds, confirm
 *          against implementations)
 * @throws IOException if something goes wrong or timeout occurs
 */
public abstract void waitForNameNodeToStop(ServerName serverName, long timeout)
throws IOException;
/** /**
* Starts a new master on the given hostname or if this is a mini/local cluster, * Starts a new master on the given hostname or if this is a mini/local cluster,
* starts a master locally. * starts a master locally.

View File

@ -24,6 +24,7 @@ import java.util.ArrayList;
import java.util.HashSet; import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Set; import java.util.Set;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hbase.master.HMaster; import org.apache.hadoop.hbase.master.HMaster;
@ -348,6 +349,31 @@ public class MiniHBaseCluster extends HBaseCluster {
LOG.warn("Waiting for datanodes to stop on mini cluster is not supported"); LOG.warn("Waiting for datanodes to stop on mini cluster is not supported");
} }
/**
 * Only logs a warning; the given server name is not used.
 */
@Override
public void startNameNode(ServerName serverName) throws IOException {
  final String notSupported = "Starting namenodes on mini cluster is not supported";
  LOG.warn(notSupported);
}
/**
 * Only logs a warning; the given server name is not used.
 */
@Override
public void killNameNode(ServerName serverName) throws IOException {
  final String notSupported = "Aborting namenodes on mini cluster is not supported";
  LOG.warn(notSupported);
}
/**
 * Only logs a warning; the given server name is not used.
 */
@Override
public void stopNameNode(ServerName serverName) throws IOException {
  final String notSupported = "Stopping namenodes on mini cluster is not supported";
  LOG.warn(notSupported);
}
/**
 * Only logs a warning and returns immediately; neither argument is used.
 */
@Override
public void waitForNameNodeToStart(ServerName serverName, long timeout) throws IOException {
  final String notSupported = "Waiting for namenodes to start on mini cluster is not supported";
  LOG.warn(notSupported);
}
/**
 * Only logs a warning and returns immediately; neither argument is used.
 */
@Override
public void waitForNameNodeToStop(ServerName serverName, long timeout) throws IOException {
  final String notSupported = "Waiting for namenodes to stop on mini cluster is not supported";
  LOG.warn(notSupported);
}
@Override @Override
public void startMaster(String hostname, int port) throws IOException { public void startMaster(String hostname, int port) throws IOException {
this.startMaster(); this.startMaster();