HBASE-19036 Add action in Chaos Monkey to restart Active Namenode
Signed-off-by: tedyu <yuzhihong@gmail.com>
This commit is contained in:
parent
78164efcf4
commit
b3e41c9525
|
@ -25,6 +25,7 @@ import java.util.List;
|
|||
import java.util.Objects;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.hbase.ClusterManager.ServiceType;
|
||||
import org.apache.hadoop.hbase.client.Admin;
|
||||
|
@ -35,7 +36,6 @@ import org.apache.hadoop.hbase.client.RegionLocator;
|
|||
import org.apache.hadoop.hbase.util.Bytes;
|
||||
import org.apache.hadoop.hbase.util.Threads;
|
||||
import org.apache.yetus.audience.InterfaceAudience;
|
||||
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.ServerInfo;
|
||||
|
@ -204,6 +204,37 @@ public class DistributedHBaseCluster extends HBaseCluster {
|
|||
waitForServiceToStop(ServiceType.HADOOP_DATANODE, serverName, timeout);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void startNameNode(ServerName serverName) throws IOException {
|
||||
LOG.info("Starting name node on: " + serverName.getServerName());
|
||||
clusterManager.start(ServiceType.HADOOP_NAMENODE, serverName.getHostname(),
|
||||
serverName.getPort());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void killNameNode(ServerName serverName) throws IOException {
|
||||
LOG.info("Aborting name node on: " + serverName.getServerName());
|
||||
clusterManager.kill(ServiceType.HADOOP_NAMENODE, serverName.getHostname(),
|
||||
serverName.getPort());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void stopNameNode(ServerName serverName) throws IOException {
|
||||
LOG.info("Stopping name node on: " + serverName.getServerName());
|
||||
clusterManager.stop(ServiceType.HADOOP_NAMENODE, serverName.getHostname(),
|
||||
serverName.getPort());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void waitForNameNodeToStart(ServerName serverName, long timeout) throws IOException {
|
||||
waitForServiceToStart(ServiceType.HADOOP_NAMENODE, serverName, timeout);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void waitForNameNodeToStop(ServerName serverName, long timeout) throws IOException {
|
||||
waitForServiceToStop(ServiceType.HADOOP_NAMENODE, serverName, timeout);
|
||||
}
|
||||
|
||||
private void waitForServiceToStop(ServiceType service, ServerName serverName, long timeout)
|
||||
throws IOException {
|
||||
LOG.info("Waiting for service: " + service + " to stop: " + serverName.getServerName());
|
||||
|
|
|
@ -101,6 +101,7 @@ public class HBaseClusterManager extends Configured implements ClusterManager {
|
|||
Configuration conf = getConf();
|
||||
switch (service) {
|
||||
case HADOOP_DATANODE:
|
||||
case HADOOP_NAMENODE:
|
||||
return conf.get("hbase.it.clustermanager.hadoop.hdfs.user", "hdfs");
|
||||
case ZOOKEEPER_SERVER:
|
||||
return conf.get("hbase.it.clustermanager.zookeeper.user", "zookeeper");
|
||||
|
@ -282,6 +283,7 @@ public class HBaseClusterManager extends Configured implements ClusterManager {
|
|||
protected CommandProvider getCommandProvider(ServiceType service) throws IOException {
|
||||
switch (service) {
|
||||
case HADOOP_DATANODE:
|
||||
case HADOOP_NAMENODE:
|
||||
return new HadoopShellCommandProvider(getConf());
|
||||
case ZOOKEEPER_SERVER:
|
||||
return new ZookeeperShellCommandProvider(getConf());
|
||||
|
|
|
@ -26,6 +26,7 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
import java.util.function.BiConsumer;
|
||||
import java.util.function.Consumer;
|
||||
|
||||
import org.apache.commons.lang3.RandomUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.hbase.ClusterMetrics;
|
||||
|
@ -65,6 +66,10 @@ public class Action {
|
|||
"hbase.chaosmonkey.action.killdatanodetimeout";
|
||||
public static final String START_DATANODE_TIMEOUT_KEY =
|
||||
"hbase.chaosmonkey.action.startdatanodetimeout";
|
||||
public static final String KILL_NAMENODE_TIMEOUT_KEY =
|
||||
"hbase.chaosmonkey.action.killnamenodetimeout";
|
||||
public static final String START_NAMENODE_TIMEOUT_KEY =
|
||||
"hbase.chaosmonkey.action.startnamenodetimeout";
|
||||
|
||||
protected static final Logger LOG = LoggerFactory.getLogger(Action.class);
|
||||
|
||||
|
@ -76,6 +81,8 @@ public class Action {
|
|||
protected static final long START_ZK_NODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
|
||||
protected static final long KILL_DATANODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
|
||||
protected static final long START_DATANODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
|
||||
protected static final long KILL_NAMENODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
|
||||
protected static final long START_NAMENODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
|
||||
|
||||
protected ActionContext context;
|
||||
protected HBaseCluster cluster;
|
||||
|
@ -90,6 +97,8 @@ public class Action {
|
|||
protected long startZkNodeTimeout;
|
||||
protected long killDataNodeTimeout;
|
||||
protected long startDataNodeTimeout;
|
||||
protected long killNameNodeTimeout;
|
||||
protected long startNameNodeTimeout;
|
||||
|
||||
public void init(ActionContext context) throws IOException {
|
||||
this.context = context;
|
||||
|
@ -112,6 +121,11 @@ public class Action {
|
|||
KILL_DATANODE_TIMEOUT_DEFAULT);
|
||||
startDataNodeTimeout = cluster.getConf().getLong(START_DATANODE_TIMEOUT_KEY,
|
||||
START_DATANODE_TIMEOUT_DEFAULT);
|
||||
killNameNodeTimeout =
|
||||
cluster.getConf().getLong(KILL_NAMENODE_TIMEOUT_KEY, KILL_NAMENODE_TIMEOUT_DEFAULT);
|
||||
startNameNodeTimeout =
|
||||
cluster.getConf().getLong(START_NAMENODE_TIMEOUT_KEY, START_NAMENODE_TIMEOUT_DEFAULT);
|
||||
|
||||
}
|
||||
|
||||
public void perform() throws Exception { }
|
||||
|
@ -197,6 +211,20 @@ public class Action {
|
|||
LOG.info("Started datanode " + server);
|
||||
}
|
||||
|
||||
protected void killNameNode(ServerName server) throws IOException {
|
||||
LOG.info("Killing namenode :-" + server.getHostname());
|
||||
cluster.killNameNode(server);
|
||||
cluster.waitForNameNodeToStop(server, killNameNodeTimeout);
|
||||
LOG.info("Killed namenode:" + server + ". Reported num of rs:"
|
||||
+ cluster.getClusterMetrics().getLiveServerMetrics().size());
|
||||
}
|
||||
|
||||
protected void startNameNode(ServerName server) throws IOException {
|
||||
LOG.info("Starting Namenode :-" + server.getHostname());
|
||||
cluster.startNameNode(server);
|
||||
cluster.waitForNameNodeToStart(server, startNameNodeTimeout);
|
||||
LOG.info("Started namenode:" + server);
|
||||
}
|
||||
protected void unbalanceRegions(ClusterMetrics clusterStatus,
|
||||
List<ServerName> fromServers, List<ServerName> toServers,
|
||||
double fractionOfRegions) throws Exception {
|
||||
|
|
|
@ -82,4 +82,16 @@ public class RestartActionBaseAction extends Action {
|
|||
sleep(sleepTime);
|
||||
startDataNode(server);
|
||||
}
|
||||
|
||||
void restartNameNode(ServerName server, long sleepTime) throws IOException {
|
||||
sleepTime = Math.max(sleepTime, 1000);
|
||||
// Don't try the kill if we're stopping
|
||||
if (context.isStopping()) {
|
||||
return;
|
||||
}
|
||||
killNameNode(server);
|
||||
sleep(sleepTime);
|
||||
startNameNode(server);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,90 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.hbase.chaos.actions;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.util.FSUtils;
|
||||
import org.apache.hadoop.hbase.zookeeper.RecoverableZooKeeper;
|
||||
import org.apache.hadoop.hbase.zookeeper.ZKUtil;
|
||||
import org.apache.hadoop.hbase.zookeeper.ZKWatcher;
|
||||
import org.apache.hadoop.hbase.zookeeper.ZNodePaths;
|
||||
import org.apache.hadoop.hdfs.DFSUtil;
|
||||
import org.apache.hadoop.hdfs.HAUtil;
|
||||
import org.apache.hadoop.hdfs.server.namenode.ha.proto.HAZKInfoProtos.ActiveNodeInfo;
|
||||
|
||||
/**
|
||||
* Action that tries to restart the active namenode.
|
||||
*/
|
||||
public class RestartActiveNameNodeAction extends RestartActionBaseAction {
|
||||
|
||||
// Value taken from org.apache.hadoop.ha.ActiveStandbyElector.java, variable :- LOCK_FILENAME
|
||||
private static final String ACTIVE_NN_LOCK_NAME = "ActiveStandbyElectorLock";
|
||||
|
||||
// Value taken from org.apache.hadoop.ha.ZKFailoverController.java
|
||||
// variable :- ZK_PARENT_ZNODE_DEFAULT and ZK_PARENT_ZNODE_KEY
|
||||
private static final String ZK_PARENT_ZNODE_DEFAULT = "/hadoop-ha";
|
||||
private static final String ZK_PARENT_ZNODE_KEY = "ha.zookeeper.parent-znode";
|
||||
|
||||
public RestartActiveNameNodeAction(long sleepTime) {
|
||||
super(sleepTime);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void perform() throws Exception {
|
||||
LOG.info("Performing action: Restart active namenode");
|
||||
Configuration conf = FSUtils.getRootDir(getConf()).getFileSystem(getConf()).getConf();
|
||||
String nameServiceID = DFSUtil.getNamenodeNameServiceId(conf);
|
||||
if (!HAUtil.isHAEnabled(conf, nameServiceID)) {
|
||||
throw new Exception("HA for namenode is not enabled");
|
||||
}
|
||||
ZKWatcher zkw = null;
|
||||
RecoverableZooKeeper rzk = null;
|
||||
String activeNamenode = null;
|
||||
String hadoopHAZkNode = conf.get(ZK_PARENT_ZNODE_KEY, ZK_PARENT_ZNODE_DEFAULT);
|
||||
try {
|
||||
zkw = new ZKWatcher(conf, "get-active-namenode", null);
|
||||
rzk = zkw.getRecoverableZooKeeper();
|
||||
String hadoopHAZkNodePath = ZNodePaths.joinZNode(hadoopHAZkNode, nameServiceID);
|
||||
List<String> subChildern = ZKUtil.listChildrenNoWatch(zkw, hadoopHAZkNodePath);
|
||||
for (String eachEntry : subChildern) {
|
||||
if (eachEntry.contains(ACTIVE_NN_LOCK_NAME)) {
|
||||
byte[] data =
|
||||
rzk.getData(ZNodePaths.joinZNode(hadoopHAZkNodePath, ACTIVE_NN_LOCK_NAME), false,
|
||||
null);
|
||||
ActiveNodeInfo proto = ActiveNodeInfo.parseFrom(data);
|
||||
activeNamenode = proto.getHostname();
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
if (zkw != null) {
|
||||
zkw.close();
|
||||
}
|
||||
}
|
||||
if (activeNamenode == null) {
|
||||
throw new Exception("No active Name node found in zookeeper under " + hadoopHAZkNode);
|
||||
}
|
||||
LOG.info("Found active namenode host:" + activeNamenode);
|
||||
ServerName activeNNHost = ServerName.valueOf(activeNamenode, -1, -1);
|
||||
LOG.info("Restarting Active NameNode :" + activeNamenode);
|
||||
restartNameNode(activeNNHost, sleepTime);
|
||||
}
|
||||
}
|
|
@ -19,6 +19,7 @@ package org.apache.hadoop.hbase;
|
|||
|
||||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.hadoop.conf.Configurable;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.hbase.util.Threads;
|
||||
|
@ -247,6 +248,42 @@ public abstract class HBaseCluster implements Closeable, Configurable {
|
|||
public abstract void waitForDataNodeToStop(ServerName serverName, long timeout)
|
||||
throws IOException;
|
||||
|
||||
/**
|
||||
* Starts a new namenode on the given hostname or if this is a mini/local cluster, silently logs
|
||||
* warning message.
|
||||
* @throws IOException if something goes wrong
|
||||
*/
|
||||
public abstract void startNameNode(ServerName serverName) throws IOException;
|
||||
|
||||
/**
|
||||
* Kills the namenode process if this is a distributed cluster, otherwise silently logs a
|
||||
* warning message.
|
||||
* @throws IOException if something goes wrong
|
||||
*/
|
||||
public abstract void killNameNode(ServerName serverName) throws IOException;
|
||||
|
||||
/**
|
||||
* Stops the namenode if this is a distributed cluster, otherwise silently logs warning message.
|
||||
* @throws IOException if something goes wrong
|
||||
*/
|
||||
public abstract void stopNameNode(ServerName serverName) throws IOException;
|
||||
|
||||
/**
|
||||
* Wait for the specified namenode to join the cluster
|
||||
* @param timeout how long to wait for the namenode to start before failing
|
||||
* @throws IOException if something goes wrong or timeout occurs
|
||||
*/
|
||||
public abstract void waitForNameNodeToStart(ServerName serverName, long timeout)
|
||||
throws IOException;
|
||||
|
||||
/**
|
||||
* Wait for the specified namenode to stop
|
||||
* @param timeout how long to wait for the namenode to stop before failing
|
||||
* @throws IOException if something goes wrong or timeout occurs
|
||||
*/
|
||||
public abstract void waitForNameNodeToStop(ServerName serverName, long timeout)
|
||||
throws IOException;
|
||||
|
||||
/**
|
||||
* Starts a new master on the given hostname or if this is a mini/local cluster,
|
||||
* starts a master locally.
|
||||
|
|
|
@ -24,6 +24,7 @@ import java.util.ArrayList;
|
|||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.hbase.master.HMaster;
|
||||
|
@ -348,6 +349,31 @@ public class MiniHBaseCluster extends HBaseCluster {
|
|||
LOG.warn("Waiting for datanodes to stop on mini cluster is not supported");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void startNameNode(ServerName serverName) throws IOException {
|
||||
LOG.warn("Starting namenodes on mini cluster is not supported");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void killNameNode(ServerName serverName) throws IOException {
|
||||
LOG.warn("Aborting namenodes on mini cluster is not supported");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void stopNameNode(ServerName serverName) throws IOException {
|
||||
LOG.warn("Stopping namenodes on mini cluster is not supported");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void waitForNameNodeToStart(ServerName serverName, long timeout) throws IOException {
|
||||
LOG.warn("Waiting for namenodes to start on mini cluster is not supported");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void waitForNameNodeToStop(ServerName serverName, long timeout) throws IOException {
|
||||
LOG.warn("Waiting for namenodes to stop on mini cluster is not supported");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void startMaster(String hostname, int port) throws IOException {
|
||||
this.startMaster();
|
||||
|
|
Loading…
Reference in New Issue