HBASE-27567 Introduce ChaosMonkey Action to print HDFS Cluster status
Signed-off-by: Reid Chan <reidchan@apache.org>
Signed-off-by: Duo Zhang <zhangduo@apache.org>
parent ae1ec90680
commit 6244891df9
DumpHdfsClusterStatusAction.java (new file)
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.chaos.actions;
+
+import java.net.InetSocketAddress;
+import java.net.URI;
+import java.util.List;
+import org.apache.commons.io.FileUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hdfs.DistributedFileSystem;
+import org.apache.hadoop.hdfs.HAUtil;
+import org.apache.hadoop.hdfs.HAUtilClient;
+import org.apache.hadoop.hdfs.protocol.ClientProtocol;
+import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
+import org.apache.hadoop.hdfs.protocol.HdfsConstants;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class DumpHdfsClusterStatusAction extends Action {
+  private static final Logger LOG = LoggerFactory.getLogger(DumpHdfsClusterStatusAction.class);
+  private static final String PREFIX = "\n ";
+
+  @Override
+  protected Logger getLogger() {
+    return LOG;
+  }
+
+  @Override
+  public void perform() throws Exception {
+    StringBuilder sb = new StringBuilder();
+    try (final DistributedFileSystem dfs = HdfsActionUtils.createDfs(getConf())) {
+      final Configuration dfsConf = dfs.getConf();
+      final URI dfsUri = dfs.getUri();
+      final boolean isHaAndLogicalUri = HAUtilClient.isLogicalUri(dfsConf, dfsUri);
+      sb.append("Cluster status").append('\n');
+      if (isHaAndLogicalUri) {
+        final String nsId = dfsUri.getHost();
+        final List<ClientProtocol> namenodes =
+          HAUtil.getProxiesForAllNameNodesInNameservice(dfsConf, nsId);
+        final boolean atLeastOneActive = HAUtil.isAtLeastOneActive(namenodes);
+        final InetSocketAddress activeAddress = HAUtil.getAddressOfActive(dfs);
+        sb.append("Active NameNode=").append(activeAddress).append(", isAtLeastOneActive=")
+          .append(atLeastOneActive).append('\n');
+      }
+      DatanodeInfo[] dns = dfs.getClient().datanodeReport(HdfsConstants.DatanodeReportType.LIVE);
+      sb.append("Number of live DataNodes: ").append(dns.length);
+      for (DatanodeInfo dni : dns) {
+        sb.append(PREFIX).append("name=").append(dni.getName()).append(", used%=")
+          .append(dni.getDfsUsedPercent()).append(", capacity=")
+          .append(FileUtils.byteCountToDisplaySize(dni.getCapacity()));
+      }
+      sb.append('\n');
+      dns = dfs.getClient().datanodeReport(HdfsConstants.DatanodeReportType.DEAD);
+      sb.append("Number of dead DataNodes: ").append(dns.length);
+      for (DatanodeInfo dni : dns) {
+        sb.append(PREFIX).append(dni.getName()).append("/").append(dni.getNetworkLocation());
+      }
+    }
+    // TODO: add more on NN, JNs, and ZK.
+    // TODO: Print how long process has been up.
+    getLogger().info(sb.toString());
+  }
+}
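Note (illustrative, not part of the patch): an action like the one above is normally constructed and scheduled by a monkey factory, but it can also be exercised directly. The wiring below assumes the pre-existing Action.ActionContext and IntegrationTestingUtility classes from the HBase chaos framework; treat it as a sketch rather than code from this commit.

import org.apache.hadoop.hbase.IntegrationTestingUtility;
import org.apache.hadoop.hbase.chaos.actions.Action;
import org.apache.hadoop.hbase.chaos.actions.DumpHdfsClusterStatusAction;

public class DumpHdfsStatusExample {
  public static void main(String[] args) throws Exception {
    // Assumed wiring: the ActionContext wraps the IntegrationTestingUtility whose
    // configuration points at the cluster under test.
    IntegrationTestingUtility util = new IntegrationTestingUtility();
    Action action = new DumpHdfsClusterStatusAction();
    action.init(new Action.ActionContext(util));
    action.perform(); // logs NameNode HA state plus live/dead DataNode reports
  }
}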
HdfsActionUtils.java (new file)
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.chaos.actions;
+
+import java.io.IOException;
+import java.io.InterruptedIOException;
+import java.security.PrivilegedExceptionAction;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hbase.util.CommonFSUtils;
+import org.apache.hadoop.hdfs.DistributedFileSystem;
+import org.apache.hadoop.security.UserGroupInformation;
+
+/**
+ * Configuration common across the HDFS Actions.
+ */
+public final class HdfsActionUtils {
+
+  private HdfsActionUtils() {
+  }
+
+  /**
+   * Specify a user as whom HDFS actions should be run. The chaos process must have permissions
+   * sufficient to assume the role of the specified user.
+   * @see <a href=
+   *      "https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/Superusers.html">Proxy
+   *      user - Superusers Acting On Behalf Of Other Users</a>
+   */
+  public static final String HDFS_USER_CONF_KEY = "org.apache.hadoop.hbase.chaos.actions.hdfs_user";
+
+  private static DistributedFileSystem createUnproxiedDfs(final Configuration conf)
+    throws IOException {
+    final Path rootDir = CommonFSUtils.getRootDir(conf);
+    final FileSystem fs = rootDir.getFileSystem(conf);
+    return (DistributedFileSystem) fs;
+  }
+
+  /**
+   * Create an instance of {@link DistributedFileSystem} that honors {@value HDFS_USER_CONF_KEY}.
+   */
+  static DistributedFileSystem createDfs(final Configuration conf) throws IOException {
+    final String proxyUser = conf.get(HDFS_USER_CONF_KEY);
+    if (proxyUser == null) {
+      return createUnproxiedDfs(conf);
+    }
+    final UserGroupInformation proxyUgi =
+      UserGroupInformation.createProxyUser(proxyUser, UserGroupInformation.getLoginUser());
+    try {
+      return proxyUgi
+        .doAs((PrivilegedExceptionAction<DistributedFileSystem>) () -> createUnproxiedDfs(conf));
+    } catch (InterruptedException e) {
+      final InterruptedIOException iioe = new InterruptedIOException(e.getMessage());
+      iioe.setStackTrace(e.getStackTrace());
+      throw iioe;
+    }
+  }
+}
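As a hedged illustration of the configuration hook above (the user name "hdfs" is hypothetical), the proxy-user behaviour is enabled simply by setting HDFS_USER_CONF_KEY on the configuration that the chaos actions receive:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;

public class HdfsProxyUserConfigExample {
  public static void main(String[] args) {
    Configuration conf = HBaseConfiguration.create();
    // Key defined by HdfsActionUtils.HDFS_USER_CONF_KEY; "hdfs" is a hypothetical
    // superuser that the chaos process is permitted to proxy as.
    conf.set("org.apache.hadoop.hbase.chaos.actions.hdfs_user", "hdfs");
    // HDFS actions built with this configuration will go through
    // HdfsActionUtils.createDfs(conf) and act as the proxied user.
  }
}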
RestartActiveNameNodeAction.java
@@ -17,15 +17,17 @@
  */
 package org.apache.hadoop.hbase.chaos.actions;
 
+import java.util.Collections;
 import java.util.List;
+import java.util.Optional;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hbase.ServerName;
-import org.apache.hadoop.hbase.util.CommonFSUtils;
 import org.apache.hadoop.hbase.zookeeper.RecoverableZooKeeper;
 import org.apache.hadoop.hbase.zookeeper.ZKUtil;
 import org.apache.hadoop.hbase.zookeeper.ZKWatcher;
 import org.apache.hadoop.hbase.zookeeper.ZNodePaths;
 import org.apache.hadoop.hdfs.DFSUtil;
+import org.apache.hadoop.hdfs.DistributedFileSystem;
 import org.apache.hadoop.hdfs.HAUtil;
 import org.apache.hadoop.hdfs.server.namenode.ha.proto.HAZKInfoProtos.ActiveNodeInfo;
 import org.slf4j.Logger;
@@ -57,39 +59,51 @@ public class RestartActiveNameNodeAction extends RestartActionBaseAction {
   @Override
   public void perform() throws Exception {
     getLogger().info("Performing action: Restart active namenode");
-    Configuration conf = CommonFSUtils.getRootDir(getConf()).getFileSystem(getConf()).getConf();
-    String nameServiceID = DFSUtil.getNamenodeNameServiceId(conf);
-    if (!HAUtil.isHAEnabled(conf, nameServiceID)) {
-      throw new Exception("HA for namenode is not enabled");
-    }
-    ZKWatcher zkw = null;
-    RecoverableZooKeeper rzk = null;
+
+    final String hadoopHAZkNode;
     String activeNamenode = null;
-    String hadoopHAZkNode = conf.get(ZK_PARENT_ZNODE_KEY, ZK_PARENT_ZNODE_DEFAULT);
-    try {
-      zkw = new ZKWatcher(conf, "get-active-namenode", null);
-      rzk = zkw.getRecoverableZooKeeper();
-      String hadoopHAZkNodePath = ZNodePaths.joinZNode(hadoopHAZkNode, nameServiceID);
-      List<String> subChildern = ZKUtil.listChildrenNoWatch(zkw, hadoopHAZkNodePath);
-      for (String eachEntry : subChildern) {
-        if (eachEntry.contains(ACTIVE_NN_LOCK_NAME)) {
+    int activeNamenodePort = -1;
+    try (final DistributedFileSystem dfs = HdfsActionUtils.createDfs(getConf())) {
+      final Configuration conf = dfs.getConf();
+      hadoopHAZkNode = conf.get(ZK_PARENT_ZNODE_KEY, ZK_PARENT_ZNODE_DEFAULT);
+      final String nameServiceID = DFSUtil.getNamenodeNameServiceId(conf);
+
+      if (!HAUtil.isHAEnabled(conf, nameServiceID)) {
+        getLogger().info("HA for HDFS is not enabled; skipping");
+        return;
+      }
+      try (final ZKWatcher zkw = new ZKWatcher(conf, "get-active-namenode", null)) {
+        final RecoverableZooKeeper rzk = zkw.getRecoverableZooKeeper();
+        // If hadoopHAZkNode == '/', pass '' instead because then joinZNode will return '//' as a
+        // prefix
+        // which zk doesn't like as a prefix on the path.
+        final String hadoopHAZkNodePath = ZNodePaths.joinZNode(
+          (hadoopHAZkNode != null && hadoopHAZkNode.equals("/")) ? "" : hadoopHAZkNode,
+          nameServiceID);
+        final List<String> subChildren =
+          Optional.ofNullable(ZKUtil.listChildrenNoWatch(zkw, hadoopHAZkNodePath))
+            .orElse(Collections.emptyList());
+        for (final String eachEntry : subChildren) {
+          if (!eachEntry.contains(ACTIVE_NN_LOCK_NAME)) {
+            continue;
+          }
           byte[] data =
             rzk.getData(ZNodePaths.joinZNode(hadoopHAZkNodePath, ACTIVE_NN_LOCK_NAME), false, null);
           ActiveNodeInfo proto = ActiveNodeInfo.parseFrom(data);
           activeNamenode = proto.getHostname();
+          activeNamenodePort = proto.getPort();
         }
       }
-    } finally {
-      if (zkw != null) {
-        zkw.close();
-      }
     }
+
     if (activeNamenode == null) {
-      throw new Exception("No active Name node found in zookeeper under " + hadoopHAZkNode);
+      getLogger().info("No active Name node found in zookeeper under '{}'", hadoopHAZkNode);
+      return;
     }
-    getLogger().info("Found active namenode host:" + activeNamenode);
-    ServerName activeNNHost = ServerName.valueOf(activeNamenode, -1, -1);
-    getLogger().info("Restarting Active NameNode :" + activeNamenode);
-    restartNameNode(activeNNHost, sleepTime);
+
+    getLogger().info("Found Active NameNode host: {}", activeNamenode);
+    final ServerName activeNNHost = ServerName.valueOf(activeNamenode, activeNamenodePort, -1L);
+    getLogger().info("Restarting Active NameNode: {}", activeNamenode);
+    restartNameNode(activeNNHost, this.sleepTime);
   }
 }
RestartRandomDataNodeAction.java
@@ -18,14 +18,11 @@
 package org.apache.hadoop.hbase.chaos.actions;
 
 import java.io.IOException;
-import java.util.LinkedList;
-import java.util.List;
+import java.util.Arrays;
 import org.apache.hadoop.hbase.ServerName;
 import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
-import org.apache.hadoop.hbase.util.CommonFSUtils;
 import org.apache.hadoop.hdfs.DFSClient;
 import org.apache.hadoop.hdfs.DistributedFileSystem;
-import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
 import org.apache.hadoop.hdfs.protocol.HdfsConstants;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -48,18 +45,15 @@ public class RestartRandomDataNodeAction extends RestartActionBaseAction {
   @Override
   public void perform() throws Exception {
     getLogger().info("Performing action: Restart random data node");
-    ServerName server = PolicyBasedChaosMonkey.selectRandomItem(getDataNodes());
+    final ServerName server = PolicyBasedChaosMonkey.selectRandomItem(getDataNodes());
     restartDataNode(server, sleepTime);
   }
 
-  public ServerName[] getDataNodes() throws IOException {
-    DistributedFileSystem fs =
-      (DistributedFileSystem) CommonFSUtils.getRootDir(getConf()).getFileSystem(getConf());
-    DFSClient dfsClient = fs.getClient();
-    List<ServerName> hosts = new LinkedList<>();
-    for (DatanodeInfo dataNode : dfsClient.datanodeReport(HdfsConstants.DatanodeReportType.LIVE)) {
-      hosts.add(ServerName.valueOf(dataNode.getHostName(), -1, -1));
+  private ServerName[] getDataNodes() throws IOException {
+    try (final DistributedFileSystem dfs = HdfsActionUtils.createDfs(getConf())) {
+      final DFSClient dfsClient = dfs.getClient();
+      return Arrays.stream(dfsClient.datanodeReport(HdfsConstants.DatanodeReportType.LIVE))
+        .map(dn -> ServerName.valueOf(dn.getHostName(), -1, -1)).toArray(ServerName[]::new);
     }
-    return hosts.toArray(new ServerName[0]);
   }
 }
ServerAndDependenciesKillingMonkeyFactory.java
@@ -19,9 +19,11 @@ package org.apache.hadoop.hbase.chaos.factories;
 
 import org.apache.hadoop.hbase.chaos.actions.Action;
 import org.apache.hadoop.hbase.chaos.actions.DumpClusterStatusAction;
+import org.apache.hadoop.hbase.chaos.actions.DumpHdfsClusterStatusAction;
 import org.apache.hadoop.hbase.chaos.actions.ForceBalancerAction;
 import org.apache.hadoop.hbase.chaos.actions.GracefulRollingRestartRsAction;
 import org.apache.hadoop.hbase.chaos.actions.RestartActiveMasterAction;
+import org.apache.hadoop.hbase.chaos.actions.RestartActiveNameNodeAction;
 import org.apache.hadoop.hbase.chaos.actions.RestartRandomDataNodeAction;
 import org.apache.hadoop.hbase.chaos.actions.RestartRandomRsExceptMetaAction;
 import org.apache.hadoop.hbase.chaos.actions.RestartRandomZKNodeAction;
@@ -55,6 +57,7 @@ public class ServerAndDependenciesKillingMonkeyFactory extends MonkeyFactory {
       // only allow 2 servers to be dead.
       new RollingBatchRestartRsAction(5000, 1.0f, 2, true),
       new ForceBalancerAction(),
+      new RestartActiveNameNodeAction(60000),
       new RestartRandomDataNodeAction(60000),
       new RestartRandomZKNodeAction(60000),
       new GracefulRollingRestartRsAction(gracefulRollingRestartTSSLeepTime),
@@ -64,7 +67,8 @@ public class ServerAndDependenciesKillingMonkeyFactory extends MonkeyFactory {
     // @formatter:on
 
     // Action to log more info for debugging
-    Action[] actions2 = new Action[] { new DumpClusterStatusAction() };
+    Action[] actions2 =
+      new Action[] { new DumpClusterStatusAction(), new DumpHdfsClusterStatusAction() };
 
     return new PolicyBasedChaosMonkey(properties, util,
       new CompositeSequentialPolicy(new DoActionsOncePolicy(60 * 1000, actions1),
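For completeness, a hedged sketch of selecting the updated factory by name; MonkeyFactory.getFactory and the registered name "serverAndDependenciesKilling" come from the pre-existing factory registry and are assumed unchanged by this commit.

import org.apache.hadoop.hbase.chaos.factories.MonkeyFactory;

public class MonkeySelectionExample {
  public static void main(String[] args) {
    // The returned factory now also schedules RestartActiveNameNodeAction and
    // DumpHdfsClusterStatusAction alongside the existing chaos actions.
    MonkeyFactory factory = MonkeyFactory.getFactory("serverAndDependenciesKilling");
    System.out.println("Selected monkey factory: " + factory.getClass().getSimpleName());
  }
}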