HDFS-9391. Update webUI/JMX to display maintenance state info. (Manoj Govindassamy via mingma)

This commit is contained in:
Ming Ma 2017-01-10 20:12:42 -08:00
parent 4db119b7b5
commit 467f5f1735
8 changed files with 206 additions and 25 deletions

View File

@ -730,7 +730,7 @@ public class DatanodeDescriptor extends DatanodeInfo {
// Super implementation is sufficient
return super.hashCode();
}
@Override
public boolean equals(Object obj) {
// Sufficient to use super equality as datanodes are uniquely identified
@ -745,14 +745,14 @@ public class DatanodeDescriptor extends DatanodeInfo {
private int underReplicatedInOpenFiles;
private long startTime;
synchronized void set(int underRep,
int onlyRep, int underConstruction) {
synchronized void set(int underRepInOpenFiles, int underRepBlocks,
int outOfServiceOnlyRep) {
if (!isDecommissionInProgress() && !isEnteringMaintenance()) {
return;
}
underReplicatedBlocks = underRep;
outOfServiceOnlyReplicas = onlyRep;
underReplicatedInOpenFiles = underConstruction;
underReplicatedInOpenFiles = underRepInOpenFiles;
underReplicatedBlocks = underRepBlocks;
outOfServiceOnlyReplicas = outOfServiceOnlyRep;
}
/** @return the number of under-replicated blocks */

View File

@ -634,9 +634,12 @@ public class DecommissionManager {
final List<BlockInfo> insufficientList,
boolean pruneReliableBlocks) {
boolean firstReplicationLog = true;
int lowRedundancyBlocks = 0;
int outOfServiceOnlyReplicas = 0;
// Low redundancy in UC Blocks only
int lowRedundancyInOpenFiles = 0;
// All low redundancy blocks. Includes lowRedundancyInOpenFiles.
int lowRedundancyBlocks = 0;
// All maintenance and decommission replicas.
int outOfServiceOnlyReplicas = 0;
while (it.hasNext()) {
if (insufficientList == null
&& numBlocksCheckedPerLock >= numBlocksPerCheck) {
@ -726,8 +729,8 @@ public class DecommissionManager {
}
}
datanode.getLeavingServiceStatus().set(lowRedundancyBlocks,
outOfServiceOnlyReplicas, lowRedundancyInOpenFiles);
datanode.getLeavingServiceStatus().set(lowRedundancyInOpenFiles,
lowRedundancyBlocks, outOfServiceOnlyReplicas);
}
}

View File

@ -77,7 +77,7 @@ public class NumberReplicas extends EnumCounters<NumberReplicas.StoredReplicaSta
* @return decommissioned and decommissioning replicas
*/
public int decommissionedAndDecommissioning() {
return (int) (get(DECOMMISSIONED) + get(DECOMMISSIONING));
return decommissioned() + decommissioning();
}
/**

View File

@ -5500,6 +5500,7 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
Map<String, Object> innerinfo = ImmutableMap.<String, Object>builder()
.put("lastContact", getLastContact(node))
.put("decommissioned", node.isDecommissioned())
.put("adminState", node.getAdminState().toString())
.put("xferaddr", node.getXferAddr())
.build();
info.put(node.getHostName() + ":" + node.getXferPort(), innerinfo);
@ -5524,7 +5525,6 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
.put("xferaddr", node.getXferAddr())
.put("underReplicatedBlocks",
node.getLeavingServiceStatus().getUnderReplicatedBlocks())
// TODO use another property name for outOfServiceOnlyReplicas.
.put("decommissionOnlyReplicas",
node.getLeavingServiceStatus().getOutOfServiceOnlyReplicas())
.put("underReplicateInOpenFiles",
@ -5535,6 +5535,33 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
return JSON.toString(info);
}
/**
 * Builds the JSON report of the datanodes that are entering maintenance.
 * The outer map is keyed by "hostname:xferPort"; each value is a map with
 * the attributes xferaddr, underReplicatedBlocks, maintenanceOnlyReplicas
 * and underReplicateInOpenFiles, the last three read from the node's
 * leaving-service status.
 *
 * @return JSON string of the entering-maintenance node map
 */
@Override // NameNodeMXBean
public String getEnteringMaintenanceNodes() {
  final Map<String, Map<String, Object>> report =
      new HashMap<String, Map<String, Object>>();
  for (DatanodeDescriptor dn
      : blockManager.getDatanodeManager().getEnteringMaintenanceNodes()) {
    // Attributes are added in a fixed order; the builder keeps that order.
    final ImmutableMap.Builder<String, Object> attrs =
        ImmutableMap.<String, Object>builder();
    attrs.put("xferaddr", dn.getXferAddr());
    attrs.put("underReplicatedBlocks",
        dn.getLeavingServiceStatus().getUnderReplicatedBlocks());
    attrs.put("maintenanceOnlyReplicas",
        dn.getLeavingServiceStatus().getOutOfServiceOnlyReplicas());
    attrs.put("underReplicateInOpenFiles",
        dn.getLeavingServiceStatus().getUnderReplicatedInOpenFiles());
    final Map<String, Object> attrMap = attrs.build();
    report.put(dn.getHostName() + ":" + dn.getXferPort(), attrMap);
  }
  return JSON.toString(report);
}
/**
 * Computes how long ago the given node was last updated, using the
 * monotonic clock.
 *
 * @param alivenode the datanode whose last-update time is read
 * @return the difference between the current monotonic time and the node's
 *         last monotonic update, divided by 1000 (presumably converting
 *         milliseconds to seconds)
 */
private long getLastContact(DatanodeDescriptor alivenode) {
  final long elapsed = monotonicNow() - alivenode.getLastUpdateMonotonic();
  return elapsed / 1000;
}

View File

@ -190,7 +190,14 @@ public interface NameNodeMXBean {
* @return the decommissioning node information
*/
public String getDecomNodes();
/**
 * Gets the information on nodes entering maintenance.
 * <p>
 * The FSNamesystem implementation returns a JSON map keyed by
 * "hostname:xferPort" whose values hold the attributes xferaddr,
 * underReplicatedBlocks, maintenanceOnlyReplicas and
 * underReplicateInOpenFiles for each node.
 *
 * @return the information on nodes entering maintenance
 */
String getEnteringMaintenanceNodes();
/**
* Gets the cluster id.
*

View File

@ -171,9 +171,10 @@
{/nn}
{#fs}
<tr><th><a href="#tab-datanode">Live Nodes</a></th><td>{NumLiveDataNodes} (Decommissioned: {NumDecomLiveDataNodes})</td></tr>
<tr><th><a href="#tab-datanode">Dead Nodes</a></th><td>{NumDeadDataNodes} (Decommissioned: {NumDecomDeadDataNodes})</td></tr>
<tr><th><a href="#tab-datanode">Live Nodes</a></th><td>{NumLiveDataNodes} (Decommissioned: {NumDecomLiveDataNodes}, In Maintenance: {NumInMaintenanceLiveDataNodes})</td></tr>
<tr><th><a href="#tab-datanode">Dead Nodes</a></th><td>{NumDeadDataNodes} (Decommissioned: {NumDecomDeadDataNodes}, In Maintenance: {NumInMaintenanceDeadDataNodes})</td></tr>
<tr><th><a href="#tab-datanode">Decommissioning Nodes</a></th><td>{NumDecommissioningDataNodes}</td></tr>
<tr><th><a href="#tab-datanode">Entering Maintenance Nodes</a></th><td> {NumEnteringMaintenanceDataNodes}</td></tr>
<tr><th><a href="#tab-datanode-volume-failures">Total Datanode Volume Failures</a></th><td>{VolumeFailuresTotal} ({EstimatedCapacityLostTotal|fmt_bytes})</td></tr>
{@eq key=nnstat.State value="active"}
<tr><th title="Excludes missing blocks.">Number of Under-Replicated Blocks</th><td>{UnderReplicatedBlocks}</td></tr>
@ -295,6 +296,7 @@
<li class="dfshealth-node-icon dfshealth-node-down">Down</li>
<li class="dfshealth-node-icon dfshealth-node-decommissioned">Decommissioned</li>
<li class="dfshealth-node-icon dfshealth-node-down-decommissioned">Decommissioned &amp; dead</li>
<li class="dfshealth-node-icon dfshealth-node-down-maintenance">In Maintenance &amp; dead</li>
</ul>
</div>
<div class="page-header"><h1><small>In operation</small></h1></div>
@ -344,6 +346,32 @@
</table>
</small>
<div class="page-header"><h1><small>Entering Maintenance</small></h1></div>
<small>
{?EnteringMaintenanceNodes}
<table class="table">
<thead>
<tr>
<th>Node</th>
<th>Under replicated blocks</th>
<th>Blocks with no live replicas</th>
<th>Under Replicated Blocks <br/>In files under construction</th>
</tr>
</thead>
{#EnteringMaintenanceNodes}
<tr>
<td>{name} ({xferaddr})</td>
<td>{underReplicatedBlocks}</td>
<td>{maintenanceOnlyReplicas}</td>
<td>{underReplicateInOpenFiles}</td>
</tr>
{/EnteringMaintenanceNodes}
</table>
{:else}
No nodes are entering maintenance.
{/EnteringMaintenanceNodes}
</small>
<div class="page-header"><h1><small>Decommissioning</small></h1></div>
<small>
{?DecomNodes}

View File

@ -223,17 +223,23 @@
if (n.adminState === "In Service") {
n.state = "alive";
} else if (nodes[i].adminState === "Decommission In Progress") {
n.state = "decommisioning";
n.state = "decommissioning";
} else if (nodes[i].adminState === "Decommissioned") {
n.state = "decommissioned";
} else if (nodes[i].adminState === "Entering Maintenance") {
n.state = "entering-maintenance";
} else if (nodes[i].adminState === "In Maintenance") {
n.state = "in-maintenance";
}
}
}
function augment_dead_nodes(nodes) {
for (var i = 0, e = nodes.length; i < e; ++i) {
if (nodes[i].decommissioned) {
if (nodes[i].adminState === "Decommissioned") {
nodes[i].state = "down-decommissioned";
} else if (nodes[i].adminState === "In Maintenance") {
nodes[i].state = "down-maintenance";
} else {
nodes[i].state = "down";
}
@ -245,6 +251,7 @@
r.DeadNodes = node_map_to_array(JSON.parse(r.DeadNodes));
augment_dead_nodes(r.DeadNodes);
r.DecomNodes = node_map_to_array(JSON.parse(r.DecomNodes));
r.EnteringMaintenanceNodes = node_map_to_array(JSON.parse(r.EnteringMaintenanceNodes));
return r;
}

View File

@ -37,8 +37,10 @@ import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedStripedBlock;
import org.apache.hadoop.hdfs.server.blockmanagement.CombinedHostFileManager;
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor;
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeManager;
import org.apache.hadoop.hdfs.server.blockmanagement.HostConfigManager;
import org.apache.hadoop.hdfs.server.datanode.DataNode;
import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils;
import org.apache.hadoop.hdfs.server.namenode.ha.HATestUtil;
@ -50,10 +52,13 @@ import org.apache.hadoop.io.nativeio.NativeIO;
import org.apache.hadoop.io.nativeio.NativeIO.POSIX.NoMlockCacheManipulator;
import org.apache.hadoop.net.ServerSocketUtil;
import org.apache.hadoop.test.GenericTestUtils;
import org.apache.hadoop.util.Time;
import org.apache.hadoop.util.VersionInfo;
import org.junit.Assert;
import org.junit.Test;
import org.eclipse.jetty.util.ajax.JSON;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.management.MBeanServer;
import javax.management.ObjectName;
@ -64,6 +69,7 @@ import java.net.BindException;
import java.net.URI;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
@ -80,6 +86,9 @@ import static org.junit.Assert.fail;
*/
public class TestNameNodeMXBean {
private static final Logger LOG =
LoggerFactory.getLogger(TestNameNodeMXBean.class);
/**
* Used to assert equality between doubles
*/
@ -180,10 +189,10 @@ public class TestNameNodeMXBean {
assertFalse(xferAddr.equals(dnXferAddrInMaintenance) ^ inMaintenance);
}
assertEquals(fsn.getLiveNodes(), alivenodeinfo);
// get attribute deadnodeinfo
String deadnodeinfo = (String) (mbs.getAttribute(mxbeanName,
// get attribute DeadNodes
String deadNodeInfo = (String) (mbs.getAttribute(mxbeanName,
"DeadNodes"));
assertEquals(fsn.getDeadNodes(), deadnodeinfo);
assertEquals(fsn.getDeadNodes(), deadNodeInfo);
// get attribute NodeUsage
String nodeUsage = (String) (mbs.getAttribute(mxbeanName,
"NodeUsage"));
@ -295,16 +304,16 @@ public class TestNameNodeMXBean {
Uninterruptibles.sleepUninterruptibly(1, TimeUnit.SECONDS);
}
// get attribute deadnodeinfo
String deadnodeinfo = (String) (mbs.getAttribute(mxbeanName,
// get attribute DeadNodes
String deadNodeInfo = (String) (mbs.getAttribute(mxbeanName,
"DeadNodes"));
assertEquals(fsn.getDeadNodes(), deadnodeinfo);
assertEquals(fsn.getDeadNodes(), deadNodeInfo);
Map<String, Map<String, Object>> deadNodes =
(Map<String, Map<String, Object>>) JSON.parse(deadnodeinfo);
(Map<String, Map<String, Object>>) JSON.parse(deadNodeInfo);
assertTrue(deadNodes.size() > 0);
for (Map<String, Object> deadNode : deadNodes.values()) {
assertTrue(deadNode.containsKey("lastContact"));
assertTrue(deadNode.containsKey("decommissioned"));
assertTrue(deadNode.containsKey("adminState"));
assertTrue(deadNode.containsKey("xferaddr"));
}
} finally {
@ -415,6 +424,106 @@ public class TestNameNodeMXBean {
}
}
/**
 * Verifies the maintenance-state information exposed through the
 * NameNodeInfo MXBean on a 3-datanode mini cluster:
 * <ol>
 *   <li>the LiveNodes attribute matches FSNamesystem and every entry has
 *       lastContact and xferaddr;</li>
 *   <li>after the first datanode is added to the maintenance list, the
 *       EnteringMaintenanceNodes attribute eventually reports a node and
 *       agrees with FSNamesystem;</li>
 *   <li>once FSNamesystem reports one live in-maintenance node, the
 *       EnteringMaintenanceNodes attribute is empty again.</li>
 * </ol>
 * The polling loops have no deadline of their own; they rely on the test
 * timeout.
 */
@Test (timeout = 120000)
@SuppressWarnings("unchecked") // JSON.parse returns Object; casts are needed
public void testMaintenanceNodes() throws Exception {
  LOG.info("Starting testMaintenanceNodes");
  int expirationInMs = 30 * 1000;
  Configuration conf = new Configuration();
  conf.setInt(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY, 1);
  conf.setInt(DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_KEY,
      expirationInMs);
  conf.setClass(DFSConfigKeys.DFS_NAMENODE_HOSTS_PROVIDER_CLASSNAME_KEY,
      CombinedHostFileManager.class, HostConfigManager.class);
  MiniDFSCluster cluster = null;
  HostsFileWriter hostsFileWriter = new HostsFileWriter();
  hostsFileWriter.initialize(conf, "temp/TestNameNodeMXBean");

  try {
    cluster = new MiniDFSCluster.Builder(conf).numDataNodes(3).build();
    cluster.waitActive();

    FSNamesystem fsn = cluster.getNameNode().namesystem;
    MBeanServer mbs = ManagementFactory.getPlatformMBeanServer();
    ObjectName mxbeanName = new ObjectName(
        "Hadoop:service=NameNode,name=NameNodeInfo");

    // Put every datanode on the include list before touching maintenance.
    List<String> hosts = new ArrayList<>();
    for (DataNode dn : cluster.getDataNodes()) {
      hosts.add(dn.getDisplayName());
    }
    hostsFileWriter.initIncludeHosts(hosts.toArray(
        new String[hosts.size()]));
    fsn.getBlockManager().getDatanodeManager().refreshNodes(conf);

    // 1. Verify nodes for DatanodeReportType.LIVE state
    String liveNodesInfo = (String) (mbs.getAttribute(mxbeanName,
        "LiveNodes"));
    LOG.info("Live Nodes: {}", liveNodesInfo);
    Map<String, Map<String, Object>> liveNodes =
        (Map<String, Map<String, Object>>) JSON.parse(liveNodesInfo);
    assertEquals(fsn.getLiveNodes(), liveNodesInfo);
    assertEquals(fsn.getNumLiveDataNodes(), liveNodes.size());

    for (Map<String, Object> liveNode : liveNodes.values()) {
      assertTrue(liveNode.containsKey("lastContact"));
      assertTrue(liveNode.containsKey("xferaddr"));
    }

    // Add the 1st DataNode to Maintenance list
    Map<String, Long> maintenanceNodes = new HashMap<>();
    maintenanceNodes.put(cluster.getDataNodes().get(0).getDisplayName(),
        Time.monotonicNow() + expirationInMs);
    hostsFileWriter.initOutOfServiceHosts(null, maintenanceNodes);
    fsn.getBlockManager().getDatanodeManager().refreshNodes(conf);

    // 2. Verify nodes for DatanodeReportType.ENTERING_MAINTENANCE state.
    // Poll the MXBean until at least one node is reported, then assert once.
    while (true) {
      String enteringMaintenanceNodesInfo =
          (String) (mbs.getAttribute(mxbeanName, "EnteringMaintenanceNodes"));
      Map<String, Map<String, Object>> enteringMaintenanceNodes =
          (Map<String, Map<String, Object>>) JSON.parse(
              enteringMaintenanceNodesInfo);
      if (enteringMaintenanceNodes.isEmpty()) {
        LOG.info("Waiting for a node to Enter Maintenance state!");
        Uninterruptibles.sleepUninterruptibly(1, TimeUnit.SECONDS);
        continue;
      }
      LOG.info("Nodes entering Maintenance: {}",
          enteringMaintenanceNodesInfo);
      // NOTE(review): these checks read FSNamesystem after the MXBean
      // snapshot was taken; presumably the node has not yet moved on to
      // IN_MAINTENANCE in between -- confirm this cannot race.
      assertEquals(fsn.getEnteringMaintenanceNodes(),
          enteringMaintenanceNodesInfo);
      assertEquals(fsn.getNumEnteringMaintenanceDataNodes(),
          enteringMaintenanceNodes.size());
      assertEquals(0, fsn.getNumInMaintenanceLiveDataNodes());
      assertEquals(0, fsn.getNumInMaintenanceDeadDataNodes());
      break;
    }

    // Wait for the DecommissionManager to complete check
    // and perform state transition
    while (fsn.getNumInMaintenanceLiveDataNodes() != 1) {
      Uninterruptibles.sleepUninterruptibly(1, TimeUnit.SECONDS);
    }

    // 3. Verify nodes for AdminStates.IN_MAINTENANCE state
    String enteringMaintenanceNodesInfo =
        (String) (mbs.getAttribute(mxbeanName, "EnteringMaintenanceNodes"));
    Map<String, Map<String, Object>> enteringMaintenanceNodes =
        (Map<String, Map<String, Object>>) JSON.parse(
            enteringMaintenanceNodesInfo);
    assertEquals(0, enteringMaintenanceNodes.size());
    assertEquals(fsn.getEnteringMaintenanceNodes(),
        enteringMaintenanceNodesInfo);
    assertEquals(1, fsn.getNumInMaintenanceLiveDataNodes());
    assertEquals(0, fsn.getNumInMaintenanceDeadDataNodes());
  } finally {
    if (cluster != null) {
      cluster.shutdown();
    }
    hostsFileWriter.cleanup();
  }
}
@Test(timeout=120000)
@SuppressWarnings("unchecked")
public void testTopUsers() throws Exception {