HDFS-9391. Update webUI/JMX to display maintenance state info. (Manoj Govindassamy via mingma)

This commit is contained in:
Ming Ma 2017-01-10 20:12:42 -08:00
parent 4db119b7b5
commit 467f5f1735
8 changed files with 206 additions and 25 deletions

View File

@ -730,7 +730,7 @@ public class DatanodeDescriptor extends DatanodeInfo {
// Super implementation is sufficient // Super implementation is sufficient
return super.hashCode(); return super.hashCode();
} }
@Override @Override
public boolean equals(Object obj) { public boolean equals(Object obj) {
// Sufficient to use super equality as datanodes are uniquely identified // Sufficient to use super equality as datanodes are uniquely identified
@ -745,14 +745,14 @@ public class DatanodeDescriptor extends DatanodeInfo {
private int underReplicatedInOpenFiles; private int underReplicatedInOpenFiles;
private long startTime; private long startTime;
synchronized void set(int underRep, synchronized void set(int underRepInOpenFiles, int underRepBlocks,
int onlyRep, int underConstruction) { int outOfServiceOnlyRep) {
if (!isDecommissionInProgress() && !isEnteringMaintenance()) { if (!isDecommissionInProgress() && !isEnteringMaintenance()) {
return; return;
} }
underReplicatedBlocks = underRep; underReplicatedInOpenFiles = underRepInOpenFiles;
outOfServiceOnlyReplicas = onlyRep; underReplicatedBlocks = underRepBlocks;
underReplicatedInOpenFiles = underConstruction; outOfServiceOnlyReplicas = outOfServiceOnlyRep;
} }
/** @return the number of under-replicated blocks */ /** @return the number of under-replicated blocks */

View File

@ -634,9 +634,12 @@ public class DecommissionManager {
final List<BlockInfo> insufficientList, final List<BlockInfo> insufficientList,
boolean pruneReliableBlocks) { boolean pruneReliableBlocks) {
boolean firstReplicationLog = true; boolean firstReplicationLog = true;
int lowRedundancyBlocks = 0; // Low redundancy in UC Blocks only
int outOfServiceOnlyReplicas = 0;
int lowRedundancyInOpenFiles = 0; int lowRedundancyInOpenFiles = 0;
// All low redundancy blocks. Includes lowRedundancyInOpenFiles.
int lowRedundancyBlocks = 0;
// All maintenance and decommission replicas.
int outOfServiceOnlyReplicas = 0;
while (it.hasNext()) { while (it.hasNext()) {
if (insufficientList == null if (insufficientList == null
&& numBlocksCheckedPerLock >= numBlocksPerCheck) { && numBlocksCheckedPerLock >= numBlocksPerCheck) {
@ -726,8 +729,8 @@ public class DecommissionManager {
} }
} }
datanode.getLeavingServiceStatus().set(lowRedundancyBlocks, datanode.getLeavingServiceStatus().set(lowRedundancyInOpenFiles,
outOfServiceOnlyReplicas, lowRedundancyInOpenFiles); lowRedundancyBlocks, outOfServiceOnlyReplicas);
} }
} }

View File

@ -77,7 +77,7 @@ public class NumberReplicas extends EnumCounters<NumberReplicas.StoredReplicaSta
* @return decommissioned and decommissioning replicas * @return decommissioned and decommissioning replicas
*/ */
public int decommissionedAndDecommissioning() { public int decommissionedAndDecommissioning() {
return (int) (get(DECOMMISSIONED) + get(DECOMMISSIONING)); return decommissioned() + decommissioning();
} }
/** /**

View File

@ -5500,6 +5500,7 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
Map<String, Object> innerinfo = ImmutableMap.<String, Object>builder() Map<String, Object> innerinfo = ImmutableMap.<String, Object>builder()
.put("lastContact", getLastContact(node)) .put("lastContact", getLastContact(node))
.put("decommissioned", node.isDecommissioned()) .put("decommissioned", node.isDecommissioned())
.put("adminState", node.getAdminState().toString())
.put("xferaddr", node.getXferAddr()) .put("xferaddr", node.getXferAddr())
.build(); .build();
info.put(node.getHostName() + ":" + node.getXferPort(), innerinfo); info.put(node.getHostName() + ":" + node.getXferPort(), innerinfo);
@ -5524,7 +5525,6 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
.put("xferaddr", node.getXferAddr()) .put("xferaddr", node.getXferAddr())
.put("underReplicatedBlocks", .put("underReplicatedBlocks",
node.getLeavingServiceStatus().getUnderReplicatedBlocks()) node.getLeavingServiceStatus().getUnderReplicatedBlocks())
// TODO use another property name for outOfServiceOnlyReplicas.
.put("decommissionOnlyReplicas", .put("decommissionOnlyReplicas",
node.getLeavingServiceStatus().getOutOfServiceOnlyReplicas()) node.getLeavingServiceStatus().getOutOfServiceOnlyReplicas())
.put("underReplicateInOpenFiles", .put("underReplicateInOpenFiles",
@ -5535,6 +5535,33 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
return JSON.toString(info); return JSON.toString(info);
} }
/**
 * Reports the datanodes that are currently entering maintenance.
 *
 * The result is the JSON form of a map with one entry per such node. Each
 * key is the node's host name and transfer port joined by a colon; each
 * value is a map holding the node's transfer address together with the
 * under-replicated-block, out-of-service-only-replica and
 * under-replicated-in-open-files counters read from its leaving-service
 * status.
 *
 * @return JSON text describing the nodes entering maintenance
 */
@Override // NameNodeMXBean
public String getEnteringMaintenanceNodes() {
  final Map<String, Map<String, Object>> report = new HashMap<>();
  for (DatanodeDescriptor dn
      : blockManager.getDatanodeManager().getEnteringMaintenanceNodes()) {
    // Collect the per-node attributes first, then file them under host:port.
    final ImmutableMap.Builder<String, Object> attrs =
        ImmutableMap.<String, Object> builder();
    attrs.put("xferaddr", dn.getXferAddr());
    attrs.put("underReplicatedBlocks",
        dn.getLeavingServiceStatus().getUnderReplicatedBlocks());
    attrs.put("maintenanceOnlyReplicas",
        dn.getLeavingServiceStatus().getOutOfServiceOnlyReplicas());
    attrs.put("underReplicateInOpenFiles",
        dn.getLeavingServiceStatus().getUnderReplicatedInOpenFiles());
    final Map<String, Object> nodeAttrs = attrs.build();
    report.put(dn.getHostName() + ":" + dn.getXferPort(), nodeAttrs);
  }
  return JSON.toString(report);
}
private long getLastContact(DatanodeDescriptor alivenode) { private long getLastContact(DatanodeDescriptor alivenode) {
return (monotonicNow() - alivenode.getLastUpdateMonotonic())/1000; return (monotonicNow() - alivenode.getLastUpdateMonotonic())/1000;
} }

View File

@ -190,7 +190,14 @@ public interface NameNodeMXBean {
* @return the decommissioning node information * @return the decommissioning node information
*/ */
public String getDecomNodes(); public String getDecomNodes();
/**
 * Gets the information on nodes entering maintenance.
 *
 * The FSNamesystem implementation returns a JSON map keyed by
 * "hostName:xferPort", where each value is a map with the keys
 * "xferaddr", "underReplicatedBlocks", "maintenanceOnlyReplicas" and
 * "underReplicateInOpenFiles".
 *
 * @return the information on nodes entering maintenance
 */
String getEnteringMaintenanceNodes();
/** /**
* Gets the cluster id. * Gets the cluster id.
* *

View File

@ -171,9 +171,10 @@
{/nn} {/nn}
{#fs} {#fs}
<tr><th><a href="#tab-datanode">Live Nodes</a></th><td>{NumLiveDataNodes} (Decommissioned: {NumDecomLiveDataNodes})</td></tr> <tr><th><a href="#tab-datanode">Live Nodes</a></th><td>{NumLiveDataNodes} (Decommissioned: {NumDecomLiveDataNodes}, In Maintenance: {NumInMaintenanceLiveDataNodes})</td></tr>
<tr><th><a href="#tab-datanode">Dead Nodes</a></th><td>{NumDeadDataNodes} (Decommissioned: {NumDecomDeadDataNodes})</td></tr> <tr><th><a href="#tab-datanode">Dead Nodes</a></th><td>{NumDeadDataNodes} (Decommissioned: {NumDecomDeadDataNodes}, In Maintenance: {NumInMaintenanceDeadDataNodes})</td></tr>
<tr><th><a href="#tab-datanode">Decommissioning Nodes</a></th><td>{NumDecommissioningDataNodes}</td></tr> <tr><th><a href="#tab-datanode">Decommissioning Nodes</a></th><td>{NumDecommissioningDataNodes}</td></tr>
<tr><th><a href="#tab-datanode">Entering Maintenance Nodes</a></th><td> {NumEnteringMaintenanceDataNodes}</td></tr>
<tr><th><a href="#tab-datanode-volume-failures">Total Datanode Volume Failures</a></th><td>{VolumeFailuresTotal} ({EstimatedCapacityLostTotal|fmt_bytes})</td></tr> <tr><th><a href="#tab-datanode-volume-failures">Total Datanode Volume Failures</a></th><td>{VolumeFailuresTotal} ({EstimatedCapacityLostTotal|fmt_bytes})</td></tr>
{@eq key=nnstat.State value="active"} {@eq key=nnstat.State value="active"}
<tr><th title="Excludes missing blocks.">Number of Under-Replicated Blocks</th><td>{UnderReplicatedBlocks}</td></tr> <tr><th title="Excludes missing blocks.">Number of Under-Replicated Blocks</th><td>{UnderReplicatedBlocks}</td></tr>
@ -295,6 +296,7 @@
<li class="dfshealth-node-icon dfshealth-node-down">Down</li> <li class="dfshealth-node-icon dfshealth-node-down">Down</li>
<li class="dfshealth-node-icon dfshealth-node-decommissioned">Decommissioned</li> <li class="dfshealth-node-icon dfshealth-node-decommissioned">Decommissioned</li>
<li class="dfshealth-node-icon dfshealth-node-down-decommissioned">Decommissioned &amp; dead</li> <li class="dfshealth-node-icon dfshealth-node-down-decommissioned">Decommissioned &amp; dead</li>
<li class="dfshealth-node-icon dfshealth-node-down-maintenance">In Maintenance &amp; dead</li>
</ul> </ul>
</div> </div>
<div class="page-header"><h1><small>In operation</small></h1></div> <div class="page-header"><h1><small>In operation</small></h1></div>
@ -344,6 +346,32 @@
</table> </table>
</small> </small>
<!--
Entering Maintenance section: when EnteringMaintenanceNodes is non-empty,
renders one table row per entry; otherwise shows a short "no nodes" message.
The underReplicatedBlocks, maintenanceOnlyReplicas and
underReplicateInOpenFiles cells use the key names emitted by the NameNode's
EnteringMaintenanceNodes JSON. NOTE(review): {name} is presumably filled in
by node_map_to_array on the JavaScript side; confirm.
-->
<div class="page-header"><h1><small>Entering Maintenance</small></h1></div>
<small>
{?EnteringMaintenanceNodes}
<table class="table">
<thead>
<tr>
<th>Node</th>
<th>Under replicated blocks</th>
<th>Blocks with no live replicas</th>
<th>Under Replicated Blocks <br/>In files under construction</th>
</tr>
</thead>
{#EnteringMaintenanceNodes}
<tr>
<td>{name} ({xferaddr})</td>
<td>{underReplicatedBlocks}</td>
<td>{maintenanceOnlyReplicas}</td>
<td>{underReplicateInOpenFiles}</td>
</tr>
{/EnteringMaintenanceNodes}
</table>
{:else}
No nodes are entering maintenance.
{/EnteringMaintenanceNodes}
</small>
<div class="page-header"><h1><small>Decommissioning</small></h1></div> <div class="page-header"><h1><small>Decommissioning</small></h1></div>
<small> <small>
{?DecomNodes} {?DecomNodes}

View File

@ -223,17 +223,23 @@
if (n.adminState === "In Service") { if (n.adminState === "In Service") {
n.state = "alive"; n.state = "alive";
} else if (nodes[i].adminState === "Decommission In Progress") { } else if (nodes[i].adminState === "Decommission In Progress") {
n.state = "decommisioning"; n.state = "decommissioning";
} else if (nodes[i].adminState === "Decommissioned") { } else if (nodes[i].adminState === "Decommissioned") {
n.state = "decommissioned"; n.state = "decommissioned";
} else if (nodes[i].adminState === "Entering Maintenance") {
n.state = "entering-maintenance";
} else if (nodes[i].adminState === "In Maintenance") {
n.state = "in-maintenance";
} }
} }
} }
function augment_dead_nodes(nodes) { function augment_dead_nodes(nodes) {
for (var i = 0, e = nodes.length; i < e; ++i) { for (var i = 0, e = nodes.length; i < e; ++i) {
if (nodes[i].decommissioned) { if (nodes[i].adminState === "Decommissioned") {
nodes[i].state = "down-decommissioned"; nodes[i].state = "down-decommissioned";
} else if (nodes[i].adminState === "In Maintenance") {
nodes[i].state = "down-maintenance";
} else { } else {
nodes[i].state = "down"; nodes[i].state = "down";
} }
@ -245,6 +251,7 @@
r.DeadNodes = node_map_to_array(JSON.parse(r.DeadNodes)); r.DeadNodes = node_map_to_array(JSON.parse(r.DeadNodes));
augment_dead_nodes(r.DeadNodes); augment_dead_nodes(r.DeadNodes);
r.DecomNodes = node_map_to_array(JSON.parse(r.DecomNodes)); r.DecomNodes = node_map_to_array(JSON.parse(r.DecomNodes));
r.EnteringMaintenanceNodes = node_map_to_array(JSON.parse(r.EnteringMaintenanceNodes));
return r; return r;
} }

View File

@ -37,8 +37,10 @@ import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction; import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction;
import org.apache.hadoop.hdfs.protocol.LocatedBlock; import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedStripedBlock; import org.apache.hadoop.hdfs.protocol.LocatedStripedBlock;
import org.apache.hadoop.hdfs.server.blockmanagement.CombinedHostFileManager;
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor; import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor;
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeManager; import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeManager;
import org.apache.hadoop.hdfs.server.blockmanagement.HostConfigManager;
import org.apache.hadoop.hdfs.server.datanode.DataNode; import org.apache.hadoop.hdfs.server.datanode.DataNode;
import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils; import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils;
import org.apache.hadoop.hdfs.server.namenode.ha.HATestUtil; import org.apache.hadoop.hdfs.server.namenode.ha.HATestUtil;
@ -50,10 +52,13 @@ import org.apache.hadoop.io.nativeio.NativeIO;
import org.apache.hadoop.io.nativeio.NativeIO.POSIX.NoMlockCacheManipulator; import org.apache.hadoop.io.nativeio.NativeIO.POSIX.NoMlockCacheManipulator;
import org.apache.hadoop.net.ServerSocketUtil; import org.apache.hadoop.net.ServerSocketUtil;
import org.apache.hadoop.test.GenericTestUtils; import org.apache.hadoop.test.GenericTestUtils;
import org.apache.hadoop.util.Time;
import org.apache.hadoop.util.VersionInfo; import org.apache.hadoop.util.VersionInfo;
import org.junit.Assert; import org.junit.Assert;
import org.junit.Test; import org.junit.Test;
import org.eclipse.jetty.util.ajax.JSON; import org.eclipse.jetty.util.ajax.JSON;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.management.MBeanServer; import javax.management.MBeanServer;
import javax.management.ObjectName; import javax.management.ObjectName;
@ -64,6 +69,7 @@ import java.net.BindException;
import java.net.URI; import java.net.URI;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collection; import java.util.Collection;
import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
@ -80,6 +86,9 @@ import static org.junit.Assert.fail;
*/ */
public class TestNameNodeMXBean { public class TestNameNodeMXBean {
private static final Logger LOG =
LoggerFactory.getLogger(TestNameNodeMXBean.class);
/** /**
* Used to assert equality between doubles * Used to assert equality between doubles
*/ */
@ -180,10 +189,10 @@ public class TestNameNodeMXBean {
assertFalse(xferAddr.equals(dnXferAddrInMaintenance) ^ inMaintenance); assertFalse(xferAddr.equals(dnXferAddrInMaintenance) ^ inMaintenance);
} }
assertEquals(fsn.getLiveNodes(), alivenodeinfo); assertEquals(fsn.getLiveNodes(), alivenodeinfo);
// get attribute deadnodeinfo // get attributes DeadNodes
String deadnodeinfo = (String) (mbs.getAttribute(mxbeanName, String deadNodeInfo = (String) (mbs.getAttribute(mxbeanName,
"DeadNodes")); "DeadNodes"));
assertEquals(fsn.getDeadNodes(), deadnodeinfo); assertEquals(fsn.getDeadNodes(), deadNodeInfo);
// get attribute NodeUsage // get attribute NodeUsage
String nodeUsage = (String) (mbs.getAttribute(mxbeanName, String nodeUsage = (String) (mbs.getAttribute(mxbeanName,
"NodeUsage")); "NodeUsage"));
@ -295,16 +304,16 @@ public class TestNameNodeMXBean {
Uninterruptibles.sleepUninterruptibly(1, TimeUnit.SECONDS); Uninterruptibles.sleepUninterruptibly(1, TimeUnit.SECONDS);
} }
// get attribute deadnodeinfo // get attribute DeadNodes
String deadnodeinfo = (String) (mbs.getAttribute(mxbeanName, String deadNodeInfo = (String) (mbs.getAttribute(mxbeanName,
"DeadNodes")); "DeadNodes"));
assertEquals(fsn.getDeadNodes(), deadnodeinfo); assertEquals(fsn.getDeadNodes(), deadNodeInfo);
Map<String, Map<String, Object>> deadNodes = Map<String, Map<String, Object>> deadNodes =
(Map<String, Map<String, Object>>) JSON.parse(deadnodeinfo); (Map<String, Map<String, Object>>) JSON.parse(deadNodeInfo);
assertTrue(deadNodes.size() > 0); assertTrue(deadNodes.size() > 0);
for (Map<String, Object> deadNode : deadNodes.values()) { for (Map<String, Object> deadNode : deadNodes.values()) {
assertTrue(deadNode.containsKey("lastContact")); assertTrue(deadNode.containsKey("lastContact"));
assertTrue(deadNode.containsKey("decommissioned")); assertTrue(deadNode.containsKey("adminState"));
assertTrue(deadNode.containsKey("xferaddr")); assertTrue(deadNode.containsKey("xferaddr"));
} }
} finally { } finally {
@ -415,6 +424,106 @@ public class TestNameNodeMXBean {
} }
} }
/**
 * Checks the maintenance-related NameNodeInfo MXBean attributes against the
 * values reported directly by FSNamesystem.
 *
 * The test starts a three-datanode mini cluster, verifies the LiveNodes
 * attribute, puts the first datanode into the maintenance list, and then
 * follows that node through the EnteringMaintenanceNodes attribute until it
 * is counted as an in-maintenance live node.
 *
 * NOTE(review): neither polling loop below has a bound of its own; only the
 * 120000 ms test timeout ends them if the expected state never shows up.
 * NOTE(review): the JSON.parse results are cast to generic maps without
 * {@code @SuppressWarnings("unchecked")}, unlike testTopUsers in this class.
 */
@Test (timeout = 120000)
public void testMaintenanceNodes() throws Exception {
LOG.info("Starting testMaintenanceNodes");
// Used both as the heartbeat recheck interval and as the offset for the
// maintenance expiry time given to the hosts file below.
int expirationInMs = 30 * 1000;
Configuration conf = new Configuration();
conf.setInt(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY, 1);
conf.setInt(DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_KEY,
expirationInMs);
conf.setClass(DFSConfigKeys.DFS_NAMENODE_HOSTS_PROVIDER_CLASSNAME_KEY,
CombinedHostFileManager.class, HostConfigManager.class);
MiniDFSCluster cluster = null;
HostsFileWriter hostsFileWriter = new HostsFileWriter();
hostsFileWriter.initialize(conf, "temp/TestNameNodeMXBean");
try {
cluster = new MiniDFSCluster.Builder(conf).numDataNodes(3).build();
cluster.waitActive();
FSNamesystem fsn = cluster.getNameNode().namesystem;
MBeanServer mbs = ManagementFactory.getPlatformMBeanServer();
ObjectName mxbeanName = new ObjectName(
"Hadoop:service=NameNode,name=NameNodeInfo");
// Put every datanode of the cluster on the include list and have the
// DatanodeManager re-read the hosts configuration.
List<String> hosts = new ArrayList<>();
for(DataNode dn : cluster.getDataNodes()) {
hosts.add(dn.getDisplayName());
}
hostsFileWriter.initIncludeHosts(hosts.toArray(
new String[hosts.size()]));
fsn.getBlockManager().getDatanodeManager().refreshNodes(conf);
// 1. Verify nodes for DatanodeReportType.LIVE state
String liveNodesInfo = (String) (mbs.getAttribute(mxbeanName,
"LiveNodes"));
LOG.info("Live Nodes: " + liveNodesInfo);
Map<String, Map<String, Object>> liveNodes =
(Map<String, Map<String, Object>>) JSON.parse(liveNodesInfo);
assertEquals(fsn.getLiveNodes(), liveNodesInfo);
assertEquals(fsn.getNumLiveDataNodes(), liveNodes.size());
for (Map<String, Object> liveNode : liveNodes.values()) {
assertTrue(liveNode.containsKey("lastContact"));
assertTrue(liveNode.containsKey("xferaddr"));
}
// Add the 1st DataNode to Maintenance list
Map<String, Long> maintenanceNodes = new HashMap<>();
maintenanceNodes.put(cluster.getDataNodes().get(0).getDisplayName(),
Time.monotonicNow() + expirationInMs);
hostsFileWriter.initOutOfServiceHosts(null, maintenanceNodes);
fsn.getBlockManager().getDatanodeManager().refreshNodes(conf);
// Poll the MXBean once per second until it reports at least one node
// entering maintenance, then run the assertions exactly once.
// NOTE(review): if the node were to leave the entering-maintenance state
// before a poll observes it, this loop would spin until the test timeout.
boolean recheck = true;
while (recheck) {
// 2. Verify nodes for DatanodeReportType.ENTERING_MAINTENANCE state
String enteringMaintenanceNodesInfo =
(String) (mbs.getAttribute(mxbeanName, "EnteringMaintenanceNodes"));
Map<String, Map<String, Object>> enteringMaintenanceNodes =
(Map<String, Map<String, Object>>) JSON.parse(
enteringMaintenanceNodesInfo);
if (enteringMaintenanceNodes.size() <= 0) {
LOG.info("Waiting for a node to Enter Maintenance state!");
Uninterruptibles.sleepUninterruptibly(1, TimeUnit.SECONDS);
continue;
}
LOG.info("Nodes entering Maintenance: " + enteringMaintenanceNodesInfo);
recheck = false;
assertEquals(fsn.getEnteringMaintenanceNodes(),
enteringMaintenanceNodesInfo);
assertEquals(fsn.getNumEnteringMaintenanceDataNodes(),
enteringMaintenanceNodes.size());
assertEquals(0, fsn.getNumInMaintenanceLiveDataNodes());
assertEquals(0, fsn.getNumInMaintenanceDeadDataNodes());
}
// Wait for the DecommissionManager to complete check
// and perform state transition
while (fsn.getNumInMaintenanceLiveDataNodes() != 1) {
Uninterruptibles.sleepUninterruptibly(1, TimeUnit.SECONDS);
}
// 3. Verify nodes for AdminStates.IN_MAINTENANCE state
// The node must no longer be listed as entering maintenance and must be
// counted as exactly one in-maintenance live node.
String enteringMaintenanceNodesInfo =
(String) (mbs.getAttribute(mxbeanName, "EnteringMaintenanceNodes"));
Map<String, Map<String, Object>> enteringMaintenanceNodes =
(Map<String, Map<String, Object>>) JSON.parse(
enteringMaintenanceNodesInfo);
assertEquals(0, enteringMaintenanceNodes.size());
assertEquals(fsn.getEnteringMaintenanceNodes(),
enteringMaintenanceNodesInfo);
assertEquals(1, fsn.getNumInMaintenanceLiveDataNodes());
assertEquals(0, fsn.getNumInMaintenanceDeadDataNodes());
} finally {
// Always stop the cluster and clean up the hosts file writer, even when
// an assertion above fails.
if (cluster != null) {
cluster.shutdown();
}
hostsFileWriter.cleanup();
}
}
@Test(timeout=120000) @Test(timeout=120000)
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
public void testTopUsers() throws Exception { public void testTopUsers() throws Exception {