From 49480b87e0c197396d0eb04f62bf9ec50bbe3597 Mon Sep 17 00:00:00 2001
From: Zehao Chen
Date: Mon, 15 Feb 2021 11:14:32 -0600
Subject: [PATCH] HDFS-15821. Add metrics for in-service datanodes (#2690).
 Contributed by Zehao Chen.

(cherry picked from commit 07a4220cd27c69b86b837e8da320bad0031f7895)
---
 .../metrics/NamenodeBeanMetrics.java          |   5 +
 .../hdfs/server/namenode/FSNamesystem.java    |  16 +++
 .../namenode/metrics/FSNamesystemMBean.java   |   6 ++
 .../server/namenode/TestNameNodeMXBean.java   | 101 +++++++++++++++++++
 4 files changed, 128 insertions(+)

diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/NamenodeBeanMetrics.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/NamenodeBeanMetrics.java
index 0ca5f737dd4..2c6bea9bb92 100644
--- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/NamenodeBeanMetrics.java
+++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/NamenodeBeanMetrics.java
@@ -589,6 +589,11 @@ public class NamenodeBeanMetrics
     return 0;
   }
 
+  @Override
+  public int getNumInServiceLiveDataNodes() {
+    return 0;
+  }
+
   @Override
   public int getVolumeFailuresTotal() {
     return 0;
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
index 5abbc1b6d79..7d7b9abc5cb 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
@@ -5188,6 +5188,22 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
     return deadDecommissioned;
   }
 
+  @Override // FSNamesystemMBean
+  @Metric({"NumInServiceLiveDataNodes",
+      "Number of live datanodes which are currently in service"})
+  public int getNumInServiceLiveDataNodes() {
+    // NOTE(review): the trailing 'true' presumably makes fetchDatanodes omit
+    // decommissioned nodes from the live list - confirm in DatanodeManager.
+    final List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
+    getBlockManager().getDatanodeManager().fetchDatanodes(live, null, true);
+    int liveInService = live.size();
+    // Discount live nodes that are currently in maintenance.
+    for (DatanodeDescriptor node : live) {
+      liveInService -= node.isInMaintenance() ? 1 : 0;
+    }
+    return liveInService;
+  }
+
   @Override // FSNamesystemMBean
   @Metric({"VolumeFailuresTotal",
       "Total number of volume failures across all Datanodes"})
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/metrics/FSNamesystemMBean.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/metrics/FSNamesystemMBean.java
index c25bafd48d7..5b1b2138700 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/metrics/FSNamesystemMBean.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/metrics/FSNamesystemMBean.java
@@ -150,6 +150,12 @@ public interface FSNamesystemMBean {
    */
   public int getNumDecomDeadDataNodes();
 
+  /**
+   * @return Number of in-service data nodes, where NumInServiceLiveDataNodes =
+   * NumLiveDataNodes - NumDecomLiveDataNodes - NumInMaintenanceLiveDataNodes
+   */
+  int getNumInServiceLiveDataNodes();
+
   /**
    * Number of failed data volumes across all live data nodes.
    * @return number of failed data volumes across all live data nodes
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNameNodeMXBean.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNameNodeMXBean.java
index bfb3c49744b..6e049170c0d 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNameNodeMXBean.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNameNodeMXBean.java
@@ -430,6 +430,107 @@ public class TestNameNodeMXBean {
     }
   }
 
+  /**
+   * Checks that NumInServiceLiveDataNodes counts only the live datanodes
+   * that are neither decommissioned nor in maintenance.
+   */
+  @Test(timeout = 120000)
+  public void testInServiceNodes() throws Exception {
+    Configuration conf = new Configuration();
+    conf.setInt(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY, 1);
+    conf.setInt(DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_KEY,
+        30);
+    conf.setClass(DFSConfigKeys.DFS_NAMENODE_HOSTS_PROVIDER_CLASSNAME_KEY,
+        CombinedHostFileManager.class, HostConfigManager.class);
+    MiniDFSCluster cluster = null;
+    HostsFileWriter hostsFileWriter = new HostsFileWriter();
+    hostsFileWriter.initialize(conf, "temp/TestInServiceNodes");
+
+    try {
+      cluster = new MiniDFSCluster.Builder(conf).numDataNodes(3).build();
+      cluster.waitActive();
+
+      final FSNamesystem fsn = cluster.getNameNode().namesystem;
+      final MBeanServer mbs = ManagementFactory.getPlatformMBeanServer();
+      final ObjectName mxbeanName = new ObjectName(
+          "Hadoop:service=NameNode,name=FSNamesystem");
+
+      List<String> hosts = new ArrayList<>();
+      for (DataNode dn : cluster.getDataNodes()) {
+        hosts.add(dn.getDisplayName());
+      }
+      hostsFileWriter.initIncludeHosts(hosts.toArray(
+          new String[hosts.size()]));
+      fsn.getBlockManager().getDatanodeManager().refreshNodes(conf);
+
+      GenericTestUtils.waitFor(new Supplier<Boolean>() {
+        @Override
+        public Boolean get() {
+          try {
+            int numLiveDataNodes = (int) mbs.getAttribute(mxbeanName,
+                "NumLiveDataNodes");
+            return numLiveDataNodes == 3;
+          } catch (Exception e) {
+            return false;
+          }
+        }
+      }, 1000, 60000);
+
+      // Verify nodes
+      int numDecomLiveDataNodes = (int) mbs.getAttribute(mxbeanName,
+          "NumDecomLiveDataNodes");
+      int numInMaintenanceLiveDataNodes = (int) mbs.getAttribute(mxbeanName,
+          "NumInMaintenanceLiveDataNodes");
+      int numInServiceLiveDataNodes = (int) mbs.getAttribute(mxbeanName,
+          "NumInServiceLiveDataNodes");
+      assertEquals(0, numDecomLiveDataNodes);
+      assertEquals(0, numInMaintenanceLiveDataNodes);
+      assertEquals(3, numInServiceLiveDataNodes);
+
+      // Add 2 nodes to out-of-service list
+      ArrayList<String> decomNodes = new ArrayList<>();
+      decomNodes.add(cluster.getDataNodes().get(0).getDisplayName());
+
+      Map<String, Long> maintenanceNodes = new HashMap<>();
+      final int expirationInMs = 30 * 1000;
+      maintenanceNodes.put(cluster.getDataNodes().get(1).getDisplayName(),
+          Time.now() + expirationInMs);
+
+      hostsFileWriter.initOutOfServiceHosts(decomNodes, maintenanceNodes);
+      fsn.getBlockManager().getDatanodeManager().refreshNodes(conf);
+
+      // Wait for the DatanodeAdminManager to complete check
+      GenericTestUtils.waitFor(new Supplier<Boolean>() {
+        @Override
+        public Boolean get() {
+          try {
+            int numLiveDataNodes = (int) mbs.getAttribute(mxbeanName,
+                "NumLiveDataNodes");
+            int numDecomLiveDataNodes = (int) mbs.getAttribute(mxbeanName,
+                "NumDecomLiveDataNodes");
+            int numInMaintenanceLiveDataNodes = (int) mbs.getAttribute(
+                mxbeanName, "NumInMaintenanceLiveDataNodes");
+            return numLiveDataNodes == 3 &&
+                numDecomLiveDataNodes == 1 &&
+                numInMaintenanceLiveDataNodes == 1;
+          } catch (Exception e) {
+            return false;
+          }
+        }
+      }, 1000, 60000);
+
+      // Verify nodes
+      numInServiceLiveDataNodes = (int) mbs.getAttribute(mxbeanName,
+          "NumInServiceLiveDataNodes");
+      assertEquals(1, numInServiceLiveDataNodes);
+    } finally {
+      if (cluster != null) {
+        cluster.shutdown();
+      }
+      hostsFileWriter.cleanup();
+    }
+  }
+
   @Test (timeout = 120000)
   public void testMaintenanceNodes() throws Exception {
     LOG.info("Starting testMaintenanceNodes");