HDDS-917. Expose NodeManagerMXBean as a MetricsSource. Contributed by Siddharth Wagle.

Bharat Viswanadham 2019-03-14 12:30:06 -07:00
parent 688b177fc6
commit 091a664977
3 changed files with 114 additions and 6 deletions
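
For context on the pattern this change adopts: a Hadoop metrics2 MetricsSource registers itself with the default metrics system, which then calls getMetrics() on every snapshot. A minimal sketch of that pattern, using an illustrative ExampleSource class that is not part of this commit:

import org.apache.hadoop.metrics2.MetricsCollector;
import org.apache.hadoop.metrics2.MetricsSource;
import org.apache.hadoop.metrics2.MetricsSystem;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
import org.apache.hadoop.metrics2.lib.Interns;

// Illustrative example; the class and metric names are not from this commit.
public final class ExampleSource implements MetricsSource {

  public static ExampleSource create() {
    MetricsSystem ms = DefaultMetricsSystem.instance();
    // Registering the source makes the metrics system poll getMetrics()
    // whenever it takes a snapshot (e.g. for a configured sink or JMX).
    return ms.register("ExampleSource", "Example metrics source",
        new ExampleSource());
  }

  @Override
  public void getMetrics(MetricsCollector collector, boolean all) {
    // Publish one record with one gauge, computed fresh on every snapshot.
    collector.addRecord("ExampleSource")
        .addGauge(Interns.info("ExampleGauge", "An illustrative gauge"), 42L);
  }
}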

SCMNodeManager.java

@@ -102,7 +102,6 @@ public class SCMNodeManager implements NodeManager {
public SCMNodeManager(OzoneConfiguration conf, String clusterID,
StorageContainerManager scmManager, EventPublisher eventPublisher)
throws IOException {
this.metrics = SCMNodeMetrics.create();
this.nodeStateManager = new NodeStateManager(conf, eventPublisher);
this.clusterID = clusterID;
this.version = VersionInfo.getLatestVersion();
@@ -110,6 +109,7 @@ public class SCMNodeManager implements NodeManager {
this.scmManager = scmManager;
LOG.info("Entering startup chill mode.");
registerMXBean();
this.metrics = SCMNodeMetrics.create(this);
}
private void registerMXBean() {
@@ -118,7 +118,7 @@ public class SCMNodeManager implements NodeManager {
}
private void unregisterMXBean() {
if(this.nmInfoBean != null) {
if (this.nmInfoBean != null) {
MBeans.unregister(this.nmInfoBean);
this.nmInfoBean = null;
}
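
For reference, the register side that this unregister pairs with conventionally uses the same MBeans helper. A sketch of what registerMXBean() likely looks like; its body is not shown in this hunk, and the two bean names below are assumptions:

private void registerMXBean() {
  // MBeans.register returns the ObjectName handle that unregisterMXBean
  // later passes to MBeans.unregister. The service and bean names here
  // are illustrative assumptions, not taken from this diff.
  this.nmInfoBean = MBeans.register("SCMNodeManager",
      "SCMNodeManagerInfo", this);
}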

SCMNodeMetrics.java

@@ -18,11 +18,24 @@
package org.apache.hadoop.hdds.scm.node;
import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState.DEAD;
import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState.DECOMMISSIONED;
import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState.DECOMMISSIONING;
import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState.HEALTHY;
import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState.STALE;
import java.util.Map;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.metrics2.MetricsCollector;
import org.apache.hadoop.metrics2.MetricsInfo;
import org.apache.hadoop.metrics2.MetricsSource;
import org.apache.hadoop.metrics2.MetricsSystem;
import org.apache.hadoop.metrics2.annotation.Metric;
import org.apache.hadoop.metrics2.annotation.Metrics;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
import org.apache.hadoop.metrics2.lib.Interns;
import org.apache.hadoop.metrics2.lib.MetricsRegistry;
import org.apache.hadoop.metrics2.lib.MutableCounterLong;
/**
@@ -30,7 +43,7 @@ import org.apache.hadoop.metrics2.lib.MutableCounterLong;
*/
@InterfaceAudience.Private
@Metrics(about = "SCM NodeManager Metrics", context = "ozone")
public final class SCMNodeMetrics {
public final class SCMNodeMetrics implements MetricsSource {
private static final String SOURCE_NAME =
SCMNodeMetrics.class.getSimpleName();
@@ -40,18 +53,26 @@ public final class SCMNodeMetrics {
private @Metric MutableCounterLong numNodeReportProcessed;
private @Metric MutableCounterLong numNodeReportProcessingFailed;
private final MetricsRegistry registry;
private final NodeManagerMXBean managerMXBean;
private final MetricsInfo recordInfo = Interns.info("SCMNodeManager",
"SCM NodeManager metrics");
/** Private constructor. */
private SCMNodeMetrics() { }
private SCMNodeMetrics(NodeManagerMXBean managerMXBean) {
this.managerMXBean = managerMXBean;
this.registry = new MetricsRegistry(recordInfo);
}
/**
* Create and returns SCMNodeMetrics instance.
*
* @return SCMNodeMetrics
*/
public static SCMNodeMetrics create() {
public static SCMNodeMetrics create(NodeManagerMXBean managerMXBean) {
MetricsSystem ms = DefaultMetricsSystem.instance();
return ms.register(SOURCE_NAME, "SCM NodeManager Metrics",
new SCMNodeMetrics());
new SCMNodeMetrics(managerMXBean));
}
/**
@@ -90,4 +111,51 @@ public final class SCMNodeMetrics {
numNodeReportProcessingFailed.incr();
}
/**
* Get aggregated counter and gauge metrics.
*/
@Override
@SuppressWarnings("SuspiciousMethodCalls")
public void getMetrics(MetricsCollector collector, boolean all) {
Map<String, Integer> nodeCount = managerMXBean.getNodeCount();
Map<String, Long> nodeInfo = managerMXBean.getNodeInfo();
registry.snapshot(
collector.addRecord(registry.info()) // Add annotated ones first
.addGauge(Interns.info(
"HealthyNodes",
"Number of healthy datanodes"),
nodeCount.get(HEALTHY.toString()))
.addGauge(Interns.info("StaleNodes",
"Number of stale datanodes"),
nodeCount.get(STALE.toString()))
.addGauge(Interns.info("DeadNodes",
"Number of dead datanodes"),
nodeCount.get(DEAD.toString()))
.addGauge(Interns.info("DecommissioningNodes",
"Number of decommissioning datanodes"),
nodeCount.get(DECOMMISSIONING.toString()))
.addGauge(Interns.info("DecommissionedNodes",
"Number of decommissioned datanodes"),
nodeCount.get(DECOMMISSIONED.toString()))
.addGauge(Interns.info("DiskCapacity",
"Total disk capacity"),
nodeInfo.get("DISKCapacity"))
.addGauge(Interns.info("DiskUsed",
"Total disk capacity used"),
nodeInfo.get("DISKUsed"))
.addGauge(Interns.info("DiskRemaining",
"Total disk capacity remaining"),
nodeInfo.get("DISKRemaining"))
.addGauge(Interns.info("SSDCapacity",
"Total ssd capacity"),
nodeInfo.get("SSDCapacity"))
.addGauge(Interns.info("SSDUsed",
"Total ssd capacity used"),
nodeInfo.get("SSDUsed"))
.addGauge(Interns.info("SSDRemaining",
"Total disk capacity remaining"),
nodeInfo.get("SSDRemaining")),
all);
}
}
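
Because the source is registered with the default metrics system, the aggregated gauges above also become readable over JMX. A hedged sketch of fetching one attribute in-process; the ObjectName is an assumption about how the metrics system names this bean, not something defined by this commit:

import java.lang.management.ManagementFactory;
import javax.management.MBeanServer;
import javax.management.ObjectName;

public final class ReadHealthyNodes {
  public static void main(String[] args) throws Exception {
    MBeanServer mbs = ManagementFactory.getPlatformMBeanServer();
    // Assumed bean name; verify the service/name pair on a live SCM process.
    ObjectName name = new ObjectName(
        "Hadoop:service=StorageContainerManager,name=SCMNodeMetrics");
    Object healthyNodes = mbs.getAttribute(name, "HealthyNodes");
    System.out.println("HealthyNodes = " + healthyNodes);
  }
}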

TestSCMNodeMetrics.java

@@ -35,6 +35,7 @@ import org.junit.Before;
import org.junit.Test;
import static org.apache.hadoop.test.MetricsAsserts.assertCounter;
import static org.apache.hadoop.test.MetricsAsserts.assertGauge;
import static org.apache.hadoop.test.MetricsAsserts.getLongCounter;
import static org.apache.hadoop.test.MetricsAsserts.getMetrics;
@@ -128,6 +129,45 @@ public class TestSCMNodeMetrics {
getMetrics(SCMNodeMetrics.class.getSimpleName()));
}
/**
* Verify that datanode aggregated state and capacity metrics are reported.
*/
@Test
public void testNodeCountAndInfoMetricsReported() throws Exception {
HddsDatanodeService datanode = cluster.getHddsDatanodes().get(0);
StorageReportProto storageReport = TestUtils.createStorageReport(
datanode.getDatanodeDetails().getUuid(), "/tmp", 100, 10, 90, null);
NodeReportProto nodeReport = NodeReportProto.newBuilder()
.addStorageReport(storageReport).build();
datanode.getDatanodeStateMachine().getContext().addReport(nodeReport);
datanode.getDatanodeStateMachine().triggerHeartbeat();
// Give some time so that SCM receives and processes the heartbeat.
Thread.sleep(300L);
assertGauge("HealthyNodes", 1,
getMetrics(SCMNodeMetrics.class.getSimpleName()));
assertGauge("StaleNodes", 0,
getMetrics(SCMNodeMetrics.class.getSimpleName()));
assertGauge("DeadNodes", 0,
getMetrics(SCMNodeMetrics.class.getSimpleName()));
assertGauge("DecommissioningNodes", 0,
getMetrics(SCMNodeMetrics.class.getSimpleName()));
assertGauge("DecommissionedNodes", 0,
getMetrics(SCMNodeMetrics.class.getSimpleName()));
assertGauge("DiskCapacity", 100L,
getMetrics(SCMNodeMetrics.class.getSimpleName()));
assertGauge("DiskUsed", 10L,
getMetrics(SCMNodeMetrics.class.getSimpleName()));
assertGauge("DiskRemaining", 90L,
getMetrics(SCMNodeMetrics.class.getSimpleName()));
assertGauge("SSDCapacity", 0L,
getMetrics(SCMNodeMetrics.class.getSimpleName()));
assertGauge("SSDUsed", 0L,
getMetrics(SCMNodeMetrics.class.getSimpleName()));
assertGauge("SSDRemaining", 0L,
getMetrics(SCMNodeMetrics.class.getSimpleName()));
}
@After
public void teardown() {
cluster.shutdown();