From cd417f17aea693b8985694287bc9a3a579a529a4 Mon Sep 17 00:00:00 2001 From: Eric Badger Date: Wed, 17 Mar 2021 18:16:59 +0000 Subject: [PATCH] YARN-10688. ClusterMetrics should support GPU capacity related metrics.. Contributed by Qi Zhu. (cherry picked from commit 49f89f1d3de66f3bb4db5952e8873432ba62f71a) --- .../resourcemanager/ClusterMetrics.java | 27 +++++++- .../TestCSAllocateCustomResource.java | 64 +++++++++++++++++++ 2 files changed, 89 insertions(+), 2 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ClusterMetrics.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ClusterMetrics.java index 37f4ec436da..7fe5cc9703b 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ClusterMetrics.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ClusterMetrics.java @@ -34,6 +34,8 @@ import org.apache.hadoop.metrics2.lib.MutableGaugeLong; import org.apache.hadoop.metrics2.lib.MutableRate; import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.thirdparty.com.google.common.annotations.VisibleForTesting; +import org.apache.hadoop.yarn.api.records.ResourceInformation; +import org.apache.hadoop.yarn.util.resource.ResourceUtils; @InterfaceAudience.Private @Metrics(context="yarn") @@ -56,13 +58,14 @@ public class ClusterMetrics { @Metric("Vcore Utilization") MutableGaugeLong utilizedVirtualCores; @Metric("Memory Capability") MutableGaugeLong capabilityMB; @Metric("Vcore Capability") MutableGaugeLong capabilityVirtualCores; + @Metric("GPU Capability") MutableGaugeLong capabilityGPUs; private static final MetricsInfo RECORD_INFO 
= info("ClusterMetrics", "Metrics for the Yarn Cluster"); private static volatile ClusterMetrics INSTANCE = null; private static MetricsRegistry registry; - + public static ClusterMetrics getMetrics() { if(!isInitialized.get()){ synchronized (ClusterMetrics.class) { @@ -206,10 +209,24 @@ public class ClusterMetrics { return capabilityVirtualCores.value(); } + public long getCapabilityGPUs() { + if (capabilityGPUs == null) { + return 0; + } + + return capabilityGPUs.value(); + } + public void incrCapability(Resource res) { if (res != null) { capabilityMB.incr(res.getMemorySize()); capabilityVirtualCores.incr(res.getVirtualCores()); + Integer gpuIndex = ResourceUtils.getResourceTypeIndex() + .get(ResourceInformation.GPU_URI); + if (gpuIndex != null) { + capabilityGPUs.incr(res. + getResourceValue(ResourceInformation.GPU_URI)); + } } } @@ -217,6 +234,12 @@ public class ClusterMetrics { if (res != null) { capabilityMB.decr(res.getMemorySize()); capabilityVirtualCores.decr(res.getVirtualCores()); + Integer gpuIndex = ResourceUtils.getResourceTypeIndex() + .get(ResourceInformation.GPU_URI); + if (gpuIndex != null) { + capabilityGPUs.decr(res. 
+ getResourceValue(ResourceInformation.GPU_URI)); + } } } @@ -251,4 +274,4 @@ public class ClusterMetrics { public void incrUtilizedVirtualCores(long delta) { utilizedVirtualCores.incr(delta); } -} +} \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCSAllocateCustomResource.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCSAllocateCustomResource.java index 65473b9eea7..d6f15446441 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCSAllocateCustomResource.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCSAllocateCustomResource.java @@ -22,18 +22,22 @@ import org.apache.commons.io.FileUtils; import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.server.resourcemanager.ClusterMetrics; import org.apache.hadoop.yarn.server.resourcemanager.MockAM; import org.apache.hadoop.yarn.server.resourcemanager.MockNM; import org.apache.hadoop.yarn.server.resourcemanager.MockRM; import org.apache.hadoop.yarn.server.resourcemanager.MockRMAppSubmissionData; import org.apache.hadoop.yarn.server.resourcemanager.MockRMAppSubmitter; +import org.apache.hadoop.yarn.server.resourcemanager.MockNodes; import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.NullRMNodeLabelsManager; import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager; import 
org.apache.hadoop.yarn.server.resourcemanager.resource.TestResourceProfiles; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp; import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ClusterNodeTracker; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeUpdateSchedulerEvent; import org.apache.hadoop.yarn.util.resource.DominantResourceCalculator; import org.apache.hadoop.yarn.util.resource.ResourceCalculator; @@ -47,8 +51,12 @@ import org.junit.Test; import java.io.File; import java.io.IOException; import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import static org.apache.hadoop.yarn.api.records.ResourceInformation.GPU_URI; import static org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacitySchedulerConfiguration.MAXIMUM_ALLOCATION_MB; +import static org.junit.Assert.assertEquals; /** * Test case for custom resource container allocation. 
@@ -64,6 +72,9 @@ public class TestCSAllocateCustomResource {
 
   private final int g = 1024;
 
+  private ClusterNodeTracker<FiCaSchedulerNode> nodeTracker;
+  private ClusterMetrics metrics;
+
   @Before
   public void setUp() throws Exception {
     conf = new YarnConfiguration();
@@ -182,4 +193,57 @@ public class TestCSAllocateCustomResource {
         .getResourceValue("yarn.io/gpu"));
     rm.close();
   }
+
+  /** Capability gauges, GPU included, must follow node add/remove. */
+  @Test
+  public void testClusterMetricsWithGPU() throws Exception {
+    metrics = ClusterMetrics.getMetrics();
+    // Reset resource types and install the test resource-types.xml.
+    ResourceUtils.resetResourceTypes();
+    String resourceTypesFileName = "resource-types-test.xml";
+    File source = new File(
+        conf.getClassLoader().getResource(resourceTypesFileName).getFile());
+    resourceTypesFile = new File(source.getParent(), "resource-types.xml");
+    FileUtils.copyFile(source, resourceTypesFile);
+
+    CapacitySchedulerConfiguration newConf =
+        (CapacitySchedulerConfiguration) TestUtils
+            .getConfigurationWithMultipleQueues(conf);
+    newConf.setClass(CapacitySchedulerConfiguration.RESOURCE_CALCULATOR_CLASS,
+        DominantResourceCalculator.class, ResourceCalculator.class);
+    // Start the RM.
+    MockRM rm = new MockRM(newConf);
+    rm.start();
+
+    nodeTracker = new ClusterNodeTracker<>();
+    MockNodes.resetHostIds();
+    Resource nodeResource = Resource.newInstance(4096, 4,
+        Collections.singletonMap(GPU_URI, 4L));
+    List<RMNode> rmNodes = MockNodes.newNodes(2, 4, nodeResource);
+    for (RMNode rmNode : rmNodes) {
+      nodeTracker.addNode(new FiCaSchedulerNode(rmNode, false));
+    }
+
+    // Adding the nodes must raise every gauge; totals assume 8 nodes.
+    assertEquals("Cluster Capability Memory incorrect",
+        4096 * 8, metrics.getCapabilityMB());
+    assertEquals("Cluster Capability Vcores incorrect",
+        4 * 8, metrics.getCapabilityVirtualCores());
+    assertEquals("Cluster Capability GPUs incorrect",
+        4 * 8, metrics.getCapabilityGPUs());
+
+    for (RMNode rmNode : rmNodes) {
+      nodeTracker.removeNode(rmNode.getNodeID());
+    }
+
+    // Removing the same nodes must bring every gauge back to zero.
+    assertEquals("Cluster Capability Memory incorrect",
+        0, metrics.getCapabilityMB());
+    assertEquals("Cluster Capability Vcores incorrect",
+        0, metrics.getCapabilityVirtualCores());
+    assertEquals("Cluster Capability GPUs incorrect",
+        0, metrics.getCapabilityGPUs());
+    rm.close();
+    ClusterMetrics.destroy();
+  }
 }