diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ClusterMetrics.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ClusterMetrics.java index 764d80722ee..375543dd937 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ClusterMetrics.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ClusterMetrics.java @@ -34,6 +34,8 @@ import org.apache.hadoop.metrics2.lib.MutableGaugeLong; import org.apache.hadoop.metrics2.lib.MutableRate; import org.apache.hadoop.yarn.api.records.Resource; import com.google.common.annotations.VisibleForTesting; +import org.apache.hadoop.yarn.api.records.ResourceInformation; +import org.apache.hadoop.yarn.util.resource.ResourceUtils; @InterfaceAudience.Private @Metrics(context="yarn") @@ -56,13 +58,14 @@ public class ClusterMetrics { @Metric("Vcore Utilization") MutableGaugeLong utilizedVirtualCores; @Metric("Memory Capability") MutableGaugeLong capabilityMB; @Metric("Vcore Capability") MutableGaugeLong capabilityVirtualCores; + @Metric("GPU Capability") MutableGaugeLong capabilityGPUs; private static final MetricsInfo RECORD_INFO = info("ClusterMetrics", "Metrics for the Yarn Cluster"); private static volatile ClusterMetrics INSTANCE = null; private static MetricsRegistry registry; - + public static ClusterMetrics getMetrics() { if(!isInitialized.get()){ synchronized (ClusterMetrics.class) { @@ -206,10 +209,24 @@ public class ClusterMetrics { return capabilityVirtualCores.value(); } + public long getCapabilityGPUs() { + if (capabilityGPUs == null) { + return 0; + } + + return capabilityGPUs.value(); + } + public void incrCapability(Resource res) { if (res != null) { capabilityMB.incr(res.getMemorySize()); capabilityVirtualCores.incr(res.getVirtualCores()); + Integer gpuIndex = ResourceUtils.getResourceTypeIndex() + .get(ResourceInformation.GPU_URI); + if (gpuIndex != null) { + capabilityGPUs.incr(res. + getResourceValue(ResourceInformation.GPU_URI)); + } } } @@ -217,6 +234,12 @@ public class ClusterMetrics { if (res != null) { capabilityMB.decr(res.getMemorySize()); capabilityVirtualCores.decr(res.getVirtualCores()); + Integer gpuIndex = ResourceUtils.getResourceTypeIndex() + .get(ResourceInformation.GPU_URI); + if (gpuIndex != null) { + capabilityGPUs.decr(res. + getResourceValue(ResourceInformation.GPU_URI)); + } } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCSAllocateCustomResource.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCSAllocateCustomResource.java index 27de139297a..e62082a183c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCSAllocateCustomResource.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCSAllocateCustomResource.java @@ -22,16 +22,20 @@ import org.apache.commons.io.FileUtils; import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.server.resourcemanager.ClusterMetrics; import org.apache.hadoop.yarn.server.resourcemanager.MockAM; import org.apache.hadoop.yarn.server.resourcemanager.MockNM; import org.apache.hadoop.yarn.server.resourcemanager.MockRM; +import org.apache.hadoop.yarn.server.resourcemanager.MockNodes; import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.NullRMNodeLabelsManager; import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager; import org.apache.hadoop.yarn.server.resourcemanager.resource.TestResourceProfiles; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp; import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ClusterNodeTracker; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeUpdateSchedulerEvent; import org.apache.hadoop.yarn.util.resource.DominantResourceCalculator; import org.apache.hadoop.yarn.util.resource.ResourceCalculator; @@ -45,8 +49,12 @@ import org.junit.Test; import java.io.File; import java.io.IOException; import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import static org.apache.hadoop.yarn.api.records.ResourceInformation.GPU_URI; import static org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacitySchedulerConfiguration.MAXIMUM_ALLOCATION_MB; +import static org.junit.Assert.assertEquals; /** * Test case for custom resource container allocation. @@ -62,6 +70,9 @@ public class TestCSAllocateCustomResource { private final int g = 1024; + private ClusterNodeTracker nodeTracker; + private ClusterMetrics metrics; + @Before public void setUp() throws Exception { conf = new YarnConfiguration(); @@ -174,4 +185,57 @@ public class TestCSAllocateCustomResource { .getResourceValue("yarn.io/gpu")); rm.close(); } + + @Test + public void testClusterMetricsWithGPU() + throws Exception { + metrics = ClusterMetrics.getMetrics(); + // reset resource types + ResourceUtils.resetResourceTypes(); + String resourceTypesFileName = "resource-types-test.xml"; + File source = new File( + conf.getClassLoader().getResource(resourceTypesFileName).getFile()); + resourceTypesFile = new File(source.getParent(), "resource-types.xml"); + FileUtils.copyFile(source, resourceTypesFile); + + CapacitySchedulerConfiguration newConf = + (CapacitySchedulerConfiguration) TestUtils + .getConfigurationWithMultipleQueues(conf); + newConf.setClass(CapacitySchedulerConfiguration.RESOURCE_CALCULATOR_CLASS, + DominantResourceCalculator.class, ResourceCalculator.class); + //start RM + MockRM rm = new MockRM(newConf); + rm.start(); + + nodeTracker = new ClusterNodeTracker<>(); + MockNodes.resetHostIds(); + Resource nodeResource = Resource.newInstance(4096, 4, + Collections.singletonMap(GPU_URI, 4L)); + List rmNodes = + MockNodes.newNodes(2, 4, nodeResource); + for (RMNode rmNode : rmNodes) { + nodeTracker.addNode(new FiCaSchedulerNode(rmNode, false)); + } + + // Check GPU inc related cluster metrics. + assertEquals("Cluster Capability Memory incorrect", + metrics.getCapabilityMB(), (4096 * 8)); + assertEquals("Cluster Capability Vcores incorrect", + metrics.getCapabilityVirtualCores(), 4 * 8); + assertEquals("Cluster Capability GPUs incorrect", + metrics.getCapabilityGPUs(), 4 * 8); + + for (RMNode rmNode : rmNodes) { + nodeTracker.removeNode(rmNode.getNodeID()); + } + + // Check GPU dec related cluster metrics. + assertEquals("Cluster Capability Memory incorrect", + metrics.getCapabilityMB(), 0); + assertEquals("Cluster Capability Vcores incorrect", + metrics.getCapabilityVirtualCores(), 0); + assertEquals("Cluster Capability GPUs incorrect", + metrics.getCapabilityGPUs(), 0); + ClusterMetrics.destroy(); + } }