YARN-10688. ClusterMetrics should support GPU capacity related metrics.. Contributed by Qi Zhu.
(cherry picked from commit 49f89f1d3d
)
This commit is contained in:
parent
17307ee88d
commit
cd417f17ae
|
@ -34,6 +34,8 @@ import org.apache.hadoop.metrics2.lib.MutableGaugeLong;
|
||||||
import org.apache.hadoop.metrics2.lib.MutableRate;
|
import org.apache.hadoop.metrics2.lib.MutableRate;
|
||||||
import org.apache.hadoop.yarn.api.records.Resource;
|
import org.apache.hadoop.yarn.api.records.Resource;
|
||||||
import org.apache.hadoop.thirdparty.com.google.common.annotations.VisibleForTesting;
|
import org.apache.hadoop.thirdparty.com.google.common.annotations.VisibleForTesting;
|
||||||
|
import org.apache.hadoop.yarn.api.records.ResourceInformation;
|
||||||
|
import org.apache.hadoop.yarn.util.resource.ResourceUtils;
|
||||||
|
|
||||||
@InterfaceAudience.Private
|
@InterfaceAudience.Private
|
||||||
@Metrics(context="yarn")
|
@Metrics(context="yarn")
|
||||||
|
@ -56,6 +58,7 @@ public class ClusterMetrics {
|
||||||
@Metric("Vcore Utilization") MutableGaugeLong utilizedVirtualCores;
|
@Metric("Vcore Utilization") MutableGaugeLong utilizedVirtualCores;
|
||||||
@Metric("Memory Capability") MutableGaugeLong capabilityMB;
|
@Metric("Memory Capability") MutableGaugeLong capabilityMB;
|
||||||
@Metric("Vcore Capability") MutableGaugeLong capabilityVirtualCores;
|
@Metric("Vcore Capability") MutableGaugeLong capabilityVirtualCores;
|
||||||
|
@Metric("GPU Capability") MutableGaugeLong capabilityGPUs;
|
||||||
|
|
||||||
private static final MetricsInfo RECORD_INFO = info("ClusterMetrics",
|
private static final MetricsInfo RECORD_INFO = info("ClusterMetrics",
|
||||||
"Metrics for the Yarn Cluster");
|
"Metrics for the Yarn Cluster");
|
||||||
|
@ -206,10 +209,24 @@ public class ClusterMetrics {
|
||||||
return capabilityVirtualCores.value();
|
return capabilityVirtualCores.value();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public long getCapabilityGPUs() {
|
||||||
|
if (capabilityGPUs == null) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
return capabilityGPUs.value();
|
||||||
|
}
|
||||||
|
|
||||||
public void incrCapability(Resource res) {
|
public void incrCapability(Resource res) {
|
||||||
if (res != null) {
|
if (res != null) {
|
||||||
capabilityMB.incr(res.getMemorySize());
|
capabilityMB.incr(res.getMemorySize());
|
||||||
capabilityVirtualCores.incr(res.getVirtualCores());
|
capabilityVirtualCores.incr(res.getVirtualCores());
|
||||||
|
Integer gpuIndex = ResourceUtils.getResourceTypeIndex()
|
||||||
|
.get(ResourceInformation.GPU_URI);
|
||||||
|
if (gpuIndex != null) {
|
||||||
|
capabilityGPUs.incr(res.
|
||||||
|
getResourceValue(ResourceInformation.GPU_URI));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -217,6 +234,12 @@ public class ClusterMetrics {
|
||||||
if (res != null) {
|
if (res != null) {
|
||||||
capabilityMB.decr(res.getMemorySize());
|
capabilityMB.decr(res.getMemorySize());
|
||||||
capabilityVirtualCores.decr(res.getVirtualCores());
|
capabilityVirtualCores.decr(res.getVirtualCores());
|
||||||
|
Integer gpuIndex = ResourceUtils.getResourceTypeIndex()
|
||||||
|
.get(ResourceInformation.GPU_URI);
|
||||||
|
if (gpuIndex != null) {
|
||||||
|
capabilityGPUs.decr(res.
|
||||||
|
getResourceValue(ResourceInformation.GPU_URI));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -22,18 +22,22 @@ import org.apache.commons.io.FileUtils;
|
||||||
import org.apache.hadoop.yarn.api.records.ContainerId;
|
import org.apache.hadoop.yarn.api.records.ContainerId;
|
||||||
import org.apache.hadoop.yarn.api.records.Resource;
|
import org.apache.hadoop.yarn.api.records.Resource;
|
||||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||||
|
import org.apache.hadoop.yarn.server.resourcemanager.ClusterMetrics;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.MockAM;
|
import org.apache.hadoop.yarn.server.resourcemanager.MockAM;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.MockNM;
|
import org.apache.hadoop.yarn.server.resourcemanager.MockNM;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.MockRM;
|
import org.apache.hadoop.yarn.server.resourcemanager.MockRM;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.MockRMAppSubmissionData;
|
import org.apache.hadoop.yarn.server.resourcemanager.MockRMAppSubmissionData;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.MockRMAppSubmitter;
|
import org.apache.hadoop.yarn.server.resourcemanager.MockRMAppSubmitter;
|
||||||
|
import org.apache.hadoop.yarn.server.resourcemanager.MockNodes;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.NullRMNodeLabelsManager;
|
import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.NullRMNodeLabelsManager;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager;
|
import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.resource.TestResourceProfiles;
|
import org.apache.hadoop.yarn.server.resourcemanager.resource.TestResourceProfiles;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
|
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
|
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
|
||||||
|
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ClusterNodeTracker;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler;
|
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp;
|
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp;
|
||||||
|
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeUpdateSchedulerEvent;
|
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeUpdateSchedulerEvent;
|
||||||
import org.apache.hadoop.yarn.util.resource.DominantResourceCalculator;
|
import org.apache.hadoop.yarn.util.resource.DominantResourceCalculator;
|
||||||
import org.apache.hadoop.yarn.util.resource.ResourceCalculator;
|
import org.apache.hadoop.yarn.util.resource.ResourceCalculator;
|
||||||
|
@ -47,8 +51,12 @@ import org.junit.Test;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import static org.apache.hadoop.yarn.api.records.ResourceInformation.GPU_URI;
|
||||||
import static org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacitySchedulerConfiguration.MAXIMUM_ALLOCATION_MB;
|
import static org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacitySchedulerConfiguration.MAXIMUM_ALLOCATION_MB;
|
||||||
|
import static org.junit.Assert.assertEquals;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test case for custom resource container allocation.
|
* Test case for custom resource container allocation.
|
||||||
|
@ -64,6 +72,9 @@ public class TestCSAllocateCustomResource {
|
||||||
|
|
||||||
private final int g = 1024;
|
private final int g = 1024;
|
||||||
|
|
||||||
|
private ClusterNodeTracker<FiCaSchedulerNode> nodeTracker;
|
||||||
|
private ClusterMetrics metrics;
|
||||||
|
|
||||||
@Before
|
@Before
|
||||||
public void setUp() throws Exception {
|
public void setUp() throws Exception {
|
||||||
conf = new YarnConfiguration();
|
conf = new YarnConfiguration();
|
||||||
|
@ -182,4 +193,57 @@ public class TestCSAllocateCustomResource {
|
||||||
.getResourceValue("yarn.io/gpu"));
|
.getResourceValue("yarn.io/gpu"));
|
||||||
rm.close();
|
rm.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testClusterMetricsWithGPU()
|
||||||
|
throws Exception {
|
||||||
|
metrics = ClusterMetrics.getMetrics();
|
||||||
|
// reset resource types
|
||||||
|
ResourceUtils.resetResourceTypes();
|
||||||
|
String resourceTypesFileName = "resource-types-test.xml";
|
||||||
|
File source = new File(
|
||||||
|
conf.getClassLoader().getResource(resourceTypesFileName).getFile());
|
||||||
|
resourceTypesFile = new File(source.getParent(), "resource-types.xml");
|
||||||
|
FileUtils.copyFile(source, resourceTypesFile);
|
||||||
|
|
||||||
|
CapacitySchedulerConfiguration newConf =
|
||||||
|
(CapacitySchedulerConfiguration) TestUtils
|
||||||
|
.getConfigurationWithMultipleQueues(conf);
|
||||||
|
newConf.setClass(CapacitySchedulerConfiguration.RESOURCE_CALCULATOR_CLASS,
|
||||||
|
DominantResourceCalculator.class, ResourceCalculator.class);
|
||||||
|
//start RM
|
||||||
|
MockRM rm = new MockRM(newConf);
|
||||||
|
rm.start();
|
||||||
|
|
||||||
|
nodeTracker = new ClusterNodeTracker<>();
|
||||||
|
MockNodes.resetHostIds();
|
||||||
|
Resource nodeResource = Resource.newInstance(4096, 4,
|
||||||
|
Collections.singletonMap(GPU_URI, 4L));
|
||||||
|
List<RMNode> rmNodes =
|
||||||
|
MockNodes.newNodes(2, 4, nodeResource);
|
||||||
|
for (RMNode rmNode : rmNodes) {
|
||||||
|
nodeTracker.addNode(new FiCaSchedulerNode(rmNode, false));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check GPU inc related cluster metrics.
|
||||||
|
assertEquals("Cluster Capability Memory incorrect",
|
||||||
|
metrics.getCapabilityMB(), (4096 * 8));
|
||||||
|
assertEquals("Cluster Capability Vcores incorrect",
|
||||||
|
metrics.getCapabilityVirtualCores(), 4 * 8);
|
||||||
|
assertEquals("Cluster Capability GPUs incorrect",
|
||||||
|
metrics.getCapabilityGPUs(), 4 * 8);
|
||||||
|
|
||||||
|
for (RMNode rmNode : rmNodes) {
|
||||||
|
nodeTracker.removeNode(rmNode.getNodeID());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check GPU dec related cluster metrics.
|
||||||
|
assertEquals("Cluster Capability Memory incorrect",
|
||||||
|
metrics.getCapabilityMB(), 0);
|
||||||
|
assertEquals("Cluster Capability Vcores incorrect",
|
||||||
|
metrics.getCapabilityVirtualCores(), 0);
|
||||||
|
assertEquals("Cluster Capability GPUs incorrect",
|
||||||
|
metrics.getCapabilityGPUs(), 0);
|
||||||
|
ClusterMetrics.destroy();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue