YARN-10692. Add Node GPU Utilization and apply to NodeMetrics. Contributed by Qi Zhu.

This commit is contained in:
Peter Bacsko 2021-03-18 12:45:28 +01:00
parent a5745711dd
commit 38495af325
6 changed files with 112 additions and 5 deletions

View File

@ -20,8 +20,11 @@ package org.apache.hadoop.yarn.server.nodemanager;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.service.AbstractService;
import org.apache.hadoop.yarn.api.records.ResourceInformation;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.api.records.ResourceUtilization;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuNodeResourceUpdateHandler;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuResourcePlugin;
import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
import org.apache.hadoop.yarn.util.ResourceCalculatorPlugin;
import org.slf4j.Logger;
@ -46,6 +49,10 @@ public class NodeResourceMonitorImpl extends AbstractService implements
/** Resource calculator. */
private ResourceCalculatorPlugin resourceCalculatorPlugin;
/** Gpu related plugin. */
private GpuResourcePlugin gpuResourcePlugin;
private GpuNodeResourceUpdateHandler gpuNodeResourceUpdateHandler;
/** Current <em>resource utilization</em> of the node. */
private ResourceUtilization nodeUtilization =
ResourceUtilization.newInstance(0, 0, 0f);
@ -72,6 +79,18 @@ public class NodeResourceMonitorImpl extends AbstractService implements
this.resourceCalculatorPlugin =
ResourceCalculatorPlugin.getNodeResourceMonitorPlugin(conf);
if (nmContext.getResourcePluginManager() != null) {
this.gpuResourcePlugin =
(GpuResourcePlugin)nmContext.getResourcePluginManager().
getNameToPlugins().get(ResourceInformation.GPU_URI);
if (gpuResourcePlugin != null) {
this.gpuNodeResourceUpdateHandler =
(GpuNodeResourceUpdateHandler)gpuResourcePlugin.
getNodeResourceHandlerInstance();
}
}
LOG.info(" Using ResourceCalculatorPlugin : "
+ this.resourceCalculatorPlugin);
}
@ -152,6 +171,14 @@ public class NodeResourceMonitorImpl extends AbstractService implements
(int) (vmem >> 20), // B -> MB
vcores); // Used Virtual Cores
float nodeGpuUtilization = 0F;
try {
nodeGpuUtilization =
gpuNodeResourceUpdateHandler.getNodeGpuUtilization();
} catch (Exception e) {
LOG.error("Get Node GPU Utilization error: " + e);
}
// Publish the node utilization metrics to node manager
// metrics system.
NodeManagerMetrics nmMetrics = nmContext.getNodeManagerMetrics();
@ -159,6 +186,7 @@ public class NodeResourceMonitorImpl extends AbstractService implements
nmMetrics.setNodeUsedMemGB(nodeUtilization.getPhysicalMemory());
nmMetrics.setNodeUsedVMemGB(nodeUtilization.getVirtualMemory());
nmMetrics.setNodeCpuUtilization(nodeUtilization.getCPU());
nmMetrics.setNodeGpuUtilization(nodeGpuUtilization);
}
try {

View File

@ -26,12 +26,14 @@ import org.apache.hadoop.yarn.api.records.ResourceInformation;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.NodeResourceUpdaterPlugin;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.PerGpuDeviceInformation;
import org.apache.hadoop.yarn.util.resource.ResourceUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import static org.apache.hadoop.yarn.api.records.ResourceInformation.GPU_URI;
@ -76,4 +78,20 @@ public class GpuNodeResourceUpdateHandler extends NodeResourceUpdaterPlugin {
res.setResourceValue(GPU_URI, nUsableGpus);
}
public float getNodeGpuUtilization() throws Exception{
List<PerGpuDeviceInformation> gpuList =
gpuDiscoverer.getGpuDeviceInformation().getGpus();
Float totalGpuUtilization = 0F;
if (gpuList != null &&
gpuList.size() != 0) {
totalGpuUtilization = gpuList
.stream()
.map(g -> g.getGpuUtilizations().getOverallGpuUtilization())
.collect(Collectors.summingDouble(Float::floatValue))
.floatValue() / gpuList.size();
}
return totalGpuUtilization;
}
}

View File

@ -98,6 +98,8 @@ public class NodeManagerMetrics {
MutableGaugeInt nodeUsedVMemGB;
@Metric("Current CPU utilization")
MutableGaugeFloat nodeCpuUtilization;
@Metric("Current GPU utilization")
MutableGaugeFloat nodeGpuUtilization;
@Metric("Missed localization requests in bytes")
MutableCounterLong localizedCacheMissBytes;
@ -428,6 +430,14 @@ public class NodeManagerMetrics {
this.nodeCpuUtilization.set(cpuUtilization);
}
public void setNodeGpuUtilization(float nodeGpuUtilization) {
this.nodeGpuUtilization.set(nodeGpuUtilization);
}
public float getNodeGpuUtilization() {
return nodeGpuUtilization.value();
}
private void updateLocalizationHitRatios() {
updateLocalizationHitRatio(localizedCacheHitBytes, localizedCacheMissBytes,
localizedCacheHitBytesRatio);

View File

@ -437,14 +437,16 @@ public class TestContainerManagerRecovery extends BaseContainerManagerTest {
waitForNMContainerState(cm, cid,
org.apache.hadoop.yarn.server.nodemanager
.containermanager.container.ContainerState.RUNNING);
TestNodeManagerMetrics.checkMetrics(1, 0, 0, 0, 0, 1, 1, 1, 9, 1, 7);
TestNodeManagerMetrics.checkMetrics(1, 0, 0, 0, 0,
1, 1, 1, 9, 1, 7, 0F);
// restart and verify metrics could be recovered
cm.stop();
DefaultMetricsSystem.shutdown();
metrics = NodeManagerMetrics.create();
metrics.addResource(Resource.newInstance(10240, 8));
TestNodeManagerMetrics.checkMetrics(0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 8);
TestNodeManagerMetrics.checkMetrics(0, 0, 0, 0, 0, 0,
0, 0, 10, 0, 8, 0F);
context = createContext(conf, stateStore);
cm = createContainerManager(context, delSrvc);
cm.init(conf);
@ -452,7 +454,8 @@ public class TestContainerManagerRecovery extends BaseContainerManagerTest {
assertEquals(1, context.getApplications().size());
app = context.getApplications().get(appId);
assertNotNull(app);
TestNodeManagerMetrics.checkMetrics(1, 0, 0, 0, 0, 1, 1, 1, 9, 1, 7);
TestNodeManagerMetrics.checkMetrics(1, 0, 0, 0, 0,
1, 1, 1, 9, 1, 7, 0F);
cm.stop();
}

View File

@ -21,11 +21,13 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugi
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.thirdparty.com.google.common.collect.Lists;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.NMGpuResourceInfo;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.PerGpuDeviceInformation;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.PerGpuUtilizations;
import org.junit.Assert;
import org.junit.Test;
import java.util.List;
@ -122,4 +124,45 @@ public class TestGpuResourcePlugin {
(NMGpuResourceInfo) target.getNMResourceInfo();
Assert.assertNull(resourceInfo.getGpuDeviceInformation());
}
@Test
public void testNodeGPUUtilization()
throws Exception {
GpuDiscoverer gpuDiscoverer = createNodeGPUUtilizationDiscoverer();
GpuNodeResourceUpdateHandler gpuNodeResourceUpdateHandler =
new GpuNodeResourceUpdateHandler(gpuDiscoverer, new Configuration());
Assert.assertEquals(0.5F,
gpuNodeResourceUpdateHandler.getNodeGpuUtilization(), 1e-6);
}
private GpuDiscoverer createNodeGPUUtilizationDiscoverer()
throws YarnException {
GpuDiscoverer gpuDiscoverer = mock(GpuDiscoverer.class);
PerGpuDeviceInformation gpu1 =
new PerGpuDeviceInformation();
PerGpuUtilizations perGpuUtilizations1 =
new PerGpuUtilizations();
perGpuUtilizations1.setOverallGpuUtilization(0.4F);
gpu1.setGpuUtilizations(perGpuUtilizations1);
PerGpuDeviceInformation gpu2 =
new PerGpuDeviceInformation();
PerGpuUtilizations perGpuUtilizations2 =
new PerGpuUtilizations();
perGpuUtilizations2.setOverallGpuUtilization(0.6F);
gpu2.setGpuUtilizations(perGpuUtilizations2);
List<PerGpuDeviceInformation> gpus = Lists.newArrayList();
gpus.add(gpu1);
gpus.add(gpu2);
GpuDeviceInformation gpuDeviceInfo = new GpuDeviceInformation();
gpuDeviceInfo.setGpus(gpus);
when(gpuDiscoverer.getGpuDeviceInformation()).thenReturn(gpuDeviceInfo);
return gpuDiscoverer;
}
}

View File

@ -100,11 +100,15 @@ public class TestNodeManagerMetrics {
metrics.addContainerLaunchDuration(1);
Assert.assertTrue(metrics.containerLaunchDuration.changed());
// Set node gpu utilization
metrics.setNodeGpuUtilization(35.5F);
// availableGB is expected to be floored,
// while allocatedGB is expected to be ceiled.
// allocatedGB: 3.75GB allocated memory is shown as 4GB
// availableGB: 4.25GB available memory is shown as 4GB
checkMetrics(10, 1, 1, 1, 1, 1, 4, 7, 4, 13, 3);
checkMetrics(10, 1, 1, 1, 1,
1, 4, 7, 4, 13, 3, 35.5F);
// Update resource and check available resource again
metrics.addResource(total);
@ -116,7 +120,7 @@ public class TestNodeManagerMetrics {
public static void checkMetrics(int launched, int completed, int failed,
int killed, int initing, int running, int allocatedGB,
int allocatedContainers, int availableGB, int allocatedVCores,
int availableVCores) {
int availableVCores, Float nodeGpuUtilization) {
MetricsRecordBuilder rb = getMetrics("NodeManagerMetrics");
assertCounter("ContainersLaunched", launched, rb);
assertCounter("ContainersCompleted", completed, rb);
@ -129,6 +133,7 @@ public class TestNodeManagerMetrics {
assertGauge("AllocatedContainers", allocatedContainers, rb);
assertGauge("AvailableGB", availableGB, rb);
assertGauge("AvailableVCores",availableVCores, rb);
assertGauge("NodeGpuUtilization", nodeGpuUtilization, rb);
}
}