YARN-10107. Fix GpuResourcePlugin#getNMResourceInfo to honor Auto Discovery Enabled

Contributed by Szilard Nemeth.
This commit is contained in:
Prabhu Joseph 2020-01-29 13:29:42 +05:30 committed by Prabhu Joseph
parent e578e52aae
commit 825db8fe2a
3 changed files with 89 additions and 16 deletions

View File

@ -136,7 +136,7 @@ public class GpuDiscoverer extends Configured {
return lastDiscoveredGpuInformation;
}
private boolean isAutoDiscoveryEnabled() {
boolean isAutoDiscoveryEnabled() {
String allowedDevicesStr = getConf().get(
YarnConfiguration.NM_GPU_ALLOWED_DEVICES,
YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);

View File

@ -94,27 +94,29 @@ public class GpuResourcePlugin implements ResourcePlugin {
/**
 * Builds the {@code NMGpuResourceInfo} for this plugin from three pieces:
 * the GPU device information obtained from {@code gpuDiscoverer}, the allowed
 * GPUs and the assigned GPUs reported by the resource handler's allocator.
 *
 * NOTE(review): this listing looks like a rendered diff with the +/- markers
 * stripped, so statements from before and after the change appear side by
 * side (e.g. the two declarations of gpuDeviceInformation and the repeated
 * try/catch). Confirm the exact statement order against the real file.
 *
 * @return a new {@code NMGpuResourceInfo}; its device information is
 *         {@code null} when the auto-discovery branch is not taken
 * @throws YarnException rethrown when
 *         {@code gpuDiscoverer.getGpuDeviceInformation()} fails
 */
@Override
public synchronized NMResourceInfo getNMResourceInfo() throws YarnException {
GpuDeviceInformation gpuDeviceInformation;
final GpuDeviceInformation gpuDeviceInformation;
//At this point the gpu plugin is already enabled
checkGpuResourceHandler();
// Device information is only queried from the discoverer when auto
// discovery is enabled; the else branch below leaves it null otherwise.
if (gpuDiscoverer.isAutoDiscoveryEnabled()) {
//At this point the gpu plugin is already enabled
checkGpuResourceHandler();
checkErrorCount();
try{
gpuDeviceInformation = gpuDiscoverer.getGpuDeviceInformation();
// A successful query resets the failure counter.
numOfErrorExecutionSinceLastSucceed = 0;
} catch (YarnException e) {
LOG.error(e.getMessage(), e);
// Record the failure, then propagate it to the caller unchanged.
numOfErrorExecutionSinceLastSucceed++;
throw e;
// NOTE(review): checkErrorCount() presumably inspects
// numOfErrorExecutionSinceLastSucceed before another attempt is made;
// its body is not visible here, so verify.
checkErrorCount();
try{
gpuDeviceInformation = gpuDiscoverer.getGpuDeviceInformation();
// A successful query resets the failure counter.
numOfErrorExecutionSinceLastSucceed = 0;
} catch (YarnException e) {
LOG.error(e.getMessage(), e);
// Record the failure, then propagate it to the caller unchanged.
numOfErrorExecutionSinceLastSucceed++;
throw e;
}
} else {
// Auto discovery is off: no device information is reported.
gpuDeviceInformation = null;
}
// Allowed and assigned GPU lists always come from the handler's allocator,
// regardless of which branch above was taken.
GpuResourceAllocator gpuResourceAllocator =
gpuResourceHandler.getGpuAllocator();
List<GpuDevice> totalGpus = gpuResourceAllocator.getAllowedGpus();
List<AssignedGpuDevice> assignedGpuDevices =
gpuResourceAllocator.getAssignedGpus();
return new NMGpuResourceInfo(gpuDeviceInformation, totalGpus,
assignedGpuDevices);
}

View File

@ -19,15 +19,38 @@
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
import com.google.common.collect.Lists;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.NMGpuResourceInfo;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.PerGpuDeviceInformation;
import org.junit.Assert;
import org.junit.Test;
import java.util.List;
public class TestGpuResourcePlugin {
/**
 * Creates a mocked {@link GpuDiscoverer} that reports auto discovery as
 * enabled and returns device information holding a single GPU whose product
 * name is "testGpu".
 *
 * @return the stubbed discoverer mock
 * @throws YarnException declared because the stubbed
 *         {@code getGpuDeviceInformation()} call declares it
 */
private GpuDiscoverer createMockDiscoverer() throws YarnException {
  // Build the canned device information first: one GPU named "testGpu".
  PerGpuDeviceInformation fakeGpu = new PerGpuDeviceInformation();
  fakeGpu.setProductName("testGpu");

  List<PerGpuDeviceInformation> fakeGpus = Lists.newArrayList();
  fakeGpus.add(fakeGpu);

  GpuDeviceInformation cannedDeviceInfo = new GpuDeviceInformation();
  cannedDeviceInfo.setGpus(fakeGpus);

  // Then stub the discoverer so it hands that information back.
  GpuDiscoverer discovererMock = mock(GpuDiscoverer.class);
  when(discovererMock.isAutoDiscoveryEnabled()).thenReturn(true);
  when(discovererMock.getGpuDeviceInformation())
      .thenReturn(cannedDeviceInfo);
  return discovererMock;
}
@Test(expected = YarnException.class)
public void testResourceHandlerNotInitialized() throws YarnException {
GpuDiscoverer gpuDiscoverer = mock(GpuDiscoverer.class);
GpuDiscoverer gpuDiscoverer = createMockDiscoverer();
GpuNodeResourceUpdateHandler gpuNodeResourceUpdateHandler =
mock(GpuNodeResourceUpdateHandler.class);
@ -39,7 +62,7 @@ public class TestGpuResourcePlugin {
@Test
public void testResourceHandlerIsInitialized() throws YarnException {
GpuDiscoverer gpuDiscoverer = mock(GpuDiscoverer.class);
GpuDiscoverer gpuDiscoverer = createMockDiscoverer();
GpuNodeResourceUpdateHandler gpuNodeResourceUpdateHandler =
mock(GpuNodeResourceUpdateHandler.class);
@ -51,4 +74,52 @@ public class TestGpuResourcePlugin {
//Not throwing any exception
target.getNMResourceInfo();
}
/**
 * With auto discovery enabled, the resource info returned by the plugin must
 * carry the device information supplied by the discoverer: exactly one GPU
 * whose product name is "testGpu".
 */
@Test
public void testGetNMResourceInfoAutoDiscoveryEnabled()
    throws YarnException {
  GpuDiscoverer discoverer = createMockDiscoverer();
  GpuNodeResourceUpdateHandler updateHandler =
      mock(GpuNodeResourceUpdateHandler.class);

  GpuResourcePlugin plugin =
      new GpuResourcePlugin(updateHandler, discoverer);
  // The resource handler has to exist before getNMResourceInfo() is called.
  plugin.createResourceHandler(null, null, null);

  NMGpuResourceInfo info = (NMGpuResourceInfo) plugin.getNMResourceInfo();

  GpuDeviceInformation deviceInfo = info.getGpuDeviceInformation();
  Assert.assertNotNull("GpuDeviceInformation should not be null",
      deviceInfo);

  List<PerGpuDeviceInformation> reportedGpus = deviceInfo.getGpus();
  Assert.assertNotNull(
      "List of PerGpuDeviceInformation should not be null", reportedGpus);
  Assert.assertEquals(
      "List of PerGpuDeviceInformation should have a " + "size of 1",
      1, reportedGpus.size());

  PerGpuDeviceInformation onlyGpu = reportedGpus.get(0);
  Assert.assertEquals("Product name of GPU does not match",
      "testGpu", onlyGpu.getProductName());
}
/**
 * With auto discovery disabled, the resource info returned by the plugin
 * must not contain any GPU device information.
 */
@Test
public void testGetNMResourceInfoAutoDiscoveryDisabled()
    throws YarnException {
  GpuDiscoverer discoverer = createMockDiscoverer();
  // Override the helper's default stubbing: auto discovery is switched off.
  when(discoverer.isAutoDiscoveryEnabled()).thenReturn(false);

  GpuNodeResourceUpdateHandler updateHandler =
      mock(GpuNodeResourceUpdateHandler.class);
  GpuResourcePlugin plugin =
      new GpuResourcePlugin(updateHandler, discoverer);
  // The resource handler has to exist before getNMResourceInfo() is called.
  plugin.createResourceHandler(null, null, null);

  NMGpuResourceInfo info = (NMGpuResourceInfo) plugin.getNMResourceInfo();
  Assert.assertNull(info.getGpuDeviceInformation());
}
}