YARN-9337. GPU auto-discovery script runs even when the resource is given by hand. Contributed by Adam Antal

This commit is contained in:
Szilard Nemeth 2019-07-12 17:28:14 +02:00
parent 8b3c6791b1
commit 61b0c2bb7c
2 changed files with 52 additions and 25 deletions

View File

@ -69,6 +69,8 @@ public class GpuDiscoverer {
private int numOfErrorExecutionSinceLastSucceed = 0;
private GpuDeviceInformation lastDiscoveredGpuInformation = null;
private List<GpuDevice> gpuDevicesFromUser;
private void validateConfOrThrowException() throws YarnException {
if (conf == null) {
throw new YarnException("Please initialize (call initialize) before use "
@ -141,6 +143,14 @@ synchronized GpuDeviceInformation getGpuDeviceInformation()
}
}
/**
 * Tells whether the allowed-devices setting selects automatic discovery.
 *
 * @return true when the value read from
 *         {@link YarnConfiguration#NM_GPU_ALLOWED_DEVICES} (or the fallback
 *         used when that key is unset) equals
 *         {@link YarnConfiguration#AUTOMATICALLY_DISCOVER_GPU_DEVICES}
 */
private boolean IsAutoDiscoveryEnabled() {
  // The same constant serves both as the lookup fallback and as the value
  // that is compared against, so an unset key counts as auto-discovery.
  final String autoDiscoveryMarker =
      YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES;
  final String configuredDevices = conf.get(
      YarnConfiguration.NM_GPU_ALLOWED_DEVICES, autoDiscoveryMarker);
  return configuredDevices.equals(autoDiscoveryMarker);
}
/**
* Get list of GPU devices usable by YARN.
*
@ -151,15 +161,13 @@ public synchronized List<GpuDevice> getGpusUsableByYarn()
throws YarnException {
validateConfOrThrowException();
String allowedDevicesStr = conf.get(
YarnConfiguration.NM_GPU_ALLOWED_DEVICES,
YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);
if (allowedDevicesStr.equals(
YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES)) {
if (IsAutoDiscoveryEnabled()) {
return parseGpuDevicesFromAutoDiscoveredGpuInfo();
} else {
return parseGpuDevicesFromUserDefinedValues(allowedDevicesStr);
if (gpuDevicesFromUser == null) {
gpuDevicesFromUser = parseGpuDevicesFromUserDefinedValues();
}
return gpuDevicesFromUser;
}
}
@ -191,16 +199,16 @@ private List<GpuDevice> parseGpuDevicesFromAutoDiscoveredGpuInfo()
}
/**
* @param devices allowed devices coming from the config.
* Individual devices should be separated by commas.
* <br>The format of individual devices should be:
* &lt;index:&gt;&lt;minorNumber&gt;
* @return List of GpuDevices
* @throws YarnException when a GPU device is defined as a duplicate.
* The first duplicate GPU device will be added to the exception message.
*/
private List<GpuDevice> parseGpuDevicesFromUserDefinedValues(String devices)
private List<GpuDevice> parseGpuDevicesFromUserDefinedValues()
throws YarnException {
String devices = conf.get(
YarnConfiguration.NM_GPU_ALLOWED_DEVICES,
YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);
if (devices.trim().isEmpty()) {
throw GpuDeviceSpecificationException.createWithEmptyValueSpecified();
}
@ -242,19 +250,21 @@ private GpuDevice parseGpuDevice(String device, String[] splitByColon,
public synchronized void initialize(Configuration config)
throws YarnException {
this.conf = config;
numOfErrorExecutionSinceLastSucceed = 0;
lookUpAutoDiscoveryBinary(config);
if (IsAutoDiscoveryEnabled()) {
numOfErrorExecutionSinceLastSucceed = 0;
lookUpAutoDiscoveryBinary(config);
// Try to discover GPU information once and print
try {
LOG.info("Trying to discover GPU information ...");
GpuDeviceInformation info = getGpuDeviceInformation();
LOG.info("Discovered GPU information: " + info.toString());
} catch (YarnException e) {
String msg =
"Failed to discover GPU information from system, exception message:"
+ e.getMessage() + " continue...";
LOG.warn(msg);
// Try to discover GPU information once and print
try {
LOG.info("Trying to discover GPU information ...");
GpuDeviceInformation info = getGpuDeviceInformation();
LOG.info("Discovered GPU information: " + info.toString());
} catch (YarnException e) {
String msg =
"Failed to discover GPU information from system, exception message:"
+ e.getMessage() + " continue...";
LOG.warn(msg);
}
}
}

View File

@ -40,6 +40,7 @@
import java.util.function.Consumer;
import static org.apache.hadoop.test.PlatformAssumptions.assumeNotWindows;
import static org.apache.hadoop.yarn.conf.YarnConfiguration.NM_GPU_ALLOWED_DEVICES;
import static org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer.DEFAULT_BINARY_NAME;
import static org.hamcrest.CoreMatchers.containsString;
import static org.hamcrest.CoreMatchers.not;
@ -49,6 +50,9 @@
import static org.junit.Assert.assertThat;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import static org.mockito.Mockito.never;
import static org.mockito.Mockito.spy;
import static org.mockito.Mockito.verify;
public class TestGpuDiscoverer {
private static final Logger LOG = LoggerFactory.getLogger(
@ -96,7 +100,7 @@ public void before() throws IOException {
private Configuration createConfigWithAllowedDevices(String s) {
Configuration conf = new Configuration(false);
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, s);
conf.set(NM_GPU_ALLOWED_DEVICES, s);
setupFakeBinary(conf);
return conf;
}
@ -495,4 +499,17 @@ public void testGpuBinaryIsANotExistingFile() {
"executable in the default directories:"));
}
}
/**
 * Checks that, with an explicit device list set under
 * NM_GPU_ALLOWED_DEVICES, neither initialize() nor getGpusUsableByYarn()
 * ends up calling getGpuDeviceInformation() on the discoverer.
 */
@Test
public void testScriptNotCalled() throws YarnException {
  // Arrange: configuration carrying a hand-written device list.
  Configuration manualConf = new Configuration();
  manualConf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:1,2:3");
  GpuDiscoverer spiedDiscoverer = spy(GpuDiscoverer.class);

  // Act: run the two entry points exercised by this test.
  spiedDiscoverer.initialize(manualConf);
  spiedDiscoverer.getGpusUsableByYarn();

  // Verify: the device-information lookup was never invoked on the spy.
  verify(spiedDiscoverer, never()).getGpuDeviceInformation();
}
}