YARN-9337. GPU auto-discovery script runs even when the resource is given by hand. Contributed by Adam Antal
(cherry picked from commit 61b0c2bb7c)
This commit is contained in:
parent
43c89d1e2b
commit
531e0c0bc1
|
@ -69,6 +69,8 @@ public class GpuDiscoverer {
|
|||
private int numOfErrorExecutionSinceLastSucceed = 0;
|
||||
private GpuDeviceInformation lastDiscoveredGpuInformation = null;
|
||||
|
||||
private List<GpuDevice> gpuDevicesFromUser;
|
||||
|
||||
private void validateConfOrThrowException() throws YarnException {
|
||||
if (conf == null) {
|
||||
throw new YarnException("Please initialize (call initialize) before use "
|
||||
|
@ -143,6 +145,14 @@ public class GpuDiscoverer {
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Tells whether the GPU allowed-devices setting selects automatic discovery.
 *
 * Reads {@link YarnConfiguration#NM_GPU_ALLOWED_DEVICES} from {@code conf},
 * passing {@link YarnConfiguration#AUTOMATICALLY_DISCOVER_GPU_DEVICES} as the
 * default value, and compares the result against that same constant.
 *
 * NOTE(review): {@code conf} is dereferenced here without a null check;
 * presumably callers run after initialize(Configuration) has set it - confirm.
 *
 * @return true if the configured value equals
 *         {@link YarnConfiguration#AUTOMATICALLY_DISCOVER_GPU_DEVICES},
 *         false for any other value
 */
private boolean IsAutoDiscoveryEnabled() {
|
||||
String allowedDevicesStr = conf.get(
|
||||
YarnConfiguration.NM_GPU_ALLOWED_DEVICES,
|
||||
YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);
|
||||
// Exact String.equals comparison: case-sensitive, and the value is not trimmed.
return allowedDevicesStr.equals(
|
||||
YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get list of GPU devices usable by YARN.
|
||||
*
|
||||
|
@ -153,15 +163,13 @@ public class GpuDiscoverer {
|
|||
throws YarnException {
|
||||
validateConfOrThrowException();
|
||||
|
||||
String allowedDevicesStr = conf.get(
|
||||
YarnConfiguration.NM_GPU_ALLOWED_DEVICES,
|
||||
YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);
|
||||
|
||||
if (allowedDevicesStr.equals(
|
||||
YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES)) {
|
||||
if (IsAutoDiscoveryEnabled()) {
|
||||
return parseGpuDevicesFromAutoDiscoveredGpuInfo();
|
||||
} else {
|
||||
return parseGpuDevicesFromUserDefinedValues(allowedDevicesStr);
|
||||
if (gpuDevicesFromUser == null) {
|
||||
gpuDevicesFromUser = parseGpuDevicesFromUserDefinedValues();
|
||||
}
|
||||
return gpuDevicesFromUser;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -193,16 +201,16 @@ public class GpuDiscoverer {
|
|||
}
|
||||
|
||||
/**
|
||||
* @param devices allowed devices coming from the config.
|
||||
* Individual devices should be separated by commas.
|
||||
* <br>The format of individual devices should be:
|
||||
* <index:><minorNumber>
|
||||
* @return List of GpuDevices
|
||||
* @throws YarnException when a GPU device is defined as a duplicate.
|
||||
* The first duplicate GPU device will be added to the exception message.
|
||||
*/
|
||||
private List<GpuDevice> parseGpuDevicesFromUserDefinedValues(String devices)
|
||||
private List<GpuDevice> parseGpuDevicesFromUserDefinedValues()
|
||||
throws YarnException {
|
||||
String devices = conf.get(
|
||||
YarnConfiguration.NM_GPU_ALLOWED_DEVICES,
|
||||
YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);
|
||||
|
||||
if (devices.trim().isEmpty()) {
|
||||
throw GpuDeviceSpecificationException.createWithEmptyValueSpecified();
|
||||
}
|
||||
|
@ -244,19 +252,21 @@ public class GpuDiscoverer {
|
|||
public synchronized void initialize(Configuration config)
|
||||
throws YarnException {
|
||||
this.conf = config;
|
||||
numOfErrorExecutionSinceLastSucceed = 0;
|
||||
lookUpAutoDiscoveryBinary(config);
|
||||
if (IsAutoDiscoveryEnabled()) {
|
||||
numOfErrorExecutionSinceLastSucceed = 0;
|
||||
lookUpAutoDiscoveryBinary(config);
|
||||
|
||||
// Try to discover GPU information once and print
|
||||
try {
|
||||
LOG.info("Trying to discover GPU information ...");
|
||||
GpuDeviceInformation info = getGpuDeviceInformation();
|
||||
LOG.info("Discovered GPU information: " + info.toString());
|
||||
} catch (YarnException e) {
|
||||
String msg =
|
||||
"Failed to discover GPU information from system, exception message:"
|
||||
+ e.getMessage() + " continue...";
|
||||
LOG.warn(msg);
|
||||
// Try to discover GPU information once and print
|
||||
try {
|
||||
LOG.info("Trying to discover GPU information ...");
|
||||
GpuDeviceInformation info = getGpuDeviceInformation();
|
||||
LOG.info("Discovered GPU information: " + info.toString());
|
||||
} catch (YarnException e) {
|
||||
String msg =
|
||||
"Failed to discover GPU information from system, exception message:"
|
||||
+ e.getMessage() + " continue...";
|
||||
LOG.warn(msg);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -40,6 +40,7 @@ import java.util.List;
|
|||
import java.util.function.Consumer;
|
||||
|
||||
import static org.apache.hadoop.test.PlatformAssumptions.assumeNotWindows;
|
||||
import static org.apache.hadoop.yarn.conf.YarnConfiguration.NM_GPU_ALLOWED_DEVICES;
|
||||
import static org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer.DEFAULT_BINARY_NAME;
|
||||
import static org.hamcrest.CoreMatchers.containsString;
|
||||
import static org.hamcrest.CoreMatchers.not;
|
||||
|
@ -49,6 +50,9 @@ import static org.junit.Assert.assertNull;
|
|||
import static org.junit.Assert.assertThat;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
import static org.junit.Assert.fail;
|
||||
import static org.mockito.Mockito.never;
|
||||
import static org.mockito.Mockito.spy;
|
||||
import static org.mockito.Mockito.verify;
|
||||
|
||||
public class TestGpuDiscoverer {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(
|
||||
|
@ -96,7 +100,7 @@ public class TestGpuDiscoverer {
|
|||
|
||||
private Configuration createConfigWithAllowedDevices(String s) {
|
||||
Configuration conf = new Configuration(false);
|
||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, s);
|
||||
conf.set(NM_GPU_ALLOWED_DEVICES, s);
|
||||
setupFakeBinary(conf);
|
||||
return conf;
|
||||
}
|
||||
|
@ -495,4 +499,17 @@ public class TestGpuDiscoverer {
|
|||
"executable in the default directories:"));
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Verifies that getGpuDeviceInformation() is never invoked on the discoverer
 * when NM_GPU_ALLOWED_DEVICES is set to an explicit device list, across both
 * initialize(conf) and getGpusUsableByYarn().
 *
 * @throws YarnException declared because initialize and getGpusUsableByYarn
 *         may throw it; the test does not expect it.
 */
@Test
|
||||
public void testScriptNotCalled() throws YarnException {
|
||||
Configuration conf = new Configuration();
|
||||
// Hand-specified devices; presumably two entries in index:minorNumber form
// rather than the auto-discovery value - confirm against YarnConfiguration.
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:1,2:3");
|
||||

||||
// Spy on a GpuDiscoverer so calls to its methods can be verified below.
GpuDiscoverer gpuSpy = spy(GpuDiscoverer.class);
|
||||

||||
gpuSpy.initialize(conf);
|
||||
gpuSpy.getGpusUsableByYarn();
|
||||

||||
// Fails if getGpuDeviceInformation() was called at any point on the spy.
verify(gpuSpy, never()).getGpuDeviceInformation();
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue