YARN-9337. GPU auto-discovery script runs even when the resource is given by hand. Contributed by Adam Antal
(cherry picked from commit 61b0c2bb7c)
This commit is contained in:
parent
43c89d1e2b
commit
531e0c0bc1
|
@ -69,6 +69,8 @@ public class GpuDiscoverer {
|
||||||
private int numOfErrorExecutionSinceLastSucceed = 0;
|
private int numOfErrorExecutionSinceLastSucceed = 0;
|
||||||
private GpuDeviceInformation lastDiscoveredGpuInformation = null;
|
private GpuDeviceInformation lastDiscoveredGpuInformation = null;
|
||||||
|
|
||||||
|
private List<GpuDevice> gpuDevicesFromUser;
|
||||||
|
|
||||||
private void validateConfOrThrowException() throws YarnException {
|
private void validateConfOrThrowException() throws YarnException {
|
||||||
if (conf == null) {
|
if (conf == null) {
|
||||||
throw new YarnException("Please initialize (call initialize) before use "
|
throw new YarnException("Please initialize (call initialize) before use "
|
||||||
|
@ -143,6 +145,14 @@ public class GpuDiscoverer {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private boolean IsAutoDiscoveryEnabled() {
|
||||||
|
String allowedDevicesStr = conf.get(
|
||||||
|
YarnConfiguration.NM_GPU_ALLOWED_DEVICES,
|
||||||
|
YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);
|
||||||
|
return allowedDevicesStr.equals(
|
||||||
|
YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get list of GPU devices usable by YARN.
|
* Get list of GPU devices usable by YARN.
|
||||||
*
|
*
|
||||||
|
@ -153,15 +163,13 @@ public class GpuDiscoverer {
|
||||||
throws YarnException {
|
throws YarnException {
|
||||||
validateConfOrThrowException();
|
validateConfOrThrowException();
|
||||||
|
|
||||||
String allowedDevicesStr = conf.get(
|
if (IsAutoDiscoveryEnabled()) {
|
||||||
YarnConfiguration.NM_GPU_ALLOWED_DEVICES,
|
|
||||||
YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);
|
|
||||||
|
|
||||||
if (allowedDevicesStr.equals(
|
|
||||||
YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES)) {
|
|
||||||
return parseGpuDevicesFromAutoDiscoveredGpuInfo();
|
return parseGpuDevicesFromAutoDiscoveredGpuInfo();
|
||||||
} else {
|
} else {
|
||||||
return parseGpuDevicesFromUserDefinedValues(allowedDevicesStr);
|
if (gpuDevicesFromUser == null) {
|
||||||
|
gpuDevicesFromUser = parseGpuDevicesFromUserDefinedValues();
|
||||||
|
}
|
||||||
|
return gpuDevicesFromUser;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -193,16 +201,16 @@ public class GpuDiscoverer {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param devices allowed devices coming from the config.
|
|
||||||
* Individual devices should be separated by commas.
|
|
||||||
* <br>The format of individual devices should be:
|
|
||||||
* <index:><minorNumber>
|
|
||||||
* @return List of GpuDevices
|
* @return List of GpuDevices
|
||||||
* @throws YarnException when a GPU device is defined as a duplicate.
|
* @throws YarnException when a GPU device is defined as a duplicate.
|
||||||
* The first duplicate GPU device will be added to the exception message.
|
* The first duplicate GPU device will be added to the exception message.
|
||||||
*/
|
*/
|
||||||
private List<GpuDevice> parseGpuDevicesFromUserDefinedValues(String devices)
|
private List<GpuDevice> parseGpuDevicesFromUserDefinedValues()
|
||||||
throws YarnException {
|
throws YarnException {
|
||||||
|
String devices = conf.get(
|
||||||
|
YarnConfiguration.NM_GPU_ALLOWED_DEVICES,
|
||||||
|
YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);
|
||||||
|
|
||||||
if (devices.trim().isEmpty()) {
|
if (devices.trim().isEmpty()) {
|
||||||
throw GpuDeviceSpecificationException.createWithEmptyValueSpecified();
|
throw GpuDeviceSpecificationException.createWithEmptyValueSpecified();
|
||||||
}
|
}
|
||||||
|
@ -244,19 +252,21 @@ public class GpuDiscoverer {
|
||||||
public synchronized void initialize(Configuration config)
|
public synchronized void initialize(Configuration config)
|
||||||
throws YarnException {
|
throws YarnException {
|
||||||
this.conf = config;
|
this.conf = config;
|
||||||
numOfErrorExecutionSinceLastSucceed = 0;
|
if (IsAutoDiscoveryEnabled()) {
|
||||||
lookUpAutoDiscoveryBinary(config);
|
numOfErrorExecutionSinceLastSucceed = 0;
|
||||||
|
lookUpAutoDiscoveryBinary(config);
|
||||||
|
|
||||||
// Try to discover GPU information once and print
|
// Try to discover GPU information once and print
|
||||||
try {
|
try {
|
||||||
LOG.info("Trying to discover GPU information ...");
|
LOG.info("Trying to discover GPU information ...");
|
||||||
GpuDeviceInformation info = getGpuDeviceInformation();
|
GpuDeviceInformation info = getGpuDeviceInformation();
|
||||||
LOG.info("Discovered GPU information: " + info.toString());
|
LOG.info("Discovered GPU information: " + info.toString());
|
||||||
} catch (YarnException e) {
|
} catch (YarnException e) {
|
||||||
String msg =
|
String msg =
|
||||||
"Failed to discover GPU information from system, exception message:"
|
"Failed to discover GPU information from system, exception message:"
|
||||||
+ e.getMessage() + " continue...";
|
+ e.getMessage() + " continue...";
|
||||||
LOG.warn(msg);
|
LOG.warn(msg);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -40,6 +40,7 @@ import java.util.List;
|
||||||
import java.util.function.Consumer;
|
import java.util.function.Consumer;
|
||||||
|
|
||||||
import static org.apache.hadoop.test.PlatformAssumptions.assumeNotWindows;
|
import static org.apache.hadoop.test.PlatformAssumptions.assumeNotWindows;
|
||||||
|
import static org.apache.hadoop.yarn.conf.YarnConfiguration.NM_GPU_ALLOWED_DEVICES;
|
||||||
import static org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer.DEFAULT_BINARY_NAME;
|
import static org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer.DEFAULT_BINARY_NAME;
|
||||||
import static org.hamcrest.CoreMatchers.containsString;
|
import static org.hamcrest.CoreMatchers.containsString;
|
||||||
import static org.hamcrest.CoreMatchers.not;
|
import static org.hamcrest.CoreMatchers.not;
|
||||||
|
@ -49,6 +50,9 @@ import static org.junit.Assert.assertNull;
|
||||||
import static org.junit.Assert.assertThat;
|
import static org.junit.Assert.assertThat;
|
||||||
import static org.junit.Assert.assertTrue;
|
import static org.junit.Assert.assertTrue;
|
||||||
import static org.junit.Assert.fail;
|
import static org.junit.Assert.fail;
|
||||||
|
import static org.mockito.Mockito.never;
|
||||||
|
import static org.mockito.Mockito.spy;
|
||||||
|
import static org.mockito.Mockito.verify;
|
||||||
|
|
||||||
public class TestGpuDiscoverer {
|
public class TestGpuDiscoverer {
|
||||||
private static final Logger LOG = LoggerFactory.getLogger(
|
private static final Logger LOG = LoggerFactory.getLogger(
|
||||||
|
@ -96,7 +100,7 @@ public class TestGpuDiscoverer {
|
||||||
|
|
||||||
private Configuration createConfigWithAllowedDevices(String s) {
|
private Configuration createConfigWithAllowedDevices(String s) {
|
||||||
Configuration conf = new Configuration(false);
|
Configuration conf = new Configuration(false);
|
||||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, s);
|
conf.set(NM_GPU_ALLOWED_DEVICES, s);
|
||||||
setupFakeBinary(conf);
|
setupFakeBinary(conf);
|
||||||
return conf;
|
return conf;
|
||||||
}
|
}
|
||||||
|
@ -495,4 +499,17 @@ public class TestGpuDiscoverer {
|
||||||
"executable in the default directories:"));
|
"executable in the default directories:"));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testScriptNotCalled() throws YarnException {
|
||||||
|
Configuration conf = new Configuration();
|
||||||
|
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:1,2:3");
|
||||||
|
|
||||||
|
GpuDiscoverer gpuSpy = spy(GpuDiscoverer.class);
|
||||||
|
|
||||||
|
gpuSpy.initialize(conf);
|
||||||
|
gpuSpy.getGpusUsableByYarn();
|
||||||
|
|
||||||
|
verify(gpuSpy, never()).getGpuDeviceInformation();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue