YARN-9118. Handle exceptions with parsing user defined GPU devices in GpuDiscoverer. Contributed by Szilard Nemeth.
This commit is contained in:
parent
9636fe4114
commit
95fbbfed75
|
@ -67,11 +67,10 @@ public class GpuResourceHandlerImpl implements ResourceHandler {
|
||||||
throws ResourceHandlerException {
|
throws ResourceHandlerException {
|
||||||
List<GpuDevice> usableGpus;
|
List<GpuDevice> usableGpus;
|
||||||
try {
|
try {
|
||||||
usableGpus = GpuDiscoverer.getInstance()
|
usableGpus = GpuDiscoverer.getInstance().getGpusUsableByYarn();
|
||||||
.getGpusUsableByYarn();
|
|
||||||
if (usableGpus == null || usableGpus.isEmpty()) {
|
if (usableGpus == null || usableGpus.isEmpty()) {
|
||||||
String message = "GPU is enabled on the NodeManager, but couldn't find "
|
String message = "GPU is enabled on the NodeManager, but couldn't find "
|
||||||
+ "any usable GPU devices, please double check configuration.";
|
+ "any usable GPU devices, please double check configuration!";
|
||||||
LOG.error(message);
|
LOG.error(message);
|
||||||
throw new ResourceHandlerException(message);
|
throw new ResourceHandlerException(message);
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,82 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
|
||||||
|
|
||||||
|
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||||
|
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This exception is to be thrown when allowed GPU device specification
|
||||||
|
* is empty or invalid.
|
||||||
|
*/
|
||||||
|
public final class GpuDeviceSpecificationException extends YarnException {
|
||||||
|
private static final String VALID_FORMAT_MESSAGE = "The valid format " +
|
||||||
|
"should be: index:minor_number";
|
||||||
|
|
||||||
|
private GpuDeviceSpecificationException(String message) {
|
||||||
|
super(message);
|
||||||
|
}
|
||||||
|
|
||||||
|
private GpuDeviceSpecificationException(String message, Exception cause) {
|
||||||
|
super(message, cause);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static GpuDeviceSpecificationException createWithEmptyValueSpecified() {
|
||||||
|
return new GpuDeviceSpecificationException(
|
||||||
|
YarnConfiguration.NM_GPU_ALLOWED_DEVICES +
|
||||||
|
" is set to an empty value! Please specify " +
|
||||||
|
YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES +
|
||||||
|
" to enable auto-discovery or " +
|
||||||
|
"please enter the GPU device IDs manually! " +
|
||||||
|
VALID_FORMAT_MESSAGE);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static GpuDeviceSpecificationException createWithWrongValueSpecified(
|
||||||
|
String device, String configValue, Exception cause) {
|
||||||
|
final String message = createIllegalFormatMessage(device, configValue);
|
||||||
|
return new GpuDeviceSpecificationException(message, cause);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static GpuDeviceSpecificationException createWithWrongValueSpecified(
|
||||||
|
String device, String configValue) {
|
||||||
|
final String message = createIllegalFormatMessage(device, configValue);
|
||||||
|
return new GpuDeviceSpecificationException(message);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static GpuDeviceSpecificationException createWithDuplicateValueSpecified(
|
||||||
|
String device, String configValue) {
|
||||||
|
final String message = createDuplicateFormatMessage(device, configValue);
|
||||||
|
return new GpuDeviceSpecificationException(message);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String createIllegalFormatMessage(String device,
|
||||||
|
String configValue) {
|
||||||
|
return String.format("Illegal format of individual GPU device: %s, " +
|
||||||
|
"the whole config value was: '%s'! " + VALID_FORMAT_MESSAGE,
|
||||||
|
device, configValue);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String createDuplicateFormatMessage(String device,
|
||||||
|
String configValue) {
|
||||||
|
return String.format("GPU device %s" +
|
||||||
|
" has a duplicate definition! " +
|
||||||
|
"Please double-check the configuration " +
|
||||||
|
YarnConfiguration.NM_GPU_ALLOWED_DEVICES +
|
||||||
|
"! Current value of the configuration is: %s",
|
||||||
|
device, configValue);
|
||||||
|
}
|
||||||
|
}
|
|
@ -19,8 +19,8 @@
|
||||||
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
|
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
|
||||||
|
|
||||||
import com.google.common.annotations.VisibleForTesting;
|
import com.google.common.annotations.VisibleForTesting;
|
||||||
import com.google.common.collect.ImmutableMap;
|
|
||||||
import com.google.common.collect.ImmutableSet;
|
import com.google.common.collect.ImmutableSet;
|
||||||
|
import com.google.common.collect.Lists;
|
||||||
import org.apache.hadoop.classification.InterfaceAudience;
|
import org.apache.hadoop.classification.InterfaceAudience;
|
||||||
import org.apache.hadoop.classification.InterfaceStability;
|
import org.apache.hadoop.classification.InterfaceStability;
|
||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
@ -47,7 +47,7 @@ public class GpuDiscoverer {
|
||||||
public static final Logger LOG = LoggerFactory.getLogger(
|
public static final Logger LOG = LoggerFactory.getLogger(
|
||||||
GpuDiscoverer.class);
|
GpuDiscoverer.class);
|
||||||
@VisibleForTesting
|
@VisibleForTesting
|
||||||
protected static final String DEFAULT_BINARY_NAME = "nvidia-smi";
|
static final String DEFAULT_BINARY_NAME = "nvidia-smi";
|
||||||
|
|
||||||
// When executable path not set, try to search default dirs
|
// When executable path not set, try to search default dirs
|
||||||
// By default search /usr/bin, /bin, and /usr/local/nvidia/bin (when
|
// By default search /usr/bin, /bin, and /usr/local/nvidia/bin (when
|
||||||
|
@ -70,7 +70,7 @@ public class GpuDiscoverer {
|
||||||
private GpuDeviceInformationParser parser = new GpuDeviceInformationParser();
|
private GpuDeviceInformationParser parser = new GpuDeviceInformationParser();
|
||||||
|
|
||||||
private int numOfErrorExecutionSinceLastSucceed = 0;
|
private int numOfErrorExecutionSinceLastSucceed = 0;
|
||||||
GpuDeviceInformation lastDiscoveredGpuInformation = null;
|
private GpuDeviceInformation lastDiscoveredGpuInformation = null;
|
||||||
|
|
||||||
private void validateConfOrThrowException() throws YarnException {
|
private void validateConfOrThrowException() throws YarnException {
|
||||||
if (conf == null) {
|
if (conf == null) {
|
||||||
|
@ -89,7 +89,7 @@ public class GpuDiscoverer {
|
||||||
* @return GpuDeviceInformation
|
* @return GpuDeviceInformation
|
||||||
* @throws YarnException when any error happens
|
* @throws YarnException when any error happens
|
||||||
*/
|
*/
|
||||||
public synchronized GpuDeviceInformation getGpuDeviceInformation()
|
synchronized GpuDeviceInformation getGpuDeviceInformation()
|
||||||
throws YarnException {
|
throws YarnException {
|
||||||
validateConfOrThrowException();
|
validateConfOrThrowException();
|
||||||
|
|
||||||
|
@ -112,10 +112,9 @@ public class GpuDiscoverer {
|
||||||
try {
|
try {
|
||||||
output = Shell.execCommand(environment,
|
output = Shell.execCommand(environment,
|
||||||
new String[] { pathOfGpuBinary, "-x", "-q" }, MAX_EXEC_TIMEOUT_MS);
|
new String[] { pathOfGpuBinary, "-x", "-q" }, MAX_EXEC_TIMEOUT_MS);
|
||||||
GpuDeviceInformation info = parser.parseXml(output);
|
lastDiscoveredGpuInformation = parser.parseXml(output);
|
||||||
numOfErrorExecutionSinceLastSucceed = 0;
|
numOfErrorExecutionSinceLastSucceed = 0;
|
||||||
lastDiscoveredGpuInformation = info;
|
return lastDiscoveredGpuInformation;
|
||||||
return info;
|
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
numOfErrorExecutionSinceLastSucceed++;
|
numOfErrorExecutionSinceLastSucceed++;
|
||||||
String msg =
|
String msg =
|
||||||
|
@ -149,52 +148,91 @@ public class GpuDiscoverer {
|
||||||
YarnConfiguration.NM_GPU_ALLOWED_DEVICES,
|
YarnConfiguration.NM_GPU_ALLOWED_DEVICES,
|
||||||
YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);
|
YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);
|
||||||
|
|
||||||
List<GpuDevice> gpuDevices = new ArrayList<>();
|
|
||||||
|
|
||||||
if (allowedDevicesStr.equals(
|
if (allowedDevicesStr.equals(
|
||||||
YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES)) {
|
YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES)) {
|
||||||
// Get gpu device information from system.
|
return parseGpuDevicesFromAutoDiscoveredGpuInfo();
|
||||||
if (null == lastDiscoveredGpuInformation) {
|
} else {
|
||||||
String msg = YarnConfiguration.NM_GPU_ALLOWED_DEVICES + " is set to "
|
return parseGpuDevicesFromUserDefinedValues(allowedDevicesStr);
|
||||||
+ YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES
|
|
||||||
+ ", however automatically discovering "
|
|
||||||
+ "GPU information failed, please check NodeManager log for more"
|
|
||||||
+ " details, as an alternative, admin can specify "
|
|
||||||
+ YarnConfiguration.NM_GPU_ALLOWED_DEVICES
|
|
||||||
+ " manually to enable GPU isolation.";
|
|
||||||
LOG.error(msg);
|
|
||||||
throw new YarnException(msg);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (lastDiscoveredGpuInformation.getGpus() != null) {
|
|
||||||
for (int i = 0; i < lastDiscoveredGpuInformation.getGpus().size();
|
|
||||||
i++) {
|
|
||||||
List<PerGpuDeviceInformation> gpuInfos =
|
|
||||||
lastDiscoveredGpuInformation.getGpus();
|
|
||||||
gpuDevices.add(new GpuDevice(i, gpuInfos.get(i).getMinorNumber()));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else{
|
|
||||||
for (String s : allowedDevicesStr.split(",")) {
|
|
||||||
if (s.trim().length() > 0) {
|
|
||||||
String[] kv = s.trim().split(":");
|
|
||||||
if (kv.length != 2) {
|
|
||||||
throw new YarnException(
|
|
||||||
"Illegal format, it should be index:minor_number format, now it="
|
|
||||||
+ s);
|
|
||||||
}
|
|
||||||
|
|
||||||
gpuDevices.add(
|
|
||||||
new GpuDevice(Integer.parseInt(kv[0]), Integer.parseInt(kv[1])));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
LOG.info("Allowed GPU devices:" + gpuDevices);
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<GpuDevice> parseGpuDevicesFromAutoDiscoveredGpuInfo()
|
||||||
|
throws YarnException {
|
||||||
|
if (lastDiscoveredGpuInformation == null) {
|
||||||
|
String msg = YarnConfiguration.NM_GPU_ALLOWED_DEVICES + " is set to "
|
||||||
|
+ YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES
|
||||||
|
+ ", however automatically discovering "
|
||||||
|
+ "GPU information failed, please check NodeManager log for more"
|
||||||
|
+ " details, as an alternative, admin can specify "
|
||||||
|
+ YarnConfiguration.NM_GPU_ALLOWED_DEVICES
|
||||||
|
+ " manually to enable GPU isolation.";
|
||||||
|
LOG.error(msg);
|
||||||
|
throw new YarnException(msg);
|
||||||
|
}
|
||||||
|
|
||||||
|
List<GpuDevice> gpuDevices = new ArrayList<>();
|
||||||
|
if (lastDiscoveredGpuInformation.getGpus() != null) {
|
||||||
|
int numberOfGpus = lastDiscoveredGpuInformation.getGpus().size();
|
||||||
|
LOG.debug("Found {} GPU devices", numberOfGpus);
|
||||||
|
for (int i = 0; i < numberOfGpus; i++) {
|
||||||
|
List<PerGpuDeviceInformation> gpuInfos =
|
||||||
|
lastDiscoveredGpuInformation.getGpus();
|
||||||
|
gpuDevices.add(new GpuDevice(i, gpuInfos.get(i).getMinorNumber()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return gpuDevices;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param devices allowed devices coming from the config.
|
||||||
|
* Individual devices should be separated by commas.
|
||||||
|
* <br>The format of individual devices should be:
|
||||||
|
* <index:><minorNumber>
|
||||||
|
* @return List of GpuDevices
|
||||||
|
* @throws YarnException when a GPU device is defined as a duplicate.
|
||||||
|
* The first duplicate GPU device will be added to the exception message.
|
||||||
|
*/
|
||||||
|
private List<GpuDevice> parseGpuDevicesFromUserDefinedValues(String devices)
|
||||||
|
throws YarnException {
|
||||||
|
if (devices.trim().isEmpty()) {
|
||||||
|
throw GpuDeviceSpecificationException.createWithEmptyValueSpecified();
|
||||||
|
}
|
||||||
|
List<GpuDevice> gpuDevices = Lists.newArrayList();
|
||||||
|
for (String device : devices.split(",")) {
|
||||||
|
if (device.trim().length() > 0) {
|
||||||
|
String[] splitByColon = device.trim().split(":");
|
||||||
|
if (splitByColon.length != 2) {
|
||||||
|
throw GpuDeviceSpecificationException.
|
||||||
|
createWithWrongValueSpecified(device, devices);
|
||||||
|
}
|
||||||
|
|
||||||
|
GpuDevice gpuDevice = parseGpuDevice(device, splitByColon, devices);
|
||||||
|
if (!gpuDevices.contains(gpuDevice)) {
|
||||||
|
gpuDevices.add(gpuDevice);
|
||||||
|
} else {
|
||||||
|
throw GpuDeviceSpecificationException
|
||||||
|
.createWithDuplicateValueSpecified(device, devices);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
LOG.info("Allowed GPU devices:" + gpuDevices);
|
||||||
|
|
||||||
return gpuDevices;
|
return gpuDevices;
|
||||||
}
|
}
|
||||||
|
|
||||||
public synchronized void initialize(Configuration conf) throws YarnException {
|
private GpuDevice parseGpuDevice(String device, String[] splitByColon,
|
||||||
|
String allowedDevicesStr) throws YarnException {
|
||||||
|
try {
|
||||||
|
int index = Integer.parseInt(splitByColon[0]);
|
||||||
|
int minorNumber = Integer.parseInt(splitByColon[1]);
|
||||||
|
return new GpuDevice(index, minorNumber);
|
||||||
|
} catch (NumberFormatException e) {
|
||||||
|
throw GpuDeviceSpecificationException.
|
||||||
|
createWithWrongValueSpecified(device, allowedDevicesStr, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public synchronized void initialize(Configuration conf) {
|
||||||
this.conf = conf;
|
this.conf = conf;
|
||||||
numOfErrorExecutionSinceLastSucceed = 0;
|
numOfErrorExecutionSinceLastSucceed = 0;
|
||||||
String pathToExecutable = conf.get(YarnConfiguration.NM_GPU_PATH_TO_EXEC,
|
String pathToExecutable = conf.get(YarnConfiguration.NM_GPU_PATH_TO_EXEC,
|
||||||
|
@ -203,9 +241,7 @@ public class GpuDiscoverer {
|
||||||
pathToExecutable = DEFAULT_BINARY_NAME;
|
pathToExecutable = DEFAULT_BINARY_NAME;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Validate file existence
|
|
||||||
File binaryPath = new File(pathToExecutable);
|
File binaryPath = new File(pathToExecutable);
|
||||||
|
|
||||||
if (!binaryPath.exists()) {
|
if (!binaryPath.exists()) {
|
||||||
// When binary not exist, use default setting.
|
// When binary not exist, use default setting.
|
||||||
boolean found = false;
|
boolean found = false;
|
||||||
|
@ -249,12 +285,12 @@ public class GpuDiscoverer {
|
||||||
}
|
}
|
||||||
|
|
||||||
@VisibleForTesting
|
@VisibleForTesting
|
||||||
protected Map<String, String> getEnvironmentToRunCommand() {
|
Map<String, String> getEnvironmentToRunCommand() {
|
||||||
return environment;
|
return environment;
|
||||||
}
|
}
|
||||||
|
|
||||||
@VisibleForTesting
|
@VisibleForTesting
|
||||||
protected String getPathOfGpuBinary() {
|
String getPathOfGpuBinary() {
|
||||||
return pathOfGpuBinary;
|
return pathOfGpuBinary;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -40,11 +40,11 @@ public class GpuNodeResourceUpdateHandler extends NodeResourceUpdaterPlugin {
|
||||||
public void updateConfiguredResource(Resource res) throws YarnException {
|
public void updateConfiguredResource(Resource res) throws YarnException {
|
||||||
LOG.info("Initializing configured GPU resources for the NodeManager.");
|
LOG.info("Initializing configured GPU resources for the NodeManager.");
|
||||||
|
|
||||||
List<GpuDevice> usableGpus =
|
List<GpuDevice> usableGpus = GpuDiscoverer.getInstance()
|
||||||
GpuDiscoverer.getInstance().getGpusUsableByYarn();
|
.getGpusUsableByYarn();
|
||||||
if (null == usableGpus || usableGpus.isEmpty()) {
|
if (usableGpus == null || usableGpus.isEmpty()) {
|
||||||
String message = "GPU is enabled, but couldn't find any usable GPUs on the "
|
String message = "GPU is enabled, " +
|
||||||
+ "NodeManager.";
|
"but couldn't find any usable GPUs on the NodeManager!";
|
||||||
LOG.error(message);
|
LOG.error(message);
|
||||||
// No gpu can be used by YARN.
|
// No gpu can be used by YARN.
|
||||||
throw new YarnException(message);
|
throw new YarnException(message);
|
||||||
|
|
|
@ -0,0 +1,20 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Package for GPU support classes.
|
||||||
|
*/
|
||||||
|
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
|
|
@ -23,17 +23,26 @@ import org.apache.hadoop.conf.Configuration;
|
||||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
|
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
|
||||||
import org.junit.Assert;
|
|
||||||
import org.junit.Assume;
|
import org.junit.Assume;
|
||||||
import org.junit.Before;
|
import org.junit.Before;
|
||||||
|
import org.junit.Rule;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
import org.junit.rules.ExpectedException;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileOutputStream;
|
import java.io.FileOutputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
import static org.junit.Assert.assertEquals;
|
||||||
|
import static org.junit.Assert.assertNotNull;
|
||||||
|
import static org.junit.Assert.assertNull;
|
||||||
|
import static org.junit.Assert.assertTrue;
|
||||||
|
|
||||||
public class TestGpuDiscoverer {
|
public class TestGpuDiscoverer {
|
||||||
|
@Rule
|
||||||
|
public ExpectedException exception = ExpectedException.none();
|
||||||
|
|
||||||
private String getTestParentFolder() {
|
private String getTestParentFolder() {
|
||||||
File f = new File("target/temp/" + TestGpuDiscoverer.class.getName());
|
File f = new File("target/temp/" + TestGpuDiscoverer.class.getName());
|
||||||
return f.getAbsolutePath();
|
return f.getAbsolutePath();
|
||||||
|
@ -51,6 +60,12 @@ public class TestGpuDiscoverer {
|
||||||
f.mkdirs();
|
f.mkdirs();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private Configuration createConfigWithAllowedDevices(String s) {
|
||||||
|
Configuration conf = new Configuration(false);
|
||||||
|
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, s);
|
||||||
|
return conf;
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testLinuxGpuResourceDiscoverPluginConfig() throws Exception {
|
public void testLinuxGpuResourceDiscoverPluginConfig() throws Exception {
|
||||||
// Only run this on demand.
|
// Only run this on demand.
|
||||||
|
@ -61,10 +76,10 @@ public class TestGpuDiscoverer {
|
||||||
Configuration conf = new Configuration(false);
|
Configuration conf = new Configuration(false);
|
||||||
GpuDiscoverer plugin = new GpuDiscoverer();
|
GpuDiscoverer plugin = new GpuDiscoverer();
|
||||||
plugin.initialize(conf);
|
plugin.initialize(conf);
|
||||||
Assert.assertEquals(GpuDiscoverer.DEFAULT_BINARY_NAME,
|
assertEquals(GpuDiscoverer.DEFAULT_BINARY_NAME,
|
||||||
plugin.getPathOfGpuBinary());
|
plugin.getPathOfGpuBinary());
|
||||||
Assert.assertNotNull(plugin.getEnvironmentToRunCommand().get("PATH"));
|
assertNotNull(plugin.getEnvironmentToRunCommand().get("PATH"));
|
||||||
Assert.assertTrue(
|
assertTrue(
|
||||||
plugin.getEnvironmentToRunCommand().get("PATH").contains("nvidia"));
|
plugin.getEnvironmentToRunCommand().get("PATH").contains("nvidia"));
|
||||||
|
|
||||||
// test case 2, check mandatory set path.
|
// test case 2, check mandatory set path.
|
||||||
|
@ -74,18 +89,18 @@ public class TestGpuDiscoverer {
|
||||||
conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, getTestParentFolder());
|
conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, getTestParentFolder());
|
||||||
plugin = new GpuDiscoverer();
|
plugin = new GpuDiscoverer();
|
||||||
plugin.initialize(conf);
|
plugin.initialize(conf);
|
||||||
Assert.assertEquals(fakeBinary.getAbsolutePath(),
|
assertEquals(fakeBinary.getAbsolutePath(),
|
||||||
plugin.getPathOfGpuBinary());
|
plugin.getPathOfGpuBinary());
|
||||||
Assert.assertNull(plugin.getEnvironmentToRunCommand().get("PATH"));
|
assertNull(plugin.getEnvironmentToRunCommand().get("PATH"));
|
||||||
|
|
||||||
// test case 3, check mandatory set path, but binary doesn't exist so default
|
// test case 3, check mandatory set path, but binary doesn't exist so default
|
||||||
// path will be used.
|
// path will be used.
|
||||||
fakeBinary.delete();
|
fakeBinary.delete();
|
||||||
plugin = new GpuDiscoverer();
|
plugin = new GpuDiscoverer();
|
||||||
plugin.initialize(conf);
|
plugin.initialize(conf);
|
||||||
Assert.assertEquals(GpuDiscoverer.DEFAULT_BINARY_NAME,
|
assertEquals(GpuDiscoverer.DEFAULT_BINARY_NAME,
|
||||||
plugin.getPathOfGpuBinary());
|
plugin.getPathOfGpuBinary());
|
||||||
Assert.assertTrue(
|
assertTrue(
|
||||||
plugin.getEnvironmentToRunCommand().get("PATH").contains("nvidia"));
|
plugin.getEnvironmentToRunCommand().get("PATH").contains("nvidia"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -100,42 +115,165 @@ public class TestGpuDiscoverer {
|
||||||
plugin.initialize(conf);
|
plugin.initialize(conf);
|
||||||
GpuDeviceInformation info = plugin.getGpuDeviceInformation();
|
GpuDeviceInformation info = plugin.getGpuDeviceInformation();
|
||||||
|
|
||||||
Assert.assertTrue(info.getGpus().size() > 0);
|
assertTrue(info.getGpus().size() > 0);
|
||||||
Assert.assertEquals(plugin.getGpusUsableByYarn().size(),
|
assertEquals(plugin.getGpusUsableByYarn().size(),
|
||||||
info.getGpus().size());
|
info.getGpus().size());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void getNumberOfUsableGpusFromConfig() throws YarnException {
|
public void testGetNumberOfUsableGpusFromConfigSingleDevice()
|
||||||
Configuration conf = new Configuration(false);
|
throws YarnException {
|
||||||
|
Configuration conf = createConfigWithAllowedDevices("1:2");
|
||||||
|
|
||||||
// Illegal format
|
|
||||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:2,3");
|
|
||||||
GpuDiscoverer plugin = new GpuDiscoverer();
|
GpuDiscoverer plugin = new GpuDiscoverer();
|
||||||
try {
|
plugin.initialize(conf);
|
||||||
plugin.initialize(conf);
|
List<GpuDevice> usableGpuDevices = plugin.getGpusUsableByYarn();
|
||||||
plugin.getGpusUsableByYarn();
|
assertEquals(1, usableGpuDevices.size());
|
||||||
Assert.fail("Illegal format, should fail.");
|
|
||||||
} catch (YarnException e) {
|
|
||||||
// Expected
|
|
||||||
}
|
|
||||||
|
|
||||||
// Valid format
|
assertEquals(1, usableGpuDevices.get(0).getIndex());
|
||||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:2,3:4");
|
assertEquals(2, usableGpuDevices.get(0).getMinorNumber());
|
||||||
plugin = new GpuDiscoverer();
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGetNumberOfUsableGpusFromConfigIllegalFormat()
|
||||||
|
throws YarnException {
|
||||||
|
Configuration conf = createConfigWithAllowedDevices("0:0,1:1,2:2,3");
|
||||||
|
|
||||||
|
exception.expect(GpuDeviceSpecificationException.class);
|
||||||
|
GpuDiscoverer plugin = new GpuDiscoverer();
|
||||||
|
plugin.initialize(conf);
|
||||||
|
plugin.getGpusUsableByYarn();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGetNumberOfUsableGpusFromConfig() throws YarnException {
|
||||||
|
Configuration conf = createConfigWithAllowedDevices("0:0,1:1,2:2,3:4");
|
||||||
|
GpuDiscoverer plugin = new GpuDiscoverer();
|
||||||
plugin.initialize(conf);
|
plugin.initialize(conf);
|
||||||
|
|
||||||
List<GpuDevice> usableGpuDevices = plugin.getGpusUsableByYarn();
|
List<GpuDevice> usableGpuDevices = plugin.getGpusUsableByYarn();
|
||||||
Assert.assertEquals(4, usableGpuDevices.size());
|
assertEquals(4, usableGpuDevices.size());
|
||||||
|
|
||||||
Assert.assertTrue(0 == usableGpuDevices.get(0).getIndex());
|
assertEquals(0, usableGpuDevices.get(0).getIndex());
|
||||||
Assert.assertTrue(1 == usableGpuDevices.get(1).getIndex());
|
assertEquals(0, usableGpuDevices.get(0).getMinorNumber());
|
||||||
Assert.assertTrue(2 == usableGpuDevices.get(2).getIndex());
|
|
||||||
Assert.assertTrue(3 == usableGpuDevices.get(3).getIndex());
|
|
||||||
|
|
||||||
Assert.assertTrue(0 == usableGpuDevices.get(0).getMinorNumber());
|
assertEquals(1, usableGpuDevices.get(1).getIndex());
|
||||||
Assert.assertTrue(1 == usableGpuDevices.get(1).getMinorNumber());
|
assertEquals(1, usableGpuDevices.get(1).getMinorNumber());
|
||||||
Assert.assertTrue(2 == usableGpuDevices.get(2).getMinorNumber());
|
|
||||||
Assert.assertTrue(4 == usableGpuDevices.get(3).getMinorNumber());
|
assertEquals(2, usableGpuDevices.get(2).getIndex());
|
||||||
|
assertEquals(2, usableGpuDevices.get(2).getMinorNumber());
|
||||||
|
|
||||||
|
assertEquals(3, usableGpuDevices.get(3).getIndex());
|
||||||
|
assertEquals(4, usableGpuDevices.get(3).getMinorNumber());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGetNumberOfUsableGpusFromConfigDuplicateValues()
|
||||||
|
throws YarnException {
|
||||||
|
Configuration conf = createConfigWithAllowedDevices("0:0,1:1,2:2,1:1");
|
||||||
|
|
||||||
|
exception.expect(GpuDeviceSpecificationException.class);
|
||||||
|
GpuDiscoverer plugin = new GpuDiscoverer();
|
||||||
|
plugin.initialize(conf);
|
||||||
|
plugin.getGpusUsableByYarn();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGetNumberOfUsableGpusFromConfigDuplicateValues2()
|
||||||
|
throws YarnException {
|
||||||
|
Configuration conf = createConfigWithAllowedDevices("0:0,1:1,2:2,1:1,2:2");
|
||||||
|
|
||||||
|
exception.expect(GpuDeviceSpecificationException.class);
|
||||||
|
GpuDiscoverer plugin = new GpuDiscoverer();
|
||||||
|
plugin.initialize(conf);
|
||||||
|
plugin.getGpusUsableByYarn();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGetNumberOfUsableGpusFromConfigIncludingSpaces()
|
||||||
|
throws YarnException {
|
||||||
|
Configuration conf = createConfigWithAllowedDevices("0 : 0,1 : 1");
|
||||||
|
|
||||||
|
exception.expect(GpuDeviceSpecificationException.class);
|
||||||
|
GpuDiscoverer plugin = new GpuDiscoverer();
|
||||||
|
plugin.initialize(conf);
|
||||||
|
plugin.getGpusUsableByYarn();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGetNumberOfUsableGpusFromConfigIncludingGibberish()
|
||||||
|
throws YarnException {
|
||||||
|
Configuration conf = createConfigWithAllowedDevices("0:@$1,1:1");
|
||||||
|
|
||||||
|
exception.expect(GpuDeviceSpecificationException.class);
|
||||||
|
GpuDiscoverer plugin = new GpuDiscoverer();
|
||||||
|
plugin.initialize(conf);
|
||||||
|
plugin.getGpusUsableByYarn();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGetNumberOfUsableGpusFromConfigIncludingLetters()
|
||||||
|
throws YarnException {
|
||||||
|
Configuration conf = createConfigWithAllowedDevices("x:0, 1:y");
|
||||||
|
|
||||||
|
exception.expect(GpuDeviceSpecificationException.class);
|
||||||
|
GpuDiscoverer plugin = new GpuDiscoverer();
|
||||||
|
plugin.initialize(conf);
|
||||||
|
plugin.getGpusUsableByYarn();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGetNumberOfUsableGpusFromConfigWithoutIndexNumber()
|
||||||
|
throws YarnException {
|
||||||
|
Configuration conf = createConfigWithAllowedDevices(":0, :1");
|
||||||
|
|
||||||
|
exception.expect(GpuDeviceSpecificationException.class);
|
||||||
|
GpuDiscoverer plugin = new GpuDiscoverer();
|
||||||
|
plugin.initialize(conf);
|
||||||
|
plugin.getGpusUsableByYarn();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGetNumberOfUsableGpusFromConfigEmptyString()
|
||||||
|
throws YarnException {
|
||||||
|
Configuration conf = createConfigWithAllowedDevices("");
|
||||||
|
|
||||||
|
exception.expect(GpuDeviceSpecificationException.class);
|
||||||
|
GpuDiscoverer plugin = new GpuDiscoverer();
|
||||||
|
plugin.initialize(conf);
|
||||||
|
plugin.getGpusUsableByYarn();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGetNumberOfUsableGpusFromConfigValueWithoutComma()
|
||||||
|
throws YarnException {
|
||||||
|
Configuration conf = createConfigWithAllowedDevices("0:0 0:1");
|
||||||
|
|
||||||
|
exception.expect(GpuDeviceSpecificationException.class);
|
||||||
|
GpuDiscoverer plugin = new GpuDiscoverer();
|
||||||
|
plugin.initialize(conf);
|
||||||
|
plugin.getGpusUsableByYarn();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGetNumberOfUsableGpusFromConfigValueWithoutComma2()
|
||||||
|
throws YarnException {
|
||||||
|
Configuration conf = createConfigWithAllowedDevices("0.1 0.2");
|
||||||
|
|
||||||
|
exception.expect(GpuDeviceSpecificationException.class);
|
||||||
|
GpuDiscoverer plugin = new GpuDiscoverer();
|
||||||
|
plugin.initialize(conf);
|
||||||
|
plugin.getGpusUsableByYarn();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGetNumberOfUsableGpusFromConfigValueWithoutColonSeparator()
|
||||||
|
throws YarnException {
|
||||||
|
Configuration conf = createConfigWithAllowedDevices("0.1,0.2");
|
||||||
|
|
||||||
|
exception.expect(GpuDeviceSpecificationException.class);
|
||||||
|
GpuDiscoverer plugin = new GpuDiscoverer();
|
||||||
|
plugin.initialize(conf);
|
||||||
|
plugin.getGpusUsableByYarn();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue