YARN-9118. Handle exceptions with parsing user defined GPU devices in GpuDiscoverer. Contributed by Szilard Nemeth.

This commit is contained in:
Sunil G 2019-02-22 20:22:17 +05:30
parent 9636fe4114
commit 95fbbfed75
6 changed files with 366 additions and 91 deletions

View File

@ -67,11 +67,10 @@ public List<PrivilegedOperation> bootstrap(Configuration configuration)
throws ResourceHandlerException {
List<GpuDevice> usableGpus;
try {
usableGpus = GpuDiscoverer.getInstance()
.getGpusUsableByYarn();
usableGpus = GpuDiscoverer.getInstance().getGpusUsableByYarn();
if (usableGpus == null || usableGpus.isEmpty()) {
String message = "GPU is enabled on the NodeManager, but couldn't find "
+ "any usable GPU devices, please double check configuration.";
+ "any usable GPU devices, please double check configuration!";
LOG.error(message);
throw new ResourceHandlerException(message);
}

View File

@ -0,0 +1,82 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException;
/**
 * Signals that the allowed GPU device specification
 * ({@link YarnConfiguration#NM_GPU_ALLOWED_DEVICES}) is empty, malformed
 * or contains a duplicate entry.
 * Instances can only be obtained through the static factory methods, each of
 * which builds the message for one specific kind of configuration mistake.
 */
public final class GpuDeviceSpecificationException extends YarnException {
  private static final String VALID_FORMAT_MESSAGE =
      "The valid format " + "should be: index:minor_number";

  private GpuDeviceSpecificationException(String message) {
    super(message);
  }

  private GpuDeviceSpecificationException(String message, Exception cause) {
    super(message, cause);
  }

  /**
   * Creates the exception for an empty configuration value.
   *
   * @return exception whose message names the property and the valid format
   */
  public static GpuDeviceSpecificationException createWithEmptyValueSpecified() {
    final String message = YarnConfiguration.NM_GPU_ALLOWED_DEVICES
        + " is set to an empty value! Please specify "
        + YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES
        + " to enable auto-discovery or "
        + "please enter the GPU device IDs manually! "
        + VALID_FORMAT_MESSAGE;
    return new GpuDeviceSpecificationException(message);
  }

  /**
   * Creates the exception for a device entry that has an illegal format,
   * keeping the underlying parsing failure as the cause.
   *
   * @param device the offending individual device entry
   * @param configValue the whole configured value
   * @param cause the exception that revealed the problem
   * @return exception describing the illegal entry
   */
  public static GpuDeviceSpecificationException createWithWrongValueSpecified(
      String device, String configValue, Exception cause) {
    return new GpuDeviceSpecificationException(
        createIllegalFormatMessage(device, configValue), cause);
  }

  /**
   * Creates the exception for a device entry that has an illegal format.
   *
   * @param device the offending individual device entry
   * @param configValue the whole configured value
   * @return exception describing the illegal entry
   */
  public static GpuDeviceSpecificationException createWithWrongValueSpecified(
      String device, String configValue) {
    return new GpuDeviceSpecificationException(
        createIllegalFormatMessage(device, configValue));
  }

  /**
   * Creates the exception for a device entry that is defined more than once.
   *
   * @param device the duplicated individual device entry
   * @param configValue the whole configured value
   * @return exception describing the duplicate entry
   */
  public static GpuDeviceSpecificationException createWithDuplicateValueSpecified(
      String device, String configValue) {
    return new GpuDeviceSpecificationException(
        createDuplicateFormatMessage(device, configValue));
  }

  // Message for an entry that is not in index:minor_number format.
  private static String createIllegalFormatMessage(String device,
      String configValue) {
    final String problem = String.format(
        "Illegal format of individual GPU device: %s, "
            + "the whole config value was: '%s'! ",
        device, configValue);
    return problem + VALID_FORMAT_MESSAGE;
  }

  // Message for an entry that occurs more than once in the config value.
  private static String createDuplicateFormatMessage(String device,
      String configValue) {
    return "GPU device " + device
        + " has a duplicate definition! "
        + "Please double-check the configuration "
        + YarnConfiguration.NM_GPU_ALLOWED_DEVICES
        + "! Current value of the configuration is: " + configValue;
  }
}

View File

@ -19,8 +19,8 @@
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Lists;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
@ -47,7 +47,7 @@ public class GpuDiscoverer {
public static final Logger LOG = LoggerFactory.getLogger(
GpuDiscoverer.class);
@VisibleForTesting
protected static final String DEFAULT_BINARY_NAME = "nvidia-smi";
static final String DEFAULT_BINARY_NAME = "nvidia-smi";
// When executable path not set, try to search default dirs
// By default search /usr/bin, /bin, and /usr/local/nvidia/bin (when
@ -70,7 +70,7 @@ public class GpuDiscoverer {
private GpuDeviceInformationParser parser = new GpuDeviceInformationParser();
private int numOfErrorExecutionSinceLastSucceed = 0;
GpuDeviceInformation lastDiscoveredGpuInformation = null;
private GpuDeviceInformation lastDiscoveredGpuInformation = null;
private void validateConfOrThrowException() throws YarnException {
if (conf == null) {
@ -89,7 +89,7 @@ private void validateConfOrThrowException() throws YarnException {
* @return GpuDeviceInformation
* @throws YarnException when any error happens
*/
public synchronized GpuDeviceInformation getGpuDeviceInformation()
synchronized GpuDeviceInformation getGpuDeviceInformation()
throws YarnException {
validateConfOrThrowException();
@ -112,10 +112,9 @@ public synchronized GpuDeviceInformation getGpuDeviceInformation()
try {
output = Shell.execCommand(environment,
new String[] { pathOfGpuBinary, "-x", "-q" }, MAX_EXEC_TIMEOUT_MS);
GpuDeviceInformation info = parser.parseXml(output);
lastDiscoveredGpuInformation = parser.parseXml(output);
numOfErrorExecutionSinceLastSucceed = 0;
lastDiscoveredGpuInformation = info;
return info;
return lastDiscoveredGpuInformation;
} catch (IOException e) {
numOfErrorExecutionSinceLastSucceed++;
String msg =
@ -149,52 +148,91 @@ public synchronized List<GpuDevice> getGpusUsableByYarn()
YarnConfiguration.NM_GPU_ALLOWED_DEVICES,
YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);
List<GpuDevice> gpuDevices = new ArrayList<>();
if (allowedDevicesStr.equals(
YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES)) {
// Get gpu device information from system.
if (null == lastDiscoveredGpuInformation) {
String msg = YarnConfiguration.NM_GPU_ALLOWED_DEVICES + " is set to "
+ YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES
+ ", however automatically discovering "
+ "GPU information failed, please check NodeManager log for more"
+ " details, as an alternative, admin can specify "
+ YarnConfiguration.NM_GPU_ALLOWED_DEVICES
+ " manually to enable GPU isolation.";
LOG.error(msg);
throw new YarnException(msg);
}
if (lastDiscoveredGpuInformation.getGpus() != null) {
for (int i = 0; i < lastDiscoveredGpuInformation.getGpus().size();
i++) {
List<PerGpuDeviceInformation> gpuInfos =
lastDiscoveredGpuInformation.getGpus();
gpuDevices.add(new GpuDevice(i, gpuInfos.get(i).getMinorNumber()));
}
}
} else{
for (String s : allowedDevicesStr.split(",")) {
if (s.trim().length() > 0) {
String[] kv = s.trim().split(":");
if (kv.length != 2) {
throw new YarnException(
"Illegal format, it should be index:minor_number format, now it="
+ s);
}
gpuDevices.add(
new GpuDevice(Integer.parseInt(kv[0]), Integer.parseInt(kv[1])));
}
}
LOG.info("Allowed GPU devices:" + gpuDevices);
return parseGpuDevicesFromAutoDiscoveredGpuInfo();
} else {
return parseGpuDevicesFromUserDefinedValues(allowedDevicesStr);
}
}
/**
 * Builds the list of usable GPU devices from the most recently discovered
 * GPU information. The position of a GPU in the discovered list is used as
 * its index, paired with the minor number reported for it.
 *
 * @return one GpuDevice per discovered GPU; empty when the discovered
 *         information holds no GPU list
 * @throws YarnException when no GPU information has been discovered so far
 */
private List<GpuDevice> parseGpuDevicesFromAutoDiscoveredGpuInfo()
    throws YarnException {
  if (lastDiscoveredGpuInformation == null) {
    String msg = YarnConfiguration.NM_GPU_ALLOWED_DEVICES + " is set to "
        + YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES
        + ", however automatically discovering "
        + "GPU information failed, please check NodeManager log for more"
        + " details, as an alternative, admin can specify "
        + YarnConfiguration.NM_GPU_ALLOWED_DEVICES
        + " manually to enable GPU isolation.";
    LOG.error(msg);
    throw new YarnException(msg);
  }

  List<GpuDevice> result = new ArrayList<>();
  List<PerGpuDeviceInformation> discoveredGpus =
      lastDiscoveredGpuInformation.getGpus();
  if (discoveredGpus == null) {
    return result;
  }

  LOG.debug("Found {} GPU devices", discoveredGpus.size());
  int index = 0;
  for (PerGpuDeviceInformation gpuInfo : discoveredGpus) {
    result.add(new GpuDevice(index, gpuInfo.getMinorNumber()));
    index++;
  }
  return result;
}
/**
 * Parses the user-defined list of allowed GPU devices.
 * Entries that are blank after trimming (for example produced by a stray
 * comma) are skipped.
 *
 * @param devices allowed devices coming from the config.
 *          Individual devices should be separated by commas.
 *          <br>The format of individual devices should be:
 *          &lt;index&gt;:&lt;minorNumber&gt;
 * @return List of GpuDevices, in the order they were specified
 * @throws YarnException (a {@link GpuDeviceSpecificationException}) when the
 *           value is blank, when a device is not in the
 *           &lt;index&gt;:&lt;minorNumber&gt; format, or when a GPU device
 *           is defined as a duplicate. The first offending GPU device will
 *           be added to the exception message.
 */
private List<GpuDevice> parseGpuDevicesFromUserDefinedValues(String devices)
    throws YarnException {
  if (devices.trim().isEmpty()) {
    throw GpuDeviceSpecificationException.createWithEmptyValueSpecified();
  }

  List<GpuDevice> gpuDevices = Lists.newArrayList();
  for (String device : devices.split(",")) {
    String trimmedDevice = device.trim();
    if (trimmedDevice.isEmpty()) {
      continue;
    }

    // A negative limit keeps trailing empty strings, so a malformed entry
    // such as "0:0:" yields three parts and is rejected instead of being
    // silently accepted as "0:0".
    String[] splitByColon = trimmedDevice.split(":", -1);
    if (splitByColon.length != 2) {
      throw GpuDeviceSpecificationException.
          createWithWrongValueSpecified(device, devices);
    }

    GpuDevice gpuDevice = parseGpuDevice(device, splitByColon, devices);
    if (gpuDevices.contains(gpuDevice)) {
      throw GpuDeviceSpecificationException
          .createWithDuplicateValueSpecified(device, devices);
    }
    gpuDevices.add(gpuDevice);
  }
  LOG.info("Allowed GPU devices:" + gpuDevices);

  return gpuDevices;
}
public synchronized void initialize(Configuration conf) throws YarnException {
/**
 * Converts the two colon-separated parts of a device definition into a
 * GpuDevice.
 *
 * @param device the individual device definition, used in the error message
 * @param splitByColon the parts of the definition; element 0 is parsed as
 *          the index and element 1 as the minor number
 * @param allowedDevicesStr the whole configured value, used in the error
 *          message
 * @return the parsed GpuDevice
 * @throws YarnException (a GpuDeviceSpecificationException carrying the
 *           NumberFormatException as its cause) when one of the parts is
 *           not a valid integer
 */
private GpuDevice parseGpuDevice(String device, String[] splitByColon,
    String allowedDevicesStr) throws YarnException {
  try {
    return new GpuDevice(
        Integer.parseInt(splitByColon[0]),
        Integer.parseInt(splitByColon[1]));
  } catch (NumberFormatException invalidNumber) {
    throw GpuDeviceSpecificationException.createWithWrongValueSpecified(
        device, allowedDevicesStr, invalidNumber);
  }
}
public synchronized void initialize(Configuration conf) {
this.conf = conf;
numOfErrorExecutionSinceLastSucceed = 0;
String pathToExecutable = conf.get(YarnConfiguration.NM_GPU_PATH_TO_EXEC,
@ -203,9 +241,7 @@ public synchronized void initialize(Configuration conf) throws YarnException {
pathToExecutable = DEFAULT_BINARY_NAME;
}
// Validate file existence
File binaryPath = new File(pathToExecutable);
if (!binaryPath.exists()) {
// When binary not exist, use default setting.
boolean found = false;
@ -249,12 +285,12 @@ public synchronized void initialize(Configuration conf) throws YarnException {
}
@VisibleForTesting
protected Map<String, String> getEnvironmentToRunCommand() {
Map<String, String> getEnvironmentToRunCommand() {
return environment;
}
@VisibleForTesting
protected String getPathOfGpuBinary() {
String getPathOfGpuBinary() {
return pathOfGpuBinary;
}

View File

@ -40,11 +40,11 @@ public class GpuNodeResourceUpdateHandler extends NodeResourceUpdaterPlugin {
public void updateConfiguredResource(Resource res) throws YarnException {
LOG.info("Initializing configured GPU resources for the NodeManager.");
List<GpuDevice> usableGpus =
GpuDiscoverer.getInstance().getGpusUsableByYarn();
if (null == usableGpus || usableGpus.isEmpty()) {
String message = "GPU is enabled, but couldn't find any usable GPUs on the "
+ "NodeManager.";
List<GpuDevice> usableGpus = GpuDiscoverer.getInstance()
.getGpusUsableByYarn();
if (usableGpus == null || usableGpus.isEmpty()) {
String message = "GPU is enabled, " +
"but couldn't find any usable GPUs on the NodeManager!";
LOG.error(message);
// No gpu can be used by YARN.
throw new YarnException(message);

View File

@ -0,0 +1,20 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Package for GPU support classes.
*/
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;

View File

@ -23,17 +23,26 @@
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
import org.junit.Assert;
import org.junit.Assume;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.ExpectedException;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.List;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
public class TestGpuDiscoverer {
@Rule
public ExpectedException exception = ExpectedException.none();
private String getTestParentFolder() {
File f = new File("target/temp/" + TestGpuDiscoverer.class.getName());
return f.getAbsolutePath();
@ -51,6 +60,12 @@ public void before() throws IOException {
f.mkdirs();
}
/**
 * Creates a configuration (constructed with {@code new Configuration(false)})
 * whose only explicitly set property is the allowed GPU devices value.
 *
 * @param s the value to set for YarnConfiguration.NM_GPU_ALLOWED_DEVICES
 * @return the prepared configuration
 */
private Configuration createConfigWithAllowedDevices(String s) {
  final Configuration configuration = new Configuration(false);
  configuration.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, s);
  return configuration;
}
@Test
public void testLinuxGpuResourceDiscoverPluginConfig() throws Exception {
// Only run this on demand.
@ -61,10 +76,10 @@ public void testLinuxGpuResourceDiscoverPluginConfig() throws Exception {
Configuration conf = new Configuration(false);
GpuDiscoverer plugin = new GpuDiscoverer();
plugin.initialize(conf);
Assert.assertEquals(GpuDiscoverer.DEFAULT_BINARY_NAME,
assertEquals(GpuDiscoverer.DEFAULT_BINARY_NAME,
plugin.getPathOfGpuBinary());
Assert.assertNotNull(plugin.getEnvironmentToRunCommand().get("PATH"));
Assert.assertTrue(
assertNotNull(plugin.getEnvironmentToRunCommand().get("PATH"));
assertTrue(
plugin.getEnvironmentToRunCommand().get("PATH").contains("nvidia"));
// test case 2, check mandatory set path.
@ -74,18 +89,18 @@ public void testLinuxGpuResourceDiscoverPluginConfig() throws Exception {
conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, getTestParentFolder());
plugin = new GpuDiscoverer();
plugin.initialize(conf);
Assert.assertEquals(fakeBinary.getAbsolutePath(),
assertEquals(fakeBinary.getAbsolutePath(),
plugin.getPathOfGpuBinary());
Assert.assertNull(plugin.getEnvironmentToRunCommand().get("PATH"));
assertNull(plugin.getEnvironmentToRunCommand().get("PATH"));
// test case 3, check mandatory set path, but binary doesn't exist so default
// path will be used.
fakeBinary.delete();
plugin = new GpuDiscoverer();
plugin.initialize(conf);
Assert.assertEquals(GpuDiscoverer.DEFAULT_BINARY_NAME,
assertEquals(GpuDiscoverer.DEFAULT_BINARY_NAME,
plugin.getPathOfGpuBinary());
Assert.assertTrue(
assertTrue(
plugin.getEnvironmentToRunCommand().get("PATH").contains("nvidia"));
}
@ -100,42 +115,165 @@ public void testGpuDiscover() throws YarnException {
plugin.initialize(conf);
GpuDeviceInformation info = plugin.getGpuDeviceInformation();
Assert.assertTrue(info.getGpus().size() > 0);
Assert.assertEquals(plugin.getGpusUsableByYarn().size(),
assertTrue(info.getGpus().size() > 0);
assertEquals(plugin.getGpusUsableByYarn().size(),
info.getGpus().size());
}
@Test
public void getNumberOfUsableGpusFromConfig() throws YarnException {
Configuration conf = new Configuration(false);
public void testGetNumberOfUsableGpusFromConfigSingleDevice()
throws YarnException {
Configuration conf = createConfigWithAllowedDevices("1:2");
// Illegal format
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:2,3");
GpuDiscoverer plugin = new GpuDiscoverer();
try {
plugin.initialize(conf);
plugin.getGpusUsableByYarn();
Assert.fail("Illegal format, should fail.");
} catch (YarnException e) {
// Expected
}
plugin.initialize(conf);
List<GpuDevice> usableGpuDevices = plugin.getGpusUsableByYarn();
assertEquals(1, usableGpuDevices.size());
// Valid format
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:2,3:4");
plugin = new GpuDiscoverer();
assertEquals(1, usableGpuDevices.get(0).getIndex());
assertEquals(2, usableGpuDevices.get(0).getMinorNumber());
}
@Test
public void testGetNumberOfUsableGpusFromConfigIllegalFormat()
    throws YarnException {
  // The last entry "3" has no ":minor_number" part.
  final GpuDiscoverer discoverer = new GpuDiscoverer();
  discoverer.initialize(createConfigWithAllowedDevices("0:0,1:1,2:2,3"));

  exception.expect(GpuDeviceSpecificationException.class);
  discoverer.getGpusUsableByYarn();
}
@Test
public void testGetNumberOfUsableGpusFromConfig() throws YarnException {
Configuration conf = createConfigWithAllowedDevices("0:0,1:1,2:2,3:4");
GpuDiscoverer plugin = new GpuDiscoverer();
plugin.initialize(conf);
List<GpuDevice> usableGpuDevices = plugin.getGpusUsableByYarn();
Assert.assertEquals(4, usableGpuDevices.size());
assertEquals(4, usableGpuDevices.size());
Assert.assertTrue(0 == usableGpuDevices.get(0).getIndex());
Assert.assertTrue(1 == usableGpuDevices.get(1).getIndex());
Assert.assertTrue(2 == usableGpuDevices.get(2).getIndex());
Assert.assertTrue(3 == usableGpuDevices.get(3).getIndex());
assertEquals(0, usableGpuDevices.get(0).getIndex());
assertEquals(0, usableGpuDevices.get(0).getMinorNumber());
Assert.assertTrue(0 == usableGpuDevices.get(0).getMinorNumber());
Assert.assertTrue(1 == usableGpuDevices.get(1).getMinorNumber());
Assert.assertTrue(2 == usableGpuDevices.get(2).getMinorNumber());
Assert.assertTrue(4 == usableGpuDevices.get(3).getMinorNumber());
assertEquals(1, usableGpuDevices.get(1).getIndex());
assertEquals(1, usableGpuDevices.get(1).getMinorNumber());
assertEquals(2, usableGpuDevices.get(2).getIndex());
assertEquals(2, usableGpuDevices.get(2).getMinorNumber());
assertEquals(3, usableGpuDevices.get(3).getIndex());
assertEquals(4, usableGpuDevices.get(3).getMinorNumber());
}
@Test
public void testGetNumberOfUsableGpusFromConfigDuplicateValues()
    throws YarnException {
  // The entry "1:1" is specified twice.
  final GpuDiscoverer discoverer = new GpuDiscoverer();
  discoverer.initialize(createConfigWithAllowedDevices("0:0,1:1,2:2,1:1"));

  exception.expect(GpuDeviceSpecificationException.class);
  discoverer.getGpusUsableByYarn();
}
@Test
public void testGetNumberOfUsableGpusFromConfigDuplicateValues2()
    throws YarnException {
  // Both "1:1" and "2:2" are repeated; parsing stops at the first duplicate.
  final GpuDiscoverer discoverer = new GpuDiscoverer();
  discoverer.initialize(
      createConfigWithAllowedDevices("0:0,1:1,2:2,1:1,2:2"));

  exception.expect(GpuDeviceSpecificationException.class);
  discoverer.getGpusUsableByYarn();
}
@Test
public void testGetNumberOfUsableGpusFromConfigIncludingSpaces()
    throws YarnException {
  // Only the whole entry is trimmed, so the parts around the colon keep
  // their spaces ("0 " and " 0") and cannot be parsed as integers.
  final GpuDiscoverer discoverer = new GpuDiscoverer();
  discoverer.initialize(createConfigWithAllowedDevices("0 : 0,1 : 1"));

  exception.expect(GpuDeviceSpecificationException.class);
  discoverer.getGpusUsableByYarn();
}
@Test
public void testGetNumberOfUsableGpusFromConfigIncludingGibberish()
    throws YarnException {
  // The minor number "@$1" of the first entry is not an integer.
  final GpuDiscoverer discoverer = new GpuDiscoverer();
  discoverer.initialize(createConfigWithAllowedDevices("0:@$1,1:1"));

  exception.expect(GpuDeviceSpecificationException.class);
  discoverer.getGpusUsableByYarn();
}
@Test
public void testGetNumberOfUsableGpusFromConfigIncludingLetters()
    throws YarnException {
  // The index "x" of the first entry is a letter, not an integer.
  final GpuDiscoverer discoverer = new GpuDiscoverer();
  discoverer.initialize(createConfigWithAllowedDevices("x:0, 1:y"));

  exception.expect(GpuDeviceSpecificationException.class);
  discoverer.getGpusUsableByYarn();
}
@Test
public void testGetNumberOfUsableGpusFromConfigWithoutIndexNumber()
    throws YarnException {
  // Each entry starts with the colon, so the index part is an empty string.
  final GpuDiscoverer discoverer = new GpuDiscoverer();
  discoverer.initialize(createConfigWithAllowedDevices(":0, :1"));

  exception.expect(GpuDeviceSpecificationException.class);
  discoverer.getGpusUsableByYarn();
}
@Test
public void testGetNumberOfUsableGpusFromConfigEmptyString()
    throws YarnException {
  // An empty value is neither the auto-discovery setting nor a device list.
  final GpuDiscoverer discoverer = new GpuDiscoverer();
  discoverer.initialize(createConfigWithAllowedDevices(""));

  exception.expect(GpuDeviceSpecificationException.class);
  discoverer.getGpusUsableByYarn();
}
@Test
public void testGetNumberOfUsableGpusFromConfigValueWithoutComma()
    throws YarnException {
  // Without a comma the value is a single entry containing two colons.
  final GpuDiscoverer discoverer = new GpuDiscoverer();
  discoverer.initialize(createConfigWithAllowedDevices("0:0 0:1"));

  exception.expect(GpuDeviceSpecificationException.class);
  discoverer.getGpusUsableByYarn();
}
@Test
public void testGetNumberOfUsableGpusFromConfigValueWithoutComma2()
    throws YarnException {
  // The value is a single entry that contains no colon at all.
  final GpuDiscoverer discoverer = new GpuDiscoverer();
  discoverer.initialize(createConfigWithAllowedDevices("0.1 0.2"));

  exception.expect(GpuDeviceSpecificationException.class);
  discoverer.getGpusUsableByYarn();
}
@Test
public void testGetNumberOfUsableGpusFromConfigValueWithoutColonSeparator()
    throws YarnException {
  // The entries use a dot instead of the colon separator.
  final GpuDiscoverer discoverer = new GpuDiscoverer();
  discoverer.initialize(createConfigWithAllowedDevices("0.1,0.2"));

  exception.expect(GpuDeviceSpecificationException.class);
  discoverer.getGpusUsableByYarn();
}
}