YARN-9217. Nodemanager will fail to start if GPU is misconfigured on the node or GPU drivers missing. Contributed by Peter Bacsko

This commit is contained in:
Szilard Nemeth 2019-08-21 16:49:34 +02:00
parent 69255fa1b9
commit 6980f1740f
11 changed files with 278 additions and 97 deletions

View File

@ -1612,6 +1612,20 @@ public class YarnConfiguration extends Configuration {
public static final String NM_RESOURCE_PLUGINS = public static final String NM_RESOURCE_PLUGINS =
NM_PREFIX + "resource-plugins"; NM_PREFIX + "resource-plugins";
/**
* Controls whether resource plugin errors are fatal (fail-fast). If set to
* "true", an exception is thrown when a certain device (GPU, FPGA, etc) is
* missing from the system or an error occurred during discovery. If set to
* "false", that exception is not re-thrown.
*/
@Private
public static final String NM_RESOURCE_PLUGINS_FAIL_FAST =
NM_RESOURCE_PLUGINS + ".fail-fast";
/**
* Default for {@link #NM_RESOURCE_PLUGINS_FAIL_FAST}: fail-fast is enabled.
*/
@Private
public static final boolean DEFAULT_NM_RESOURCE_PLUGINS_FAIL_FAST = true;
/** /**
* Prefix for gpu configurations. Work in progress: This configuration * Prefix for gpu configurations. Work in progress: This configuration
* parameter may be changed/removed in the future. * parameter may be changed/removed in the future.

View File

@ -3798,6 +3798,17 @@
<value></value> <value></value>
</property> </property>
<property>
<description>
Specifies whether the Node Manager should fail fast during initialization
when a certain device (GPU, FPGA, etc) is not found in the system. If set
to "true", then an exception will be thrown if a device is missing or
an error occurred during discovery. If set to "false", the exception is
not re-thrown.
</description>
<name>yarn.nodemanager.resource-plugins.fail-fast</name>
<value></value>
</property>
<property> <property>
<description> <description>
Specify GPU devices which can be managed by YARN NodeManager, split by comma Specify GPU devices which can be managed by YARN NodeManager, split by comma

View File

@ -0,0 +1,42 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources;
import static org.apache.hadoop.yarn.conf.YarnConfiguration.DEFAULT_NM_RESOURCE_PLUGINS_FAIL_FAST;
import static org.apache.hadoop.yarn.conf.YarnConfiguration.NM_RESOURCE_PLUGINS_FAIL_FAST;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.exceptions.YarnException;
/**
 * Utility holder for conditional exception propagation in resource plugins.
 * An exception is only re-thrown when the NM_RESOURCE_PLUGINS_FAIL_FAST
 * property evaluates to true.
 */
public final class ResourcesExceptionUtil {
  /** Static helpers only; never instantiated. */
  private ResourcesExceptionUtil() {}

  /**
   * Re-throws the supplied exception if fail-fast is enabled.
   *
   * @param e the exception to propagate when fail-fast is on
   * @param conf configuration consulted for NM_RESOURCE_PLUGINS_FAIL_FAST
   *             (falls back to DEFAULT_NM_RESOURCE_PLUGINS_FAIL_FAST)
   * @throws YarnException the very same {@code e}, when fail-fast is enabled
   */
  public static void throwIfNecessary(YarnException e, Configuration conf)
      throws YarnException {
    final boolean failFast = conf.getBoolean(
        NM_RESOURCE_PLUGINS_FAIL_FAST,
        DEFAULT_NM_RESOURCE_PLUGINS_FAIL_FAST);
    if (!failFast) {
      // Fail-fast disabled: the caller decides how to carry on.
      return;
    }
    throw e;
  }
}

View File

@ -18,6 +18,12 @@
package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu; package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu;
import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourcesExceptionUtil.throwIfNecessary;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
@ -36,10 +42,6 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDevice; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDevice;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
public class GpuResourceHandlerImpl implements ResourceHandler { public class GpuResourceHandlerImpl implements ResourceHandler {
final static Log LOG = LogFactory final static Log LOG = LogFactory
.getLog(GpuResourceHandlerImpl.class); .getLog(GpuResourceHandlerImpl.class);
@ -75,7 +77,8 @@ public class GpuResourceHandlerImpl implements ResourceHandler {
String message = "GPU is enabled on the NodeManager, but couldn't find " String message = "GPU is enabled on the NodeManager, but couldn't find "
+ "any usable GPU devices, please double check configuration!"; + "any usable GPU devices, please double check configuration!";
LOG.error(message); LOG.error(message);
throw new ResourceHandlerException(message); throwIfNecessary(new ResourceHandlerException(message),
configuration);
} }
} catch (YarnException e) { } catch (YarnException e) {
LOG.error("Exception when trying to get usable GPU device", e); LOG.error("Exception when trying to get usable GPU device", e);

View File

@ -60,7 +60,7 @@ public class ResourcePluginManager {
Map<String, ResourcePlugin> pluginMap = Maps.newHashMap(); Map<String, ResourcePlugin> pluginMap = Maps.newHashMap();
if (plugins != null) { if (plugins != null) {
pluginMap = initializePlugins(context, plugins); pluginMap = initializePlugins(conf, context, plugins);
} }
configuredPlugins = Collections.unmodifiableMap(pluginMap); configuredPlugins = Collections.unmodifiableMap(pluginMap);
@ -77,8 +77,7 @@ public class ResourcePluginManager {
return plugins; return plugins;
} }
private Map<String, ResourcePlugin> initializePlugins(Configuration conf,
private Map<String, ResourcePlugin> initializePlugins(
Context context, String[] plugins) throws YarnException { Context context, String[] plugins) throws YarnException {
Map<String, ResourcePlugin> pluginMap = Maps.newHashMap(); Map<String, ResourcePlugin> pluginMap = Maps.newHashMap();
@ -91,7 +90,7 @@ public class ResourcePluginManager {
if (resourceName.equals(GPU_URI)) { if (resourceName.equals(GPU_URI)) {
final GpuDiscoverer gpuDiscoverer = new GpuDiscoverer(); final GpuDiscoverer gpuDiscoverer = new GpuDiscoverer();
final GpuNodeResourceUpdateHandler updateHandler = final GpuNodeResourceUpdateHandler updateHandler =
new GpuNodeResourceUpdateHandler(gpuDiscoverer); new GpuNodeResourceUpdateHandler(gpuDiscoverer, conf);
plugin = new GpuResourcePlugin(updateHandler, gpuDiscoverer); plugin = new GpuResourcePlugin(updateHandler, gpuDiscoverer);
} else if (resourceName.equals(FPGA_URI)) { } else if (resourceName.equals(FPGA_URI)) {
plugin = new FpgaResourcePlugin(); plugin = new FpgaResourcePlugin();

View File

@ -18,21 +18,7 @@
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu; package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
import com.google.common.annotations.VisibleForTesting; import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourcesExceptionUtil.throwIfNecessary;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.Shell;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformationParser;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.PerGpuDeviceInformation;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
@ -42,6 +28,22 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Set; import java.util.Set;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformationParser;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.PerGpuDeviceInformation;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
@InterfaceAudience.Private @InterfaceAudience.Private
@InterfaceStability.Unstable @InterfaceStability.Unstable
@ -57,11 +59,10 @@ public class GpuDiscoverer {
private static final Set<String> DEFAULT_BINARY_SEARCH_DIRS = ImmutableSet.of( private static final Set<String> DEFAULT_BINARY_SEARCH_DIRS = ImmutableSet.of(
"/usr/bin", "/bin", "/usr/local/nvidia/bin"); "/usr/bin", "/bin", "/usr/local/nvidia/bin");
// command should not run more than 10 sec.
private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000;
private static final int MAX_REPEATED_ERROR_ALLOWED = 10; private static final int MAX_REPEATED_ERROR_ALLOWED = 10;
private Configuration conf = null; private Configuration conf = null;
private NvidiaBinaryHelper nvidiaBinaryHelper;
private String pathOfGpuBinary = null; private String pathOfGpuBinary = null;
private Map<String, String> environment = new HashMap<>(); private Map<String, String> environment = new HashMap<>();
@ -110,24 +111,17 @@ public class GpuDiscoverer {
* @return GpuDeviceInformation * @return GpuDeviceInformation
* @throws YarnException when any error happens * @throws YarnException when any error happens
*/ */
synchronized GpuDeviceInformation getGpuDeviceInformation() public synchronized GpuDeviceInformation getGpuDeviceInformation()
throws YarnException { throws YarnException {
validateConfOrThrowException();
if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) { if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) {
String msg = getErrorMessageOfScriptExecutionThresholdReached(); String msg = getErrorMessageOfScriptExecutionThresholdReached();
LOG.error(msg); LOG.error(msg);
throw new YarnException(msg); throw new YarnException(msg);
} }
String output;
try { try {
output = Shell.execCommand(environment, lastDiscoveredGpuInformation =
new String[] { pathOfGpuBinary, "-x", "-q" }, MAX_EXEC_TIMEOUT_MS); nvidiaBinaryHelper.getGpuDeviceInformation(pathOfGpuBinary);
GpuDeviceInformationParser parser = new GpuDeviceInformationParser();
lastDiscoveredGpuInformation = parser.parseXml(output);
numOfErrorExecutionSinceLastSucceed = 0;
return lastDiscoveredGpuInformation;
} catch (IOException e) { } catch (IOException e) {
numOfErrorExecutionSinceLastSucceed++; numOfErrorExecutionSinceLastSucceed++;
String msg = getErrorMessageOfScriptExecution(e.getMessage()); String msg = getErrorMessageOfScriptExecution(e.getMessage());
@ -138,17 +132,18 @@ public class GpuDiscoverer {
} catch (YarnException e) { } catch (YarnException e) {
numOfErrorExecutionSinceLastSucceed++; numOfErrorExecutionSinceLastSucceed++;
String msg = getFailedToParseErrorMessage(e.getMessage()); String msg = getFailedToParseErrorMessage(e.getMessage());
if (LOG.isDebugEnabled()) { LOG.debug(msg, e);
LOG.warn(msg, e);
}
throw e; throw e;
} }
return lastDiscoveredGpuInformation;
} }
private boolean IsAutoDiscoveryEnabled() { private boolean isAutoDiscoveryEnabled() {
String allowedDevicesStr = conf.get( String allowedDevicesStr = conf.get(
YarnConfiguration.NM_GPU_ALLOWED_DEVICES, YarnConfiguration.NM_GPU_ALLOWED_DEVICES,
YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES); YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);
return allowedDevicesStr.equals( return allowedDevicesStr.equals(
YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES); YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);
} }
@ -157,13 +152,12 @@ public class GpuDiscoverer {
* Get list of GPU devices usable by YARN. * Get list of GPU devices usable by YARN.
* *
* @return List of GPU devices * @return List of GPU devices
* @throws YarnException when any issue happens
*/ */
public synchronized List<GpuDevice> getGpusUsableByYarn() public synchronized List<GpuDevice> getGpusUsableByYarn()
throws YarnException { throws YarnException {
validateConfOrThrowException(); validateConfOrThrowException();
if (IsAutoDiscoveryEnabled()) { if (isAutoDiscoveryEnabled()) {
return parseGpuDevicesFromAutoDiscoveredGpuInfo(); return parseGpuDevicesFromAutoDiscoveredGpuInfo();
} else { } else {
if (gpuDevicesFromUser == null) { if (gpuDevicesFromUser == null) {
@ -219,16 +213,27 @@ public class GpuDiscoverer {
if (device.trim().length() > 0) { if (device.trim().length() > 0) {
String[] splitByColon = device.trim().split(":"); String[] splitByColon = device.trim().split(":");
if (splitByColon.length != 2) { if (splitByColon.length != 2) {
throw GpuDeviceSpecificationException. throwIfNecessary(GpuDeviceSpecificationException
createWithWrongValueSpecified(device, devices); .createWithWrongValueSpecified(device, devices), conf);
LOG.warn("Wrong GPU specification string {}, ignored", device);
}
GpuDevice gpuDevice;
try {
gpuDevice = parseGpuDevice(splitByColon);
} catch (NumberFormatException e) {
throwIfNecessary(GpuDeviceSpecificationException
.createWithWrongValueSpecified(device, devices, e), conf);
LOG.warn("Cannot parse GPU device numbers: {}", device);
continue;
} }
GpuDevice gpuDevice = parseGpuDevice(device, splitByColon, devices);
if (!gpuDevices.contains(gpuDevice)) { if (!gpuDevices.contains(gpuDevice)) {
gpuDevices.add(gpuDevice); gpuDevices.add(gpuDevice);
} else { } else {
throw GpuDeviceSpecificationException throwIfNecessary(GpuDeviceSpecificationException
.createWithDuplicateValueSpecified(device, devices); .createWithDuplicateValueSpecified(device, devices), conf);
// Reached only when fail-fast is off: the duplicated entry is reported.
// The message previously said "CPU", but this code handles GPU devices.
LOG.warn("GPU device is duplicated: {}", device);
} }
} }
} }
@ -237,22 +242,18 @@ public class GpuDiscoverer {
return gpuDevices; return gpuDevices;
} }
private GpuDevice parseGpuDevice(String device, String[] splitByColon, private GpuDevice parseGpuDevice(String[] splitByColon) {
String allowedDevicesStr) throws YarnException { int index = Integer.parseInt(splitByColon[0]);
try { int minorNumber = Integer.parseInt(splitByColon[1]);
int index = Integer.parseInt(splitByColon[0]); return new GpuDevice(index, minorNumber);
int minorNumber = Integer.parseInt(splitByColon[1]);
return new GpuDevice(index, minorNumber);
} catch (NumberFormatException e) {
throw GpuDeviceSpecificationException.
createWithWrongValueSpecified(device, allowedDevicesStr, e);
}
} }
public synchronized void initialize(Configuration config)
throws YarnException { public synchronized void initialize(Configuration config,
NvidiaBinaryHelper nvidiaHelper) throws YarnException {
this.conf = config; this.conf = config;
if (IsAutoDiscoveryEnabled()) { this.nvidiaBinaryHelper = nvidiaHelper;
if (isAutoDiscoveryEnabled()) {
numOfErrorExecutionSinceLastSucceed = 0; numOfErrorExecutionSinceLastSucceed = 0;
lookUpAutoDiscoveryBinary(config); lookUpAutoDiscoveryBinary(config);
@ -286,7 +287,18 @@ public class GpuDiscoverer {
binaryPath = handleConfiguredBinaryPathIsDirectory(configuredBinaryFile); binaryPath = handleConfiguredBinaryPathIsDirectory(configuredBinaryFile);
} else { } else {
binaryPath = configuredBinaryFile; binaryPath = configuredBinaryFile;
// If the path exists but the file name is NOT the expected discovery binary
// name, treat it as a misconfiguration: throw when fail-fast is enabled,
// otherwise only log a warning. (The check used to be inverted, which
// rejected the correctly named binary and accepted any other file.)
String fileName = binaryPath.getName();
if (!DEFAULT_BINARY_NAME.equals(fileName)) {
  String msg = String.format("Please check the configuration value of"
      +" %s. It should point to an %s binary.",
      YarnConfiguration.NM_GPU_PATH_TO_EXEC,
      DEFAULT_BINARY_NAME);
  throwIfNecessary(new YarnException(msg), config);
  LOG.warn(msg);
}
} }
pathOfGpuBinary = binaryPath.getAbsolutePath(); pathOfGpuBinary = binaryPath.getAbsolutePath();
} }

View File

@ -18,6 +18,9 @@
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu; package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourcesExceptionUtil.throwIfNecessary;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.api.records.ResourceInformation; import org.apache.hadoop.yarn.api.records.ResourceInformation;
import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.conf.YarnConfiguration;
@ -36,9 +39,12 @@ public class GpuNodeResourceUpdateHandler extends NodeResourceUpdaterPlugin {
private static final Logger LOG = private static final Logger LOG =
LoggerFactory.getLogger(GpuNodeResourceUpdateHandler.class); LoggerFactory.getLogger(GpuNodeResourceUpdateHandler.class);
private final GpuDiscoverer gpuDiscoverer; private final GpuDiscoverer gpuDiscoverer;
private Configuration conf;
public GpuNodeResourceUpdateHandler(GpuDiscoverer gpuDiscoverer) { public GpuNodeResourceUpdateHandler(GpuDiscoverer gpuDiscoverer,
Configuration conf) {
this.gpuDiscoverer = gpuDiscoverer; this.gpuDiscoverer = gpuDiscoverer;
this.conf = conf;
} }
@Override @Override
@ -51,7 +57,8 @@ public class GpuNodeResourceUpdateHandler extends NodeResourceUpdaterPlugin {
"but could not find any usable GPUs on the NodeManager!"; "but could not find any usable GPUs on the NodeManager!";
LOG.error(message); LOG.error(message);
// No gpu can be used by YARN. // No gpu can be used by YARN.
throw new YarnException(message); throwIfNecessary(new YarnException(message), conf);
return;
} }
long nUsableGpus = usableGpus.size(); long nUsableGpus = usableGpus.size();
@ -59,7 +66,7 @@ public class GpuNodeResourceUpdateHandler extends NodeResourceUpdaterPlugin {
Map<String, ResourceInformation> configuredResourceTypes = Map<String, ResourceInformation> configuredResourceTypes =
ResourceUtils.getResourceTypes(); ResourceUtils.getResourceTypes();
if (!configuredResourceTypes.containsKey(GPU_URI)) { if (!configuredResourceTypes.containsKey(GPU_URI)) {
throw new YarnException("Found " + nUsableGpus + " usable GPUs, however " LOG.warn("Found " + nUsableGpus + " usable GPUs, however "
+ GPU_URI + GPU_URI
+ " resource-type is not configured inside" + " resource-type is not configured inside"
+ " resource-types.xml, please configure it to enable GPU feature or" + " resource-types.xml, please configure it to enable GPU feature or"

View File

@ -18,6 +18,8 @@
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu; package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
import java.util.List;
import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.Context;
@ -32,8 +34,6 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.NMResourceInfo; import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.NMResourceInfo;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation; import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.NMGpuResourceInfo; import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.NMGpuResourceInfo;
import java.util.List;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -44,6 +44,10 @@ public class GpuResourcePlugin implements ResourcePlugin {
private final GpuNodeResourceUpdateHandler resourceDiscoverHandler; private final GpuNodeResourceUpdateHandler resourceDiscoverHandler;
private final GpuDiscoverer gpuDiscoverer; private final GpuDiscoverer gpuDiscoverer;
public static final int MAX_REPEATED_ERROR_ALLOWED = 10;
private int numOfErrorExecutionSinceLastSucceed = 0;
private GpuResourceHandlerImpl gpuResourceHandler = null; private GpuResourceHandlerImpl gpuResourceHandler = null;
private DockerCommandPlugin dockerCommandPlugin = null; private DockerCommandPlugin dockerCommandPlugin = null;
@ -55,7 +59,8 @@ public class GpuResourcePlugin implements ResourcePlugin {
@Override @Override
public void initialize(Context context) throws YarnException { public void initialize(Context context) throws YarnException {
this.gpuDiscoverer.initialize(context.getConf()); this.gpuDiscoverer.initialize(context.getConf(),
new NvidiaBinaryHelper());
this.dockerCommandPlugin = this.dockerCommandPlugin =
GpuDockerCommandPluginFactory.createGpuDockerCommandPlugin( GpuDockerCommandPluginFactory.createGpuDockerCommandPlugin(
context.getConf()); context.getConf());
@ -89,12 +94,21 @@ public class GpuResourcePlugin implements ResourcePlugin {
@Override @Override
public synchronized NMResourceInfo getNMResourceInfo() throws YarnException { public synchronized NMResourceInfo getNMResourceInfo() throws YarnException {
GpuDeviceInformation gpuDeviceInformation = GpuDeviceInformation gpuDeviceInformation;
gpuDiscoverer.getGpuDeviceInformation();
//At this point the gpu plugin is already enabled //At this point the gpu plugin is already enabled
checkGpuResourceHandler(); checkGpuResourceHandler();
checkErrorCount();
try{
gpuDeviceInformation = gpuDiscoverer.getGpuDeviceInformation();
numOfErrorExecutionSinceLastSucceed = 0;
} catch (YarnException e) {
LOG.error(e.getMessage(), e);
numOfErrorExecutionSinceLastSucceed++;
throw e;
}
GpuResourceAllocator gpuResourceAllocator = GpuResourceAllocator gpuResourceAllocator =
gpuResourceHandler.getGpuAllocator(); gpuResourceHandler.getGpuAllocator();
List<GpuDevice> totalGpus = gpuResourceAllocator.getAllowedGpus(); List<GpuDevice> totalGpus = gpuResourceAllocator.getAllowedGpus();
@ -116,6 +130,17 @@ public class GpuResourcePlugin implements ResourcePlugin {
} }
} }
/**
 * Guards against repeatedly running a discovery that keeps failing.
 *
 * @throws YarnException if the number of consecutive failures equals
 *         MAX_REPEATED_ERROR_ALLOWED
 */
private void checkErrorCount() throws YarnException {
  if (numOfErrorExecutionSinceLastSucceed != MAX_REPEATED_ERROR_ALLOWED) {
    // Threshold not reached yet, nothing to report.
    return;
  }

  final String errorMessage =
      "Failed to execute GPU device information detection script for "
          + MAX_REPEATED_ERROR_ALLOWED + " times, skip following executions.";
  LOG.error(errorMessage);
  throw new YarnException(errorMessage);
}
@Override @Override
public String toString() { public String toString() {
return GpuResourcePlugin.class.getName(); return GpuResourcePlugin.class.getName();

View File

@ -0,0 +1,63 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
import java.io.IOException;
import java.util.HashMap;
import org.apache.hadoop.util.Shell;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformationParser;
/**
 * Runs the "nvidia-smi" command and converts its output
 * into a {@link GpuDeviceInformation} object.
 */
public class NvidiaBinaryHelper {
  /**
   * Upper bound for the command execution: 10 seconds.
   */
  private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000;

  /**
   * Executes the given binary with the "-x -q" arguments and an empty
   * environment, then parses what it printed.
   *
   * @param pathOfGpuBinary The path of the binary
   * @return the GpuDeviceInformation parsed from the command output
   * @throws IOException if running the command fails
   * @throws YarnException if the pathOfGpuBinary is null,
   *                       or the output could not be parsed
   */
  synchronized GpuDeviceInformation getGpuDeviceInformation(
      String pathOfGpuBinary) throws IOException, YarnException {
    // The parser is created first, exactly as before, so the order in which
    // failures may surface stays the same.
    final GpuDeviceInformationParser xmlParser =
        new GpuDeviceInformationParser();

    if (pathOfGpuBinary == null) {
      throw new YarnException(
          "Failed to find GPU discovery executable, please double check "
              + YarnConfiguration.NM_GPU_PATH_TO_EXEC + " setting.");
    }

    final HashMap<String, String> emptyEnvironment = new HashMap<>();
    final String[] command = new String[]{pathOfGpuBinary, "-x", "-q"};
    final String commandOutput =
        Shell.execCommand(emptyEnvironment, command, MAX_EXEC_TIMEOUT_MS);

    return xmlParser.parseXml(commandOutput);
  }
}

View File

@ -41,6 +41,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resource
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException; import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDevice; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDevice;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.NvidiaBinaryHelper;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerRuntimeConstants; import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerRuntimeConstants;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService;
@ -116,11 +117,13 @@ public class TestGpuResourceHandler {
@Rule @Rule
public ExpectedException expected = ExpectedException.none(); public ExpectedException expected = ExpectedException.none();
private NvidiaBinaryHelper nvidiaBinaryHelper;
@Before @Before
public void setup() throws IOException { public void setup() throws IOException {
createTestDataDirectory(); createTestDataDirectory();
TestResourceUtils.addNewTypesToResources(ResourceInformation.GPU_URI); TestResourceUtils.addNewTypesToResources(ResourceInformation.GPU_URI);
nvidiaBinaryHelper = new NvidiaBinaryHelper();
mockCGroupsHandler = mock(CGroupsHandler.class); mockCGroupsHandler = mock(CGroupsHandler.class);
mockPrivilegedExecutor = mock(PrivilegedOperationExecutor.class); mockPrivilegedExecutor = mock(PrivilegedOperationExecutor.class);
@ -146,13 +149,14 @@ public class TestGpuResourceHandler {
@After @After
public void cleanupTestFiles() throws IOException { public void cleanupTestFiles() throws IOException {
FileUtils.deleteDirectory(testDataDirectory); FileUtils.deleteDirectory(testDataDirectory);
nvidiaBinaryHelper = new NvidiaBinaryHelper();
} }
@Test @Test
public void testBootstrapWithRealGpuDiscoverer() throws Exception { public void testBootstrapWithRealGpuDiscoverer() throws Exception {
Configuration conf = createDefaultConfig(); Configuration conf = createDefaultConfig();
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0"); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0");
gpuDiscoverer.initialize(conf); gpuDiscoverer.initialize(conf, nvidiaBinaryHelper);
gpuResourceHandler.bootstrap(conf); gpuResourceHandler.bootstrap(conf);
@ -170,7 +174,7 @@ public class TestGpuResourceHandler {
public void testBootstrapWithMockGpuDiscoverer() throws Exception { public void testBootstrapWithMockGpuDiscoverer() throws Exception {
GpuDiscoverer mockDiscoverer = mock(GpuDiscoverer.class); GpuDiscoverer mockDiscoverer = mock(GpuDiscoverer.class);
Configuration conf = new YarnConfiguration(); Configuration conf = new YarnConfiguration();
mockDiscoverer.initialize(conf); mockDiscoverer.initialize(conf, nvidiaBinaryHelper);
expected.expect(ResourceHandlerException.class); expected.expect(ResourceHandlerException.class);
gpuResourceHandler.bootstrap(conf); gpuResourceHandler.bootstrap(conf);
@ -270,7 +274,7 @@ public class TestGpuResourceHandler {
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4"); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
gpuDiscoverer = new GpuDiscoverer(); gpuDiscoverer = new GpuDiscoverer();
gpuDiscoverer.initialize(conf); gpuDiscoverer.initialize(conf, nvidiaBinaryHelper);
Context nmContext = createMockNmContext(conf); Context nmContext = createMockNmContext(conf);
gpuResourceHandler = new GpuResourceHandlerImpl(nmContext, gpuResourceHandler = new GpuResourceHandlerImpl(nmContext,
mockCGroupsHandler, mockPrivilegedExecutor, gpuDiscoverer); mockCGroupsHandler, mockPrivilegedExecutor, gpuDiscoverer);
@ -379,7 +383,7 @@ public class TestGpuResourceHandler {
public void testAllocationWithoutAllowedGpus() throws Exception { public void testAllocationWithoutAllowedGpus() throws Exception {
Configuration conf = createDefaultConfig(); Configuration conf = createDefaultConfig();
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, " "); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, " ");
gpuDiscoverer.initialize(conf); gpuDiscoverer.initialize(conf, nvidiaBinaryHelper);
try { try {
gpuResourceHandler.bootstrap(conf); gpuResourceHandler.bootstrap(conf);
@ -460,7 +464,7 @@ public class TestGpuResourceHandler {
new GpuResourceHandlerImpl(nmnctx, mockCGroupsHandler, new GpuResourceHandlerImpl(nmnctx, mockCGroupsHandler,
mockPrivilegedExecutor, gpuDiscoverer); mockPrivilegedExecutor, gpuDiscoverer);
gpuDiscoverer.initialize(conf); gpuDiscoverer.initialize(conf, nvidiaBinaryHelper);
gpuNULLStateResourceHandler.bootstrap(conf); gpuNULLStateResourceHandler.bootstrap(conf);
verifyNumberOfAvailableGpus(4, gpuNULLStateResourceHandler); verifyNumberOfAvailableGpus(4, gpuNULLStateResourceHandler);

View File

@ -64,6 +64,7 @@ public class TestGpuDiscoverer {
private static final String BASH_SHEBANG = "#!/bin/bash\n\n"; private static final String BASH_SHEBANG = "#!/bin/bash\n\n";
private static final String TEST_PARENT_DIR = new File("target/temp/" + private static final String TEST_PARENT_DIR = new File("target/temp/" +
TestGpuDiscoverer.class.getName()).getAbsolutePath(); TestGpuDiscoverer.class.getName()).getAbsolutePath();
private NvidiaBinaryHelper binaryHelper = new NvidiaBinaryHelper();
@Rule @Rule
public ExpectedException exception = ExpectedException.none(); public ExpectedException exception = ExpectedException.none();
@ -150,7 +151,7 @@ public class TestGpuDiscoverer {
Configuration conf) throws YarnException { Configuration conf) throws YarnException {
conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, TEST_PARENT_DIR); conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, TEST_PARENT_DIR);
GpuDiscoverer discoverer = new GpuDiscoverer(); GpuDiscoverer discoverer = new GpuDiscoverer();
discoverer.initialize(conf); discoverer.initialize(conf, binaryHelper);
return discoverer; return discoverer;
} }
@ -163,14 +164,14 @@ public class TestGpuDiscoverer {
// test case 1, check default setting. // test case 1, check default setting.
Configuration conf = new Configuration(false); Configuration conf = new Configuration(false);
GpuDiscoverer discoverer = new GpuDiscoverer(); GpuDiscoverer discoverer = new GpuDiscoverer();
discoverer.initialize(conf); discoverer.initialize(conf, binaryHelper);
assertEquals(DEFAULT_BINARY_NAME, discoverer.getPathOfGpuBinary()); assertEquals(DEFAULT_BINARY_NAME, discoverer.getPathOfGpuBinary());
assertNvidiaIsOnPath(discoverer); assertNvidiaIsOnPath(discoverer);
// test case 2, check mandatory set path. // test case 2, check mandatory set path.
File fakeBinary = setupFakeBinary(conf); File fakeBinary = setupFakeBinary(conf);
discoverer = new GpuDiscoverer(); discoverer = new GpuDiscoverer();
discoverer.initialize(conf); discoverer.initialize(conf, binaryHelper);
assertEquals(fakeBinary.getAbsolutePath(), assertEquals(fakeBinary.getAbsolutePath(),
discoverer.getPathOfGpuBinary()); discoverer.getPathOfGpuBinary());
assertNull(discoverer.getEnvironmentToRunCommand().get(PATH)); assertNull(discoverer.getEnvironmentToRunCommand().get(PATH));
@ -179,7 +180,7 @@ public class TestGpuDiscoverer {
// but binary doesn't exist so default path will be used. // but binary doesn't exist so default path will be used.
fakeBinary.delete(); fakeBinary.delete();
discoverer = new GpuDiscoverer(); discoverer = new GpuDiscoverer();
discoverer.initialize(conf); discoverer.initialize(conf, binaryHelper);
assertEquals(DEFAULT_BINARY_NAME, assertEquals(DEFAULT_BINARY_NAME,
discoverer.getPathOfGpuBinary()); discoverer.getPathOfGpuBinary());
assertNvidiaIsOnPath(discoverer); assertNvidiaIsOnPath(discoverer);
@ -310,14 +311,14 @@ public class TestGpuDiscoverer {
} }
@Test @Test
public void testGpuDiscover() throws YarnException { public void testGpuDiscover() throws YarnException, IOException {
// Since this is more of a performance unit test, only run if // Since this is more of a performance unit test, only run if
 // runGpuDiscoverUnitTest is set (-DrunGpuDiscoverUnitTest=true) // runGpuDiscoverUnitTest is set (-DrunGpuDiscoverUnitTest=true)
Assume.assumeTrue( Assume.assumeTrue(
Boolean.valueOf(System.getProperty("runGpuDiscoverUnitTest"))); Boolean.valueOf(System.getProperty("runGpuDiscoverUnitTest")));
Configuration conf = new Configuration(false); Configuration conf = new Configuration(false);
GpuDiscoverer discoverer = new GpuDiscoverer(); GpuDiscoverer discoverer = new GpuDiscoverer();
discoverer.initialize(conf); discoverer.initialize(conf, binaryHelper);
GpuDeviceInformation info = discoverer.getGpuDeviceInformation(); GpuDeviceInformation info = discoverer.getGpuDeviceInformation();
assertTrue(info.getGpus().size() > 0); assertTrue(info.getGpus().size() > 0);
@ -331,7 +332,7 @@ public class TestGpuDiscoverer {
Configuration conf = createConfigWithAllowedDevices("1:2"); Configuration conf = createConfigWithAllowedDevices("1:2");
GpuDiscoverer discoverer = new GpuDiscoverer(); GpuDiscoverer discoverer = new GpuDiscoverer();
discoverer.initialize(conf); discoverer.initialize(conf, binaryHelper);
List<GpuDevice> usableGpuDevices = discoverer.getGpusUsableByYarn(); List<GpuDevice> usableGpuDevices = discoverer.getGpusUsableByYarn();
assertEquals(1, usableGpuDevices.size()); assertEquals(1, usableGpuDevices.size());
@ -346,7 +347,7 @@ public class TestGpuDiscoverer {
exception.expect(GpuDeviceSpecificationException.class); exception.expect(GpuDeviceSpecificationException.class);
GpuDiscoverer discoverer = new GpuDiscoverer(); GpuDiscoverer discoverer = new GpuDiscoverer();
discoverer.initialize(conf); discoverer.initialize(conf, binaryHelper);
discoverer.getGpusUsableByYarn(); discoverer.getGpusUsableByYarn();
} }
@ -354,7 +355,7 @@ public class TestGpuDiscoverer {
public void testGetNumberOfUsableGpusFromConfig() throws YarnException { public void testGetNumberOfUsableGpusFromConfig() throws YarnException {
Configuration conf = createConfigWithAllowedDevices("0:0,1:1,2:2,3:4"); Configuration conf = createConfigWithAllowedDevices("0:0,1:1,2:2,3:4");
GpuDiscoverer discoverer = new GpuDiscoverer(); GpuDiscoverer discoverer = new GpuDiscoverer();
discoverer.initialize(conf); discoverer.initialize(conf, binaryHelper);
List<GpuDevice> usableGpuDevices = discoverer.getGpusUsableByYarn(); List<GpuDevice> usableGpuDevices = discoverer.getGpusUsableByYarn();
assertEquals(4, usableGpuDevices.size()); assertEquals(4, usableGpuDevices.size());
@ -379,7 +380,7 @@ public class TestGpuDiscoverer {
exception.expect(GpuDeviceSpecificationException.class); exception.expect(GpuDeviceSpecificationException.class);
GpuDiscoverer discoverer = new GpuDiscoverer(); GpuDiscoverer discoverer = new GpuDiscoverer();
discoverer.initialize(conf); discoverer.initialize(conf, binaryHelper);
discoverer.getGpusUsableByYarn(); discoverer.getGpusUsableByYarn();
} }
@ -390,7 +391,7 @@ public class TestGpuDiscoverer {
exception.expect(GpuDeviceSpecificationException.class); exception.expect(GpuDeviceSpecificationException.class);
GpuDiscoverer discoverer = new GpuDiscoverer(); GpuDiscoverer discoverer = new GpuDiscoverer();
discoverer.initialize(conf); discoverer.initialize(conf, binaryHelper);
discoverer.getGpusUsableByYarn(); discoverer.getGpusUsableByYarn();
} }
@ -401,7 +402,7 @@ public class TestGpuDiscoverer {
exception.expect(GpuDeviceSpecificationException.class); exception.expect(GpuDeviceSpecificationException.class);
GpuDiscoverer discoverer = new GpuDiscoverer(); GpuDiscoverer discoverer = new GpuDiscoverer();
discoverer.initialize(conf); discoverer.initialize(conf, binaryHelper);
discoverer.getGpusUsableByYarn(); discoverer.getGpusUsableByYarn();
} }
@ -412,7 +413,7 @@ public class TestGpuDiscoverer {
exception.expect(GpuDeviceSpecificationException.class); exception.expect(GpuDeviceSpecificationException.class);
GpuDiscoverer discoverer = new GpuDiscoverer(); GpuDiscoverer discoverer = new GpuDiscoverer();
discoverer.initialize(conf); discoverer.initialize(conf, binaryHelper);
discoverer.getGpusUsableByYarn(); discoverer.getGpusUsableByYarn();
} }
@ -423,7 +424,7 @@ public class TestGpuDiscoverer {
exception.expect(GpuDeviceSpecificationException.class); exception.expect(GpuDeviceSpecificationException.class);
GpuDiscoverer discoverer = new GpuDiscoverer(); GpuDiscoverer discoverer = new GpuDiscoverer();
discoverer.initialize(conf); discoverer.initialize(conf, binaryHelper);
discoverer.getGpusUsableByYarn(); discoverer.getGpusUsableByYarn();
} }
@ -434,7 +435,7 @@ public class TestGpuDiscoverer {
exception.expect(GpuDeviceSpecificationException.class); exception.expect(GpuDeviceSpecificationException.class);
GpuDiscoverer discoverer = new GpuDiscoverer(); GpuDiscoverer discoverer = new GpuDiscoverer();
discoverer.initialize(conf); discoverer.initialize(conf, binaryHelper);
discoverer.getGpusUsableByYarn(); discoverer.getGpusUsableByYarn();
} }
@ -445,7 +446,7 @@ public class TestGpuDiscoverer {
exception.expect(GpuDeviceSpecificationException.class); exception.expect(GpuDeviceSpecificationException.class);
GpuDiscoverer discoverer = new GpuDiscoverer(); GpuDiscoverer discoverer = new GpuDiscoverer();
discoverer.initialize(conf); discoverer.initialize(conf, binaryHelper);
discoverer.getGpusUsableByYarn(); discoverer.getGpusUsableByYarn();
} }
@ -456,7 +457,7 @@ public class TestGpuDiscoverer {
exception.expect(GpuDeviceSpecificationException.class); exception.expect(GpuDeviceSpecificationException.class);
GpuDiscoverer discoverer = new GpuDiscoverer(); GpuDiscoverer discoverer = new GpuDiscoverer();
discoverer.initialize(conf); discoverer.initialize(conf, binaryHelper);
discoverer.getGpusUsableByYarn(); discoverer.getGpusUsableByYarn();
} }
@ -467,7 +468,7 @@ public class TestGpuDiscoverer {
exception.expect(GpuDeviceSpecificationException.class); exception.expect(GpuDeviceSpecificationException.class);
GpuDiscoverer discoverer = new GpuDiscoverer(); GpuDiscoverer discoverer = new GpuDiscoverer();
discoverer.initialize(conf); discoverer.initialize(conf, binaryHelper);
discoverer.getGpusUsableByYarn(); discoverer.getGpusUsableByYarn();
} }
@ -478,7 +479,7 @@ public class TestGpuDiscoverer {
exception.expect(GpuDeviceSpecificationException.class); exception.expect(GpuDeviceSpecificationException.class);
GpuDiscoverer discoverer = new GpuDiscoverer(); GpuDiscoverer discoverer = new GpuDiscoverer();
discoverer.initialize(conf); discoverer.initialize(conf, binaryHelper);
discoverer.getGpusUsableByYarn(); discoverer.getGpusUsableByYarn();
} }
@ -488,7 +489,7 @@ public class TestGpuDiscoverer {
conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, "/blabla"); conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, "/blabla");
GpuDiscoverer plugin = new GpuDiscoverer(); GpuDiscoverer plugin = new GpuDiscoverer();
try { try {
plugin.initialize(conf); plugin.initialize(conf, binaryHelper);
plugin.getGpusUsableByYarn(); plugin.getGpusUsableByYarn();
fail("Illegal format, should fail."); fail("Illegal format, should fail.");
} catch (YarnException e) { } catch (YarnException e) {
@ -501,15 +502,15 @@ public class TestGpuDiscoverer {
} }
@Test @Test
public void testScriptNotCalled() throws YarnException { public void testScriptNotCalled() throws YarnException, IOException {
Configuration conf = new Configuration(); Configuration conf = new Configuration();
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:1,2:3"); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:1,2:3");
GpuDiscoverer gpuSpy = spy(new GpuDiscoverer()); GpuDiscoverer gpuSpy = spy(new GpuDiscoverer());
gpuSpy.initialize(conf); gpuSpy.initialize(conf, binaryHelper);
gpuSpy.getGpusUsableByYarn(); gpuSpy.getGpusUsableByYarn();
verify(gpuSpy, never()).getGpuDeviceInformation(); verify(gpuSpy, never()).getGpuDeviceInformation();
} }
} }