YARN-9217. Nodemanager will fail to start if GPU is misconfigured on the node or GPU drivers missing. Contributed by Peter Bacsko
This commit is contained in:
parent
69255fa1b9
commit
6980f1740f
|
@ -1612,6 +1612,20 @@ public class YarnConfiguration extends Configuration {
|
|||
public static final String NM_RESOURCE_PLUGINS =
|
||||
NM_PREFIX + "resource-plugins";
|
||||
|
||||
|
||||
/**
|
||||
* Specifies whether the Node Manager should fail fast, i.e. abort its
|
||||
* initialization, if a certain device (GPU, FPGA, etc.) is not found in the
|
||||
* system. If set to "true", an exception is thrown when a device is missing
|
||||
* or an error occurs during discovery; otherwise no exception is thrown.
|
||||
*/
|
||||
@Private
|
||||
public static final String NM_RESOURCE_PLUGINS_FAIL_FAST =
|
||||
NM_RESOURCE_PLUGINS + ".fail-fast";
|
||||
|
||||
@Private
|
||||
public static final boolean DEFAULT_NM_RESOURCE_PLUGINS_FAIL_FAST = true;
|
||||
|
||||
/**
|
||||
* Prefix for gpu configurations. Work in progress: This configuration
|
||||
* parameter may be changed/removed in the future.
|
||||
|
|
|
@ -3798,6 +3798,17 @@
|
|||
<value></value>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<description>
|
||||
Specifies whether the Node Manager should fail fast, i.e. abort its
|
||||
initialization, if a certain device (GPU, FPGA, etc.) is not found in the
|
||||
system. If set to "true", an exception is thrown when a device is missing
|
||||
or an error occurs during discovery; otherwise no exception is thrown.
|
||||
</description>
|
||||
<name>yarn.nodemanager.resource-plugins.fail-fast</name>
|
||||
<value></value>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<description>
|
||||
Specify GPU devices which can be managed by YARN NodeManager, split by comma
|
||||
|
|
|
@ -0,0 +1,42 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources;
|
||||
|
||||
import static org.apache.hadoop.yarn.conf.YarnConfiguration.DEFAULT_NM_RESOURCE_PLUGINS_FAIL_FAST;
|
||||
import static org.apache.hadoop.yarn.conf.YarnConfiguration.NM_RESOURCE_PLUGINS_FAIL_FAST;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||
|
||||
/**
|
||||
* Small utility class which only re-throws YarnException if
|
||||
* NM_RESOURCE_PLUGINS_FAIL_FAST property is true.
|
||||
*
|
||||
*/
|
||||
public final class ResourcesExceptionUtil {
|
||||
private ResourcesExceptionUtil() {}
|
||||
|
||||
public static void throwIfNecessary(YarnException e, Configuration conf)
|
||||
throws YarnException {
|
||||
if (conf.getBoolean(NM_RESOURCE_PLUGINS_FAIL_FAST,
|
||||
DEFAULT_NM_RESOURCE_PLUGINS_FAIL_FAST)) {
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -18,6 +18,12 @@
|
|||
|
||||
package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu;
|
||||
|
||||
import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourcesExceptionUtil.throwIfNecessary;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
|
@ -36,10 +42,6 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.
|
|||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDevice;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
public class GpuResourceHandlerImpl implements ResourceHandler {
|
||||
final static Log LOG = LogFactory
|
||||
.getLog(GpuResourceHandlerImpl.class);
|
||||
|
@ -75,7 +77,8 @@ public class GpuResourceHandlerImpl implements ResourceHandler {
|
|||
String message = "GPU is enabled on the NodeManager, but couldn't find "
|
||||
+ "any usable GPU devices, please double check configuration!";
|
||||
LOG.error(message);
|
||||
throw new ResourceHandlerException(message);
|
||||
throwIfNecessary(new ResourceHandlerException(message),
|
||||
configuration);
|
||||
}
|
||||
} catch (YarnException e) {
|
||||
LOG.error("Exception when trying to get usable GPU device", e);
|
||||
|
|
|
@ -60,7 +60,7 @@ public class ResourcePluginManager {
|
|||
|
||||
Map<String, ResourcePlugin> pluginMap = Maps.newHashMap();
|
||||
if (plugins != null) {
|
||||
pluginMap = initializePlugins(context, plugins);
|
||||
pluginMap = initializePlugins(conf, context, plugins);
|
||||
}
|
||||
|
||||
configuredPlugins = Collections.unmodifiableMap(pluginMap);
|
||||
|
@ -77,8 +77,7 @@ public class ResourcePluginManager {
|
|||
return plugins;
|
||||
}
|
||||
|
||||
|
||||
private Map<String, ResourcePlugin> initializePlugins(
|
||||
private Map<String, ResourcePlugin> initializePlugins(Configuration conf,
|
||||
Context context, String[] plugins) throws YarnException {
|
||||
Map<String, ResourcePlugin> pluginMap = Maps.newHashMap();
|
||||
|
||||
|
@ -91,7 +90,7 @@ public class ResourcePluginManager {
|
|||
if (resourceName.equals(GPU_URI)) {
|
||||
final GpuDiscoverer gpuDiscoverer = new GpuDiscoverer();
|
||||
final GpuNodeResourceUpdateHandler updateHandler =
|
||||
new GpuNodeResourceUpdateHandler(gpuDiscoverer);
|
||||
new GpuNodeResourceUpdateHandler(gpuDiscoverer, conf);
|
||||
plugin = new GpuResourcePlugin(updateHandler, gpuDiscoverer);
|
||||
} else if (resourceName.equals(FPGA_URI)) {
|
||||
plugin = new FpgaResourcePlugin();
|
||||
|
|
|
@ -18,21 +18,7 @@
|
|||
|
||||
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
|
||||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
import com.google.common.collect.ImmutableSet;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Sets;
|
||||
import org.apache.hadoop.classification.InterfaceAudience;
|
||||
import org.apache.hadoop.classification.InterfaceStability;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.util.Shell;
|
||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformationParser;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.PerGpuDeviceInformation;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourcesExceptionUtil.throwIfNecessary;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
|
@ -42,6 +28,22 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.hadoop.classification.InterfaceAudience;
|
||||
import org.apache.hadoop.classification.InterfaceStability;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformationParser;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.PerGpuDeviceInformation;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
import com.google.common.collect.ImmutableSet;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Sets;
|
||||
|
||||
|
||||
@InterfaceAudience.Private
|
||||
@InterfaceStability.Unstable
|
||||
|
@ -57,11 +59,10 @@ public class GpuDiscoverer {
|
|||
private static final Set<String> DEFAULT_BINARY_SEARCH_DIRS = ImmutableSet.of(
|
||||
"/usr/bin", "/bin", "/usr/local/nvidia/bin");
|
||||
|
||||
// command should not run more than 10 sec.
|
||||
private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000;
|
||||
private static final int MAX_REPEATED_ERROR_ALLOWED = 10;
|
||||
|
||||
private Configuration conf = null;
|
||||
private NvidiaBinaryHelper nvidiaBinaryHelper;
|
||||
private String pathOfGpuBinary = null;
|
||||
private Map<String, String> environment = new HashMap<>();
|
||||
|
||||
|
@ -110,24 +111,17 @@ public class GpuDiscoverer {
|
|||
* @return GpuDeviceInformation
|
||||
* @throws YarnException when any error happens
|
||||
*/
|
||||
synchronized GpuDeviceInformation getGpuDeviceInformation()
|
||||
public synchronized GpuDeviceInformation getGpuDeviceInformation()
|
||||
throws YarnException {
|
||||
validateConfOrThrowException();
|
||||
|
||||
if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) {
|
||||
String msg = getErrorMessageOfScriptExecutionThresholdReached();
|
||||
LOG.error(msg);
|
||||
throw new YarnException(msg);
|
||||
}
|
||||
|
||||
String output;
|
||||
try {
|
||||
output = Shell.execCommand(environment,
|
||||
new String[] { pathOfGpuBinary, "-x", "-q" }, MAX_EXEC_TIMEOUT_MS);
|
||||
GpuDeviceInformationParser parser = new GpuDeviceInformationParser();
|
||||
lastDiscoveredGpuInformation = parser.parseXml(output);
|
||||
numOfErrorExecutionSinceLastSucceed = 0;
|
||||
return lastDiscoveredGpuInformation;
|
||||
lastDiscoveredGpuInformation =
|
||||
nvidiaBinaryHelper.getGpuDeviceInformation(pathOfGpuBinary);
|
||||
} catch (IOException e) {
|
||||
numOfErrorExecutionSinceLastSucceed++;
|
||||
String msg = getErrorMessageOfScriptExecution(e.getMessage());
|
||||
|
@ -138,17 +132,18 @@ public class GpuDiscoverer {
|
|||
} catch (YarnException e) {
|
||||
numOfErrorExecutionSinceLastSucceed++;
|
||||
String msg = getFailedToParseErrorMessage(e.getMessage());
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.warn(msg, e);
|
||||
}
|
||||
LOG.debug(msg, e);
|
||||
throw e;
|
||||
}
|
||||
|
||||
return lastDiscoveredGpuInformation;
|
||||
}
|
||||
|
||||
private boolean IsAutoDiscoveryEnabled() {
|
||||
private boolean isAutoDiscoveryEnabled() {
|
||||
String allowedDevicesStr = conf.get(
|
||||
YarnConfiguration.NM_GPU_ALLOWED_DEVICES,
|
||||
YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);
|
||||
|
||||
return allowedDevicesStr.equals(
|
||||
YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);
|
||||
}
|
||||
|
@ -157,13 +152,12 @@ public class GpuDiscoverer {
|
|||
* Get list of GPU devices usable by YARN.
|
||||
*
|
||||
* @return List of GPU devices
|
||||
* @throws YarnException when any issue happens
|
||||
*/
|
||||
public synchronized List<GpuDevice> getGpusUsableByYarn()
|
||||
throws YarnException {
|
||||
validateConfOrThrowException();
|
||||
|
||||
if (IsAutoDiscoveryEnabled()) {
|
||||
if (isAutoDiscoveryEnabled()) {
|
||||
return parseGpuDevicesFromAutoDiscoveredGpuInfo();
|
||||
} else {
|
||||
if (gpuDevicesFromUser == null) {
|
||||
|
@ -219,16 +213,27 @@ public class GpuDiscoverer {
|
|||
if (device.trim().length() > 0) {
|
||||
String[] splitByColon = device.trim().split(":");
|
||||
if (splitByColon.length != 2) {
|
||||
throw GpuDeviceSpecificationException.
|
||||
createWithWrongValueSpecified(device, devices);
|
||||
throwIfNecessary(GpuDeviceSpecificationException
|
||||
.createWithWrongValueSpecified(device, devices), conf);
|
||||
LOG.warn("Wrong GPU specification string {}, ignored", device);
|
||||
}
|
||||
|
||||
GpuDevice gpuDevice;
|
||||
try {
|
||||
gpuDevice = parseGpuDevice(splitByColon);
|
||||
} catch (NumberFormatException e) {
|
||||
throwIfNecessary(GpuDeviceSpecificationException
|
||||
.createWithWrongValueSpecified(device, devices, e), conf);
|
||||
LOG.warn("Cannot parse GPU device numbers: {}", device);
|
||||
continue;
|
||||
}
|
||||
|
||||
GpuDevice gpuDevice = parseGpuDevice(device, splitByColon, devices);
|
||||
if (!gpuDevices.contains(gpuDevice)) {
|
||||
gpuDevices.add(gpuDevice);
|
||||
} else {
|
||||
throw GpuDeviceSpecificationException
|
||||
.createWithDuplicateValueSpecified(device, devices);
|
||||
throwIfNecessary(GpuDeviceSpecificationException
|
||||
.createWithDuplicateValueSpecified(device, devices), conf);
|
||||
LOG.warn("CPU device is duplicated: {}", device);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -237,22 +242,18 @@ public class GpuDiscoverer {
|
|||
return gpuDevices;
|
||||
}
|
||||
|
||||
private GpuDevice parseGpuDevice(String device, String[] splitByColon,
|
||||
String allowedDevicesStr) throws YarnException {
|
||||
try {
|
||||
int index = Integer.parseInt(splitByColon[0]);
|
||||
int minorNumber = Integer.parseInt(splitByColon[1]);
|
||||
return new GpuDevice(index, minorNumber);
|
||||
} catch (NumberFormatException e) {
|
||||
throw GpuDeviceSpecificationException.
|
||||
createWithWrongValueSpecified(device, allowedDevicesStr, e);
|
||||
}
|
||||
private GpuDevice parseGpuDevice(String[] splitByColon) {
|
||||
int index = Integer.parseInt(splitByColon[0]);
|
||||
int minorNumber = Integer.parseInt(splitByColon[1]);
|
||||
return new GpuDevice(index, minorNumber);
|
||||
}
|
||||
|
||||
public synchronized void initialize(Configuration config)
|
||||
throws YarnException {
|
||||
|
||||
public synchronized void initialize(Configuration config,
|
||||
NvidiaBinaryHelper nvidiaHelper) throws YarnException {
|
||||
this.conf = config;
|
||||
if (IsAutoDiscoveryEnabled()) {
|
||||
this.nvidiaBinaryHelper = nvidiaHelper;
|
||||
if (isAutoDiscoveryEnabled()) {
|
||||
numOfErrorExecutionSinceLastSucceed = 0;
|
||||
lookUpAutoDiscoveryBinary(config);
|
||||
|
||||
|
@ -286,7 +287,18 @@ public class GpuDiscoverer {
|
|||
binaryPath = handleConfiguredBinaryPathIsDirectory(configuredBinaryFile);
|
||||
} else {
|
||||
binaryPath = configuredBinaryFile;
|
||||
// If path exists but file name is incorrect don't execute the file
|
||||
String fileName = binaryPath.getName();
|
||||
if (DEFAULT_BINARY_NAME.equals(fileName)) {
|
||||
String msg = String.format("Please check the configuration value of"
|
||||
+" %s. It should point to an %s binary.",
|
||||
YarnConfiguration.NM_GPU_PATH_TO_EXEC,
|
||||
DEFAULT_BINARY_NAME);
|
||||
throwIfNecessary(new YarnException(msg), config);
|
||||
LOG.warn(msg);
|
||||
}
|
||||
}
|
||||
|
||||
pathOfGpuBinary = binaryPath.getAbsolutePath();
|
||||
}
|
||||
|
||||
|
|
|
@ -18,6 +18,9 @@
|
|||
|
||||
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
|
||||
|
||||
import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourcesExceptionUtil.throwIfNecessary;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.yarn.api.records.Resource;
|
||||
import org.apache.hadoop.yarn.api.records.ResourceInformation;
|
||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||
|
@ -36,9 +39,12 @@ public class GpuNodeResourceUpdateHandler extends NodeResourceUpdaterPlugin {
|
|||
private static final Logger LOG =
|
||||
LoggerFactory.getLogger(GpuNodeResourceUpdateHandler.class);
|
||||
private final GpuDiscoverer gpuDiscoverer;
|
||||
private Configuration conf;
|
||||
|
||||
public GpuNodeResourceUpdateHandler(GpuDiscoverer gpuDiscoverer) {
|
||||
public GpuNodeResourceUpdateHandler(GpuDiscoverer gpuDiscoverer,
|
||||
Configuration conf) {
|
||||
this.gpuDiscoverer = gpuDiscoverer;
|
||||
this.conf = conf;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -51,7 +57,8 @@ public class GpuNodeResourceUpdateHandler extends NodeResourceUpdaterPlugin {
|
|||
"but could not find any usable GPUs on the NodeManager!";
|
||||
LOG.error(message);
|
||||
// No gpu can be used by YARN.
|
||||
throw new YarnException(message);
|
||||
throwIfNecessary(new YarnException(message), conf);
|
||||
return;
|
||||
}
|
||||
|
||||
long nUsableGpus = usableGpus.size();
|
||||
|
@ -59,7 +66,7 @@ public class GpuNodeResourceUpdateHandler extends NodeResourceUpdaterPlugin {
|
|||
Map<String, ResourceInformation> configuredResourceTypes =
|
||||
ResourceUtils.getResourceTypes();
|
||||
if (!configuredResourceTypes.containsKey(GPU_URI)) {
|
||||
throw new YarnException("Found " + nUsableGpus + " usable GPUs, however "
|
||||
LOG.warn("Found " + nUsableGpus + " usable GPUs, however "
|
||||
+ GPU_URI
|
||||
+ " resource-type is not configured inside"
|
||||
+ " resource-types.xml, please configure it to enable GPU feature or"
|
||||
|
|
|
@ -18,6 +18,8 @@
|
|||
|
||||
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.Context;
|
||||
|
@ -32,8 +34,6 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin
|
|||
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.NMResourceInfo;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.NMGpuResourceInfo;
|
||||
|
||||
import java.util.List;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
|
@ -44,6 +44,10 @@ public class GpuResourcePlugin implements ResourcePlugin {
|
|||
|
||||
private final GpuNodeResourceUpdateHandler resourceDiscoverHandler;
|
||||
private final GpuDiscoverer gpuDiscoverer;
|
||||
public static final int MAX_REPEATED_ERROR_ALLOWED = 10;
|
||||
|
||||
private int numOfErrorExecutionSinceLastSucceed = 0;
|
||||
|
||||
private GpuResourceHandlerImpl gpuResourceHandler = null;
|
||||
private DockerCommandPlugin dockerCommandPlugin = null;
|
||||
|
||||
|
@ -55,7 +59,8 @@ public class GpuResourcePlugin implements ResourcePlugin {
|
|||
|
||||
@Override
|
||||
public void initialize(Context context) throws YarnException {
|
||||
this.gpuDiscoverer.initialize(context.getConf());
|
||||
this.gpuDiscoverer.initialize(context.getConf(),
|
||||
new NvidiaBinaryHelper());
|
||||
this.dockerCommandPlugin =
|
||||
GpuDockerCommandPluginFactory.createGpuDockerCommandPlugin(
|
||||
context.getConf());
|
||||
|
@ -89,12 +94,21 @@ public class GpuResourcePlugin implements ResourcePlugin {
|
|||
|
||||
@Override
|
||||
public synchronized NMResourceInfo getNMResourceInfo() throws YarnException {
|
||||
GpuDeviceInformation gpuDeviceInformation =
|
||||
gpuDiscoverer.getGpuDeviceInformation();
|
||||
GpuDeviceInformation gpuDeviceInformation;
|
||||
|
||||
//At this point the gpu plugin is already enabled
|
||||
checkGpuResourceHandler();
|
||||
|
||||
checkErrorCount();
|
||||
try{
|
||||
gpuDeviceInformation = gpuDiscoverer.getGpuDeviceInformation();
|
||||
numOfErrorExecutionSinceLastSucceed = 0;
|
||||
} catch (YarnException e) {
|
||||
LOG.error(e.getMessage(), e);
|
||||
numOfErrorExecutionSinceLastSucceed++;
|
||||
throw e;
|
||||
}
|
||||
|
||||
GpuResourceAllocator gpuResourceAllocator =
|
||||
gpuResourceHandler.getGpuAllocator();
|
||||
List<GpuDevice> totalGpus = gpuResourceAllocator.getAllowedGpus();
|
||||
|
@ -116,6 +130,17 @@ public class GpuResourcePlugin implements ResourcePlugin {
|
|||
}
|
||||
}
|
||||
|
||||
private void checkErrorCount() throws YarnException {
|
||||
if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) {
|
||||
String msg =
|
||||
"Failed to execute GPU device information detection script for "
|
||||
+ MAX_REPEATED_ERROR_ALLOWED
|
||||
+ " times, skip following executions.";
|
||||
LOG.error(msg);
|
||||
throw new YarnException(msg);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return GpuResourcePlugin.class.getName();
|
||||
|
|
|
@ -0,0 +1,63 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
|
||||
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
|
||||
import org.apache.hadoop.util.Shell;
|
||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformationParser;
|
||||
|
||||
/**
|
||||
* Executes the "nvidia-smi" command and returns an object
|
||||
* based on its output.
|
||||
*
|
||||
*/
|
||||
public class NvidiaBinaryHelper {
|
||||
/**
|
||||
* command should not run more than 10 sec.
|
||||
*/
|
||||
private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000;
|
||||
|
||||
/**
|
||||
* @param pathOfGpuBinary The path of the binary
|
||||
* @return the GpuDeviceInformation parsed from the nvidia-smi output
|
||||
* @throws IOException if the binary output is not readable
|
||||
* @throws YarnException if the pathOfGpuBinary is null,
|
||||
* or the output parse failed
|
||||
*/
|
||||
synchronized GpuDeviceInformation getGpuDeviceInformation(
|
||||
String pathOfGpuBinary) throws IOException, YarnException {
|
||||
GpuDeviceInformationParser parser = new GpuDeviceInformationParser();
|
||||
|
||||
if (pathOfGpuBinary == null) {
|
||||
throw new YarnException(
|
||||
"Failed to find GPU discovery executable, please double check "
|
||||
+ YarnConfiguration.NM_GPU_PATH_TO_EXEC + " setting.");
|
||||
}
|
||||
|
||||
String output = Shell.execCommand(new HashMap<>(),
|
||||
new String[]{pathOfGpuBinary, "-x", "-q"}, MAX_EXEC_TIMEOUT_MS);
|
||||
return parser.parseXml(output);
|
||||
}
|
||||
}
|
|
@ -41,6 +41,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resource
|
|||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDevice;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.NvidiaBinaryHelper;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerRuntimeConstants;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService;
|
||||
|
@ -116,11 +117,13 @@ public class TestGpuResourceHandler {
|
|||
@Rule
|
||||
public ExpectedException expected = ExpectedException.none();
|
||||
|
||||
private NvidiaBinaryHelper nvidiaBinaryHelper;
|
||||
|
||||
@Before
|
||||
public void setup() throws IOException {
|
||||
createTestDataDirectory();
|
||||
|
||||
TestResourceUtils.addNewTypesToResources(ResourceInformation.GPU_URI);
|
||||
nvidiaBinaryHelper = new NvidiaBinaryHelper();
|
||||
|
||||
mockCGroupsHandler = mock(CGroupsHandler.class);
|
||||
mockPrivilegedExecutor = mock(PrivilegedOperationExecutor.class);
|
||||
|
@ -146,13 +149,14 @@ public class TestGpuResourceHandler {
|
|||
@After
|
||||
public void cleanupTestFiles() throws IOException {
|
||||
FileUtils.deleteDirectory(testDataDirectory);
|
||||
nvidiaBinaryHelper = new NvidiaBinaryHelper();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBootstrapWithRealGpuDiscoverer() throws Exception {
|
||||
Configuration conf = createDefaultConfig();
|
||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0");
|
||||
gpuDiscoverer.initialize(conf);
|
||||
gpuDiscoverer.initialize(conf, nvidiaBinaryHelper);
|
||||
|
||||
gpuResourceHandler.bootstrap(conf);
|
||||
|
||||
|
@ -170,7 +174,7 @@ public class TestGpuResourceHandler {
|
|||
public void testBootstrapWithMockGpuDiscoverer() throws Exception {
|
||||
GpuDiscoverer mockDiscoverer = mock(GpuDiscoverer.class);
|
||||
Configuration conf = new YarnConfiguration();
|
||||
mockDiscoverer.initialize(conf);
|
||||
mockDiscoverer.initialize(conf, nvidiaBinaryHelper);
|
||||
|
||||
expected.expect(ResourceHandlerException.class);
|
||||
gpuResourceHandler.bootstrap(conf);
|
||||
|
@ -270,7 +274,7 @@ public class TestGpuResourceHandler {
|
|||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
|
||||
|
||||
gpuDiscoverer = new GpuDiscoverer();
|
||||
gpuDiscoverer.initialize(conf);
|
||||
gpuDiscoverer.initialize(conf, nvidiaBinaryHelper);
|
||||
Context nmContext = createMockNmContext(conf);
|
||||
gpuResourceHandler = new GpuResourceHandlerImpl(nmContext,
|
||||
mockCGroupsHandler, mockPrivilegedExecutor, gpuDiscoverer);
|
||||
|
@ -379,7 +383,7 @@ public class TestGpuResourceHandler {
|
|||
public void testAllocationWithoutAllowedGpus() throws Exception {
|
||||
Configuration conf = createDefaultConfig();
|
||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, " ");
|
||||
gpuDiscoverer.initialize(conf);
|
||||
gpuDiscoverer.initialize(conf, nvidiaBinaryHelper);
|
||||
|
||||
try {
|
||||
gpuResourceHandler.bootstrap(conf);
|
||||
|
@ -460,7 +464,7 @@ public class TestGpuResourceHandler {
|
|||
new GpuResourceHandlerImpl(nmnctx, mockCGroupsHandler,
|
||||
mockPrivilegedExecutor, gpuDiscoverer);
|
||||
|
||||
gpuDiscoverer.initialize(conf);
|
||||
gpuDiscoverer.initialize(conf, nvidiaBinaryHelper);
|
||||
|
||||
gpuNULLStateResourceHandler.bootstrap(conf);
|
||||
verifyNumberOfAvailableGpus(4, gpuNULLStateResourceHandler);
|
||||
|
|
|
@ -64,6 +64,7 @@ public class TestGpuDiscoverer {
|
|||
private static final String BASH_SHEBANG = "#!/bin/bash\n\n";
|
||||
private static final String TEST_PARENT_DIR = new File("target/temp/" +
|
||||
TestGpuDiscoverer.class.getName()).getAbsolutePath();
|
||||
private NvidiaBinaryHelper binaryHelper = new NvidiaBinaryHelper();
|
||||
|
||||
@Rule
|
||||
public ExpectedException exception = ExpectedException.none();
|
||||
|
@ -150,7 +151,7 @@ public class TestGpuDiscoverer {
|
|||
Configuration conf) throws YarnException {
|
||||
conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, TEST_PARENT_DIR);
|
||||
GpuDiscoverer discoverer = new GpuDiscoverer();
|
||||
discoverer.initialize(conf);
|
||||
discoverer.initialize(conf, binaryHelper);
|
||||
return discoverer;
|
||||
}
|
||||
|
||||
|
@ -163,14 +164,14 @@ public class TestGpuDiscoverer {
|
|||
// test case 1, check default setting.
|
||||
Configuration conf = new Configuration(false);
|
||||
GpuDiscoverer discoverer = new GpuDiscoverer();
|
||||
discoverer.initialize(conf);
|
||||
discoverer.initialize(conf, binaryHelper);
|
||||
assertEquals(DEFAULT_BINARY_NAME, discoverer.getPathOfGpuBinary());
|
||||
assertNvidiaIsOnPath(discoverer);
|
||||
|
||||
// test case 2, check mandatory set path.
|
||||
File fakeBinary = setupFakeBinary(conf);
|
||||
discoverer = new GpuDiscoverer();
|
||||
discoverer.initialize(conf);
|
||||
discoverer.initialize(conf, binaryHelper);
|
||||
assertEquals(fakeBinary.getAbsolutePath(),
|
||||
discoverer.getPathOfGpuBinary());
|
||||
assertNull(discoverer.getEnvironmentToRunCommand().get(PATH));
|
||||
|
@ -179,7 +180,7 @@ public class TestGpuDiscoverer {
|
|||
// but binary doesn't exist so default path will be used.
|
||||
fakeBinary.delete();
|
||||
discoverer = new GpuDiscoverer();
|
||||
discoverer.initialize(conf);
|
||||
discoverer.initialize(conf, binaryHelper);
|
||||
assertEquals(DEFAULT_BINARY_NAME,
|
||||
discoverer.getPathOfGpuBinary());
|
||||
assertNvidiaIsOnPath(discoverer);
|
||||
|
@ -310,14 +311,14 @@ public class TestGpuDiscoverer {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void testGpuDiscover() throws YarnException {
|
||||
public void testGpuDiscover() throws YarnException, IOException {
|
||||
// Since this is more of a performance unit test, only run if
|
||||
// runGpuDiscoverUnitTest is set (-DrunGpuDiscoverUnitTest=true)
|
||||
Assume.assumeTrue(
|
||||
Boolean.valueOf(System.getProperty("runGpuDiscoverUnitTest")));
|
||||
Configuration conf = new Configuration(false);
|
||||
GpuDiscoverer discoverer = new GpuDiscoverer();
|
||||
discoverer.initialize(conf);
|
||||
discoverer.initialize(conf, binaryHelper);
|
||||
GpuDeviceInformation info = discoverer.getGpuDeviceInformation();
|
||||
|
||||
assertTrue(info.getGpus().size() > 0);
|
||||
|
@ -331,7 +332,7 @@ public class TestGpuDiscoverer {
|
|||
Configuration conf = createConfigWithAllowedDevices("1:2");
|
||||
|
||||
GpuDiscoverer discoverer = new GpuDiscoverer();
|
||||
discoverer.initialize(conf);
|
||||
discoverer.initialize(conf, binaryHelper);
|
||||
List<GpuDevice> usableGpuDevices = discoverer.getGpusUsableByYarn();
|
||||
assertEquals(1, usableGpuDevices.size());
|
||||
|
||||
|
@ -346,7 +347,7 @@ public class TestGpuDiscoverer {
|
|||
|
||||
exception.expect(GpuDeviceSpecificationException.class);
|
||||
GpuDiscoverer discoverer = new GpuDiscoverer();
|
||||
discoverer.initialize(conf);
|
||||
discoverer.initialize(conf, binaryHelper);
|
||||
discoverer.getGpusUsableByYarn();
|
||||
}
|
||||
|
||||
|
@ -354,7 +355,7 @@ public class TestGpuDiscoverer {
|
|||
public void testGetNumberOfUsableGpusFromConfig() throws YarnException {
|
||||
Configuration conf = createConfigWithAllowedDevices("0:0,1:1,2:2,3:4");
|
||||
GpuDiscoverer discoverer = new GpuDiscoverer();
|
||||
discoverer.initialize(conf);
|
||||
discoverer.initialize(conf, binaryHelper);
|
||||
|
||||
List<GpuDevice> usableGpuDevices = discoverer.getGpusUsableByYarn();
|
||||
assertEquals(4, usableGpuDevices.size());
|
||||
|
@ -379,7 +380,7 @@ public class TestGpuDiscoverer {
|
|||
|
||||
exception.expect(GpuDeviceSpecificationException.class);
|
||||
GpuDiscoverer discoverer = new GpuDiscoverer();
|
||||
discoverer.initialize(conf);
|
||||
discoverer.initialize(conf, binaryHelper);
|
||||
discoverer.getGpusUsableByYarn();
|
||||
}
|
||||
|
||||
|
@ -390,7 +391,7 @@ public class TestGpuDiscoverer {
|
|||
|
||||
exception.expect(GpuDeviceSpecificationException.class);
|
||||
GpuDiscoverer discoverer = new GpuDiscoverer();
|
||||
discoverer.initialize(conf);
|
||||
discoverer.initialize(conf, binaryHelper);
|
||||
discoverer.getGpusUsableByYarn();
|
||||
}
|
||||
|
||||
|
@ -401,7 +402,7 @@ public class TestGpuDiscoverer {
|
|||
|
||||
exception.expect(GpuDeviceSpecificationException.class);
|
||||
GpuDiscoverer discoverer = new GpuDiscoverer();
|
||||
discoverer.initialize(conf);
|
||||
discoverer.initialize(conf, binaryHelper);
|
||||
discoverer.getGpusUsableByYarn();
|
||||
}
|
||||
|
||||
|
@ -412,7 +413,7 @@ public class TestGpuDiscoverer {
|
|||
|
||||
exception.expect(GpuDeviceSpecificationException.class);
|
||||
GpuDiscoverer discoverer = new GpuDiscoverer();
|
||||
discoverer.initialize(conf);
|
||||
discoverer.initialize(conf, binaryHelper);
|
||||
discoverer.getGpusUsableByYarn();
|
||||
}
|
||||
|
||||
|
@ -423,7 +424,7 @@ public class TestGpuDiscoverer {
|
|||
|
||||
exception.expect(GpuDeviceSpecificationException.class);
|
||||
GpuDiscoverer discoverer = new GpuDiscoverer();
|
||||
discoverer.initialize(conf);
|
||||
discoverer.initialize(conf, binaryHelper);
|
||||
discoverer.getGpusUsableByYarn();
|
||||
}
|
||||
|
||||
|
@ -434,7 +435,7 @@ public class TestGpuDiscoverer {
|
|||
|
||||
exception.expect(GpuDeviceSpecificationException.class);
|
||||
GpuDiscoverer discoverer = new GpuDiscoverer();
|
||||
discoverer.initialize(conf);
|
||||
discoverer.initialize(conf, binaryHelper);
|
||||
discoverer.getGpusUsableByYarn();
|
||||
}
|
||||
|
||||
|
@ -445,7 +446,7 @@ public class TestGpuDiscoverer {
|
|||
|
||||
exception.expect(GpuDeviceSpecificationException.class);
|
||||
GpuDiscoverer discoverer = new GpuDiscoverer();
|
||||
discoverer.initialize(conf);
|
||||
discoverer.initialize(conf, binaryHelper);
|
||||
discoverer.getGpusUsableByYarn();
|
||||
}
|
||||
|
||||
|
@ -456,7 +457,7 @@ public class TestGpuDiscoverer {
|
|||
|
||||
exception.expect(GpuDeviceSpecificationException.class);
|
||||
GpuDiscoverer discoverer = new GpuDiscoverer();
|
||||
discoverer.initialize(conf);
|
||||
discoverer.initialize(conf, binaryHelper);
|
||||
discoverer.getGpusUsableByYarn();
|
||||
}
|
||||
|
||||
|
@ -467,7 +468,7 @@ public class TestGpuDiscoverer {
|
|||
|
||||
exception.expect(GpuDeviceSpecificationException.class);
|
||||
GpuDiscoverer discoverer = new GpuDiscoverer();
|
||||
discoverer.initialize(conf);
|
||||
discoverer.initialize(conf, binaryHelper);
|
||||
discoverer.getGpusUsableByYarn();
|
||||
}
|
||||
|
||||
|
@ -478,7 +479,7 @@ public class TestGpuDiscoverer {
|
|||
|
||||
exception.expect(GpuDeviceSpecificationException.class);
|
||||
GpuDiscoverer discoverer = new GpuDiscoverer();
|
||||
discoverer.initialize(conf);
|
||||
discoverer.initialize(conf, binaryHelper);
|
||||
discoverer.getGpusUsableByYarn();
|
||||
}
|
||||
|
||||
|
@ -488,7 +489,7 @@ public class TestGpuDiscoverer {
|
|||
conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, "/blabla");
|
||||
GpuDiscoverer plugin = new GpuDiscoverer();
|
||||
try {
|
||||
plugin.initialize(conf);
|
||||
plugin.initialize(conf, binaryHelper);
|
||||
plugin.getGpusUsableByYarn();
|
||||
fail("Illegal format, should fail.");
|
||||
} catch (YarnException e) {
|
||||
|
@ -501,15 +502,15 @@ public class TestGpuDiscoverer {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void testScriptNotCalled() throws YarnException {
|
||||
public void testScriptNotCalled() throws YarnException, IOException {
|
||||
Configuration conf = new Configuration();
|
||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:1,2:3");
|
||||
|
||||
GpuDiscoverer gpuSpy = spy(new GpuDiscoverer());
|
||||
|
||||
gpuSpy.initialize(conf);
|
||||
gpuSpy.initialize(conf, binaryHelper);
|
||||
gpuSpy.getGpusUsableByYarn();
|
||||
|
||||
verify(gpuSpy, never()).getGpuDeviceInformation();
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue