YARN-9217. Nodemanager will fail to start if GPU is misconfigured on the node or GPU drivers missing. Contributed by Peter Bacsko
This commit is contained in:
parent
69255fa1b9
commit
6980f1740f
|
@ -1612,6 +1612,20 @@ public class YarnConfiguration extends Configuration {
|
||||||
public static final String NM_RESOURCE_PLUGINS =
|
public static final String NM_RESOURCE_PLUGINS =
|
||||||
NM_PREFIX + "resource-plugins";
|
NM_PREFIX + "resource-plugins";
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Specifies whether the initialization of the Node Manager should be aborted
|
||||||
|
* if a certain device (GPU, FPGA, etc) was not found in the system. If set
|
||||||
|
* to "true", then an exception will be thrown if a device is missing or
|
||||||
|
* an error occurred during discovery.
|
||||||
|
*/
|
||||||
|
@Private
|
||||||
|
public static final String NM_RESOURCE_PLUGINS_FAIL_FAST =
|
||||||
|
NM_RESOURCE_PLUGINS + ".fail-fast";
|
||||||
|
|
||||||
|
@Private
|
||||||
|
public static final boolean DEFAULT_NM_RESOURCE_PLUGINS_FAIL_FAST = true;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Prefix for gpu configurations. Work in progress: This configuration
|
* Prefix for gpu configurations. Work in progress: This configuration
|
||||||
* parameter may be changed/removed in the future.
|
* parameter may be changed/removed in the future.
|
||||||
|
|
|
@ -3798,6 +3798,17 @@
|
||||||
<value></value>
|
<value></value>
|
||||||
</property>
|
</property>
|
||||||
|
|
||||||
|
<property>
|
||||||
|
<description>
|
||||||
|
Specifies whether the initialization of the Node Manager should be aborted
|
||||||
|
if a certain device (GPU, FPGA, etc) was not found in the system. If set
|
||||||
|
to "true", then an exception will be thrown if a device is missing or
|
||||||
|
an error occurred during discovery.
|
||||||
|
</description>
|
||||||
|
<name>yarn.nodemanager.resource-plugins.fail-fast</name>
|
||||||
|
<value></value>
|
||||||
|
</property>
|
||||||
|
|
||||||
<property>
|
<property>
|
||||||
<description>
|
<description>
|
||||||
Specify GPU devices which can be managed by YARN NodeManager, split by comma
|
Specify GPU devices which can be managed by YARN NodeManager, split by comma
|
||||||
|
|
|
@ -0,0 +1,42 @@
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources;
|
||||||
|
|
||||||
|
import static org.apache.hadoop.yarn.conf.YarnConfiguration.DEFAULT_NM_RESOURCE_PLUGINS_FAIL_FAST;
|
||||||
|
import static org.apache.hadoop.yarn.conf.YarnConfiguration.NM_RESOURCE_PLUGINS_FAIL_FAST;
|
||||||
|
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Small utility class which only re-throws YarnException if
|
||||||
|
* NM_RESOURCE_PLUGINS_FAIL_FAST property is true.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public final class ResourcesExceptionUtil {
|
||||||
|
private ResourcesExceptionUtil() {}
|
||||||
|
|
||||||
|
public static void throwIfNecessary(YarnException e, Configuration conf)
|
||||||
|
throws YarnException {
|
||||||
|
if (conf.getBoolean(NM_RESOURCE_PLUGINS_FAIL_FAST,
|
||||||
|
DEFAULT_NM_RESOURCE_PLUGINS_FAIL_FAST)) {
|
||||||
|
throw e;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -18,6 +18,12 @@
|
||||||
|
|
||||||
package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu;
|
package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu;
|
||||||
|
|
||||||
|
import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourcesExceptionUtil.throwIfNecessary;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
import org.apache.commons.logging.Log;
|
import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
@ -36,10 +42,6 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDevice;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDevice;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
public class GpuResourceHandlerImpl implements ResourceHandler {
|
public class GpuResourceHandlerImpl implements ResourceHandler {
|
||||||
final static Log LOG = LogFactory
|
final static Log LOG = LogFactory
|
||||||
.getLog(GpuResourceHandlerImpl.class);
|
.getLog(GpuResourceHandlerImpl.class);
|
||||||
|
@ -75,7 +77,8 @@ public class GpuResourceHandlerImpl implements ResourceHandler {
|
||||||
String message = "GPU is enabled on the NodeManager, but couldn't find "
|
String message = "GPU is enabled on the NodeManager, but couldn't find "
|
||||||
+ "any usable GPU devices, please double check configuration!";
|
+ "any usable GPU devices, please double check configuration!";
|
||||||
LOG.error(message);
|
LOG.error(message);
|
||||||
throw new ResourceHandlerException(message);
|
throwIfNecessary(new ResourceHandlerException(message),
|
||||||
|
configuration);
|
||||||
}
|
}
|
||||||
} catch (YarnException e) {
|
} catch (YarnException e) {
|
||||||
LOG.error("Exception when trying to get usable GPU device", e);
|
LOG.error("Exception when trying to get usable GPU device", e);
|
||||||
|
|
|
@ -60,7 +60,7 @@ public class ResourcePluginManager {
|
||||||
|
|
||||||
Map<String, ResourcePlugin> pluginMap = Maps.newHashMap();
|
Map<String, ResourcePlugin> pluginMap = Maps.newHashMap();
|
||||||
if (plugins != null) {
|
if (plugins != null) {
|
||||||
pluginMap = initializePlugins(context, plugins);
|
pluginMap = initializePlugins(conf, context, plugins);
|
||||||
}
|
}
|
||||||
|
|
||||||
configuredPlugins = Collections.unmodifiableMap(pluginMap);
|
configuredPlugins = Collections.unmodifiableMap(pluginMap);
|
||||||
|
@ -77,8 +77,7 @@ public class ResourcePluginManager {
|
||||||
return plugins;
|
return plugins;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private Map<String, ResourcePlugin> initializePlugins(Configuration conf,
|
||||||
private Map<String, ResourcePlugin> initializePlugins(
|
|
||||||
Context context, String[] plugins) throws YarnException {
|
Context context, String[] plugins) throws YarnException {
|
||||||
Map<String, ResourcePlugin> pluginMap = Maps.newHashMap();
|
Map<String, ResourcePlugin> pluginMap = Maps.newHashMap();
|
||||||
|
|
||||||
|
@ -91,7 +90,7 @@ public class ResourcePluginManager {
|
||||||
if (resourceName.equals(GPU_URI)) {
|
if (resourceName.equals(GPU_URI)) {
|
||||||
final GpuDiscoverer gpuDiscoverer = new GpuDiscoverer();
|
final GpuDiscoverer gpuDiscoverer = new GpuDiscoverer();
|
||||||
final GpuNodeResourceUpdateHandler updateHandler =
|
final GpuNodeResourceUpdateHandler updateHandler =
|
||||||
new GpuNodeResourceUpdateHandler(gpuDiscoverer);
|
new GpuNodeResourceUpdateHandler(gpuDiscoverer, conf);
|
||||||
plugin = new GpuResourcePlugin(updateHandler, gpuDiscoverer);
|
plugin = new GpuResourcePlugin(updateHandler, gpuDiscoverer);
|
||||||
} else if (resourceName.equals(FPGA_URI)) {
|
} else if (resourceName.equals(FPGA_URI)) {
|
||||||
plugin = new FpgaResourcePlugin();
|
plugin = new FpgaResourcePlugin();
|
||||||
|
|
|
@ -18,21 +18,7 @@
|
||||||
|
|
||||||
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
|
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
|
||||||
|
|
||||||
import com.google.common.annotations.VisibleForTesting;
|
import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourcesExceptionUtil.throwIfNecessary;
|
||||||
import com.google.common.collect.ImmutableSet;
|
|
||||||
import com.google.common.collect.Lists;
|
|
||||||
import com.google.common.collect.Sets;
|
|
||||||
import org.apache.hadoop.classification.InterfaceAudience;
|
|
||||||
import org.apache.hadoop.classification.InterfaceStability;
|
|
||||||
import org.apache.hadoop.conf.Configuration;
|
|
||||||
import org.apache.hadoop.util.Shell;
|
|
||||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
|
||||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
|
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformationParser;
|
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.PerGpuDeviceInformation;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
@ -42,6 +28,22 @@ import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.hadoop.classification.InterfaceAudience;
|
||||||
|
import org.apache.hadoop.classification.InterfaceStability;
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||||
|
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||||
|
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
|
||||||
|
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformationParser;
|
||||||
|
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.PerGpuDeviceInformation;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import com.google.common.annotations.VisibleForTesting;
|
||||||
|
import com.google.common.collect.ImmutableSet;
|
||||||
|
import com.google.common.collect.Lists;
|
||||||
|
import com.google.common.collect.Sets;
|
||||||
|
|
||||||
|
|
||||||
@InterfaceAudience.Private
|
@InterfaceAudience.Private
|
||||||
@InterfaceStability.Unstable
|
@InterfaceStability.Unstable
|
||||||
|
@ -57,11 +59,10 @@ public class GpuDiscoverer {
|
||||||
private static final Set<String> DEFAULT_BINARY_SEARCH_DIRS = ImmutableSet.of(
|
private static final Set<String> DEFAULT_BINARY_SEARCH_DIRS = ImmutableSet.of(
|
||||||
"/usr/bin", "/bin", "/usr/local/nvidia/bin");
|
"/usr/bin", "/bin", "/usr/local/nvidia/bin");
|
||||||
|
|
||||||
// command should not run more than 10 sec.
|
|
||||||
private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000;
|
|
||||||
private static final int MAX_REPEATED_ERROR_ALLOWED = 10;
|
private static final int MAX_REPEATED_ERROR_ALLOWED = 10;
|
||||||
|
|
||||||
private Configuration conf = null;
|
private Configuration conf = null;
|
||||||
|
private NvidiaBinaryHelper nvidiaBinaryHelper;
|
||||||
private String pathOfGpuBinary = null;
|
private String pathOfGpuBinary = null;
|
||||||
private Map<String, String> environment = new HashMap<>();
|
private Map<String, String> environment = new HashMap<>();
|
||||||
|
|
||||||
|
@ -110,24 +111,17 @@ public class GpuDiscoverer {
|
||||||
* @return GpuDeviceInformation
|
* @return GpuDeviceInformation
|
||||||
* @throws YarnException when any error happens
|
* @throws YarnException when any error happens
|
||||||
*/
|
*/
|
||||||
synchronized GpuDeviceInformation getGpuDeviceInformation()
|
public synchronized GpuDeviceInformation getGpuDeviceInformation()
|
||||||
throws YarnException {
|
throws YarnException {
|
||||||
validateConfOrThrowException();
|
|
||||||
|
|
||||||
if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) {
|
if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) {
|
||||||
String msg = getErrorMessageOfScriptExecutionThresholdReached();
|
String msg = getErrorMessageOfScriptExecutionThresholdReached();
|
||||||
LOG.error(msg);
|
LOG.error(msg);
|
||||||
throw new YarnException(msg);
|
throw new YarnException(msg);
|
||||||
}
|
}
|
||||||
|
|
||||||
String output;
|
|
||||||
try {
|
try {
|
||||||
output = Shell.execCommand(environment,
|
lastDiscoveredGpuInformation =
|
||||||
new String[] { pathOfGpuBinary, "-x", "-q" }, MAX_EXEC_TIMEOUT_MS);
|
nvidiaBinaryHelper.getGpuDeviceInformation(pathOfGpuBinary);
|
||||||
GpuDeviceInformationParser parser = new GpuDeviceInformationParser();
|
|
||||||
lastDiscoveredGpuInformation = parser.parseXml(output);
|
|
||||||
numOfErrorExecutionSinceLastSucceed = 0;
|
|
||||||
return lastDiscoveredGpuInformation;
|
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
numOfErrorExecutionSinceLastSucceed++;
|
numOfErrorExecutionSinceLastSucceed++;
|
||||||
String msg = getErrorMessageOfScriptExecution(e.getMessage());
|
String msg = getErrorMessageOfScriptExecution(e.getMessage());
|
||||||
|
@ -138,17 +132,18 @@ public class GpuDiscoverer {
|
||||||
} catch (YarnException e) {
|
} catch (YarnException e) {
|
||||||
numOfErrorExecutionSinceLastSucceed++;
|
numOfErrorExecutionSinceLastSucceed++;
|
||||||
String msg = getFailedToParseErrorMessage(e.getMessage());
|
String msg = getFailedToParseErrorMessage(e.getMessage());
|
||||||
if (LOG.isDebugEnabled()) {
|
LOG.debug(msg, e);
|
||||||
LOG.warn(msg, e);
|
|
||||||
}
|
|
||||||
throw e;
|
throw e;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return lastDiscoveredGpuInformation;
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean IsAutoDiscoveryEnabled() {
|
private boolean isAutoDiscoveryEnabled() {
|
||||||
String allowedDevicesStr = conf.get(
|
String allowedDevicesStr = conf.get(
|
||||||
YarnConfiguration.NM_GPU_ALLOWED_DEVICES,
|
YarnConfiguration.NM_GPU_ALLOWED_DEVICES,
|
||||||
YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);
|
YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);
|
||||||
|
|
||||||
return allowedDevicesStr.equals(
|
return allowedDevicesStr.equals(
|
||||||
YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);
|
YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);
|
||||||
}
|
}
|
||||||
|
@ -157,13 +152,12 @@ public class GpuDiscoverer {
|
||||||
* Get list of GPU devices usable by YARN.
|
* Get list of GPU devices usable by YARN.
|
||||||
*
|
*
|
||||||
* @return List of GPU devices
|
* @return List of GPU devices
|
||||||
* @throws YarnException when any issue happens
|
|
||||||
*/
|
*/
|
||||||
public synchronized List<GpuDevice> getGpusUsableByYarn()
|
public synchronized List<GpuDevice> getGpusUsableByYarn()
|
||||||
throws YarnException {
|
throws YarnException {
|
||||||
validateConfOrThrowException();
|
validateConfOrThrowException();
|
||||||
|
|
||||||
if (IsAutoDiscoveryEnabled()) {
|
if (isAutoDiscoveryEnabled()) {
|
||||||
return parseGpuDevicesFromAutoDiscoveredGpuInfo();
|
return parseGpuDevicesFromAutoDiscoveredGpuInfo();
|
||||||
} else {
|
} else {
|
||||||
if (gpuDevicesFromUser == null) {
|
if (gpuDevicesFromUser == null) {
|
||||||
|
@ -219,16 +213,27 @@ public class GpuDiscoverer {
|
||||||
if (device.trim().length() > 0) {
|
if (device.trim().length() > 0) {
|
||||||
String[] splitByColon = device.trim().split(":");
|
String[] splitByColon = device.trim().split(":");
|
||||||
if (splitByColon.length != 2) {
|
if (splitByColon.length != 2) {
|
||||||
throw GpuDeviceSpecificationException.
|
throwIfNecessary(GpuDeviceSpecificationException
|
||||||
createWithWrongValueSpecified(device, devices);
|
.createWithWrongValueSpecified(device, devices), conf);
|
||||||
|
LOG.warn("Wrong GPU specification string {}, ignored", device);
|
||||||
|
}
|
||||||
|
|
||||||
|
GpuDevice gpuDevice;
|
||||||
|
try {
|
||||||
|
gpuDevice = parseGpuDevice(splitByColon);
|
||||||
|
} catch (NumberFormatException e) {
|
||||||
|
throwIfNecessary(GpuDeviceSpecificationException
|
||||||
|
.createWithWrongValueSpecified(device, devices, e), conf);
|
||||||
|
LOG.warn("Cannot parse GPU device numbers: {}", device);
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
GpuDevice gpuDevice = parseGpuDevice(device, splitByColon, devices);
|
|
||||||
if (!gpuDevices.contains(gpuDevice)) {
|
if (!gpuDevices.contains(gpuDevice)) {
|
||||||
gpuDevices.add(gpuDevice);
|
gpuDevices.add(gpuDevice);
|
||||||
} else {
|
} else {
|
||||||
throw GpuDeviceSpecificationException
|
throwIfNecessary(GpuDeviceSpecificationException
|
||||||
.createWithDuplicateValueSpecified(device, devices);
|
.createWithDuplicateValueSpecified(device, devices), conf);
|
||||||
|
LOG.warn("CPU device is duplicated: {}", device);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -237,22 +242,18 @@ public class GpuDiscoverer {
|
||||||
return gpuDevices;
|
return gpuDevices;
|
||||||
}
|
}
|
||||||
|
|
||||||
private GpuDevice parseGpuDevice(String device, String[] splitByColon,
|
private GpuDevice parseGpuDevice(String[] splitByColon) {
|
||||||
String allowedDevicesStr) throws YarnException {
|
int index = Integer.parseInt(splitByColon[0]);
|
||||||
try {
|
int minorNumber = Integer.parseInt(splitByColon[1]);
|
||||||
int index = Integer.parseInt(splitByColon[0]);
|
return new GpuDevice(index, minorNumber);
|
||||||
int minorNumber = Integer.parseInt(splitByColon[1]);
|
|
||||||
return new GpuDevice(index, minorNumber);
|
|
||||||
} catch (NumberFormatException e) {
|
|
||||||
throw GpuDeviceSpecificationException.
|
|
||||||
createWithWrongValueSpecified(device, allowedDevicesStr, e);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public synchronized void initialize(Configuration config)
|
|
||||||
throws YarnException {
|
public synchronized void initialize(Configuration config,
|
||||||
|
NvidiaBinaryHelper nvidiaHelper) throws YarnException {
|
||||||
this.conf = config;
|
this.conf = config;
|
||||||
if (IsAutoDiscoveryEnabled()) {
|
this.nvidiaBinaryHelper = nvidiaHelper;
|
||||||
|
if (isAutoDiscoveryEnabled()) {
|
||||||
numOfErrorExecutionSinceLastSucceed = 0;
|
numOfErrorExecutionSinceLastSucceed = 0;
|
||||||
lookUpAutoDiscoveryBinary(config);
|
lookUpAutoDiscoveryBinary(config);
|
||||||
|
|
||||||
|
@ -286,7 +287,18 @@ public class GpuDiscoverer {
|
||||||
binaryPath = handleConfiguredBinaryPathIsDirectory(configuredBinaryFile);
|
binaryPath = handleConfiguredBinaryPathIsDirectory(configuredBinaryFile);
|
||||||
} else {
|
} else {
|
||||||
binaryPath = configuredBinaryFile;
|
binaryPath = configuredBinaryFile;
|
||||||
|
// If path exists but file name is incorrect don't execute the file
|
||||||
|
String fileName = binaryPath.getName();
|
||||||
|
if (DEFAULT_BINARY_NAME.equals(fileName)) {
|
||||||
|
String msg = String.format("Please check the configuration value of"
|
||||||
|
+" %s. It should point to an %s binary.",
|
||||||
|
YarnConfiguration.NM_GPU_PATH_TO_EXEC,
|
||||||
|
DEFAULT_BINARY_NAME);
|
||||||
|
throwIfNecessary(new YarnException(msg), config);
|
||||||
|
LOG.warn(msg);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pathOfGpuBinary = binaryPath.getAbsolutePath();
|
pathOfGpuBinary = binaryPath.getAbsolutePath();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -18,6 +18,9 @@
|
||||||
|
|
||||||
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
|
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
|
||||||
|
|
||||||
|
import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourcesExceptionUtil.throwIfNecessary;
|
||||||
|
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
import org.apache.hadoop.yarn.api.records.Resource;
|
import org.apache.hadoop.yarn.api.records.Resource;
|
||||||
import org.apache.hadoop.yarn.api.records.ResourceInformation;
|
import org.apache.hadoop.yarn.api.records.ResourceInformation;
|
||||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||||
|
@ -36,9 +39,12 @@ public class GpuNodeResourceUpdateHandler extends NodeResourceUpdaterPlugin {
|
||||||
private static final Logger LOG =
|
private static final Logger LOG =
|
||||||
LoggerFactory.getLogger(GpuNodeResourceUpdateHandler.class);
|
LoggerFactory.getLogger(GpuNodeResourceUpdateHandler.class);
|
||||||
private final GpuDiscoverer gpuDiscoverer;
|
private final GpuDiscoverer gpuDiscoverer;
|
||||||
|
private Configuration conf;
|
||||||
|
|
||||||
public GpuNodeResourceUpdateHandler(GpuDiscoverer gpuDiscoverer) {
|
public GpuNodeResourceUpdateHandler(GpuDiscoverer gpuDiscoverer,
|
||||||
|
Configuration conf) {
|
||||||
this.gpuDiscoverer = gpuDiscoverer;
|
this.gpuDiscoverer = gpuDiscoverer;
|
||||||
|
this.conf = conf;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -51,7 +57,8 @@ public class GpuNodeResourceUpdateHandler extends NodeResourceUpdaterPlugin {
|
||||||
"but could not find any usable GPUs on the NodeManager!";
|
"but could not find any usable GPUs on the NodeManager!";
|
||||||
LOG.error(message);
|
LOG.error(message);
|
||||||
// No gpu can be used by YARN.
|
// No gpu can be used by YARN.
|
||||||
throw new YarnException(message);
|
throwIfNecessary(new YarnException(message), conf);
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
long nUsableGpus = usableGpus.size();
|
long nUsableGpus = usableGpus.size();
|
||||||
|
@ -59,7 +66,7 @@ public class GpuNodeResourceUpdateHandler extends NodeResourceUpdaterPlugin {
|
||||||
Map<String, ResourceInformation> configuredResourceTypes =
|
Map<String, ResourceInformation> configuredResourceTypes =
|
||||||
ResourceUtils.getResourceTypes();
|
ResourceUtils.getResourceTypes();
|
||||||
if (!configuredResourceTypes.containsKey(GPU_URI)) {
|
if (!configuredResourceTypes.containsKey(GPU_URI)) {
|
||||||
throw new YarnException("Found " + nUsableGpus + " usable GPUs, however "
|
LOG.warn("Found " + nUsableGpus + " usable GPUs, however "
|
||||||
+ GPU_URI
|
+ GPU_URI
|
||||||
+ " resource-type is not configured inside"
|
+ " resource-type is not configured inside"
|
||||||
+ " resource-types.xml, please configure it to enable GPU feature or"
|
+ " resource-types.xml, please configure it to enable GPU feature or"
|
||||||
|
|
|
@ -18,6 +18,8 @@
|
||||||
|
|
||||||
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
|
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.Context;
|
import org.apache.hadoop.yarn.server.nodemanager.Context;
|
||||||
|
@ -32,8 +34,6 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.NMResourceInfo;
|
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.NMResourceInfo;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
|
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.NMGpuResourceInfo;
|
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.NMGpuResourceInfo;
|
||||||
|
|
||||||
import java.util.List;
|
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
@ -44,6 +44,10 @@ public class GpuResourcePlugin implements ResourcePlugin {
|
||||||
|
|
||||||
private final GpuNodeResourceUpdateHandler resourceDiscoverHandler;
|
private final GpuNodeResourceUpdateHandler resourceDiscoverHandler;
|
||||||
private final GpuDiscoverer gpuDiscoverer;
|
private final GpuDiscoverer gpuDiscoverer;
|
||||||
|
public static final int MAX_REPEATED_ERROR_ALLOWED = 10;
|
||||||
|
|
||||||
|
private int numOfErrorExecutionSinceLastSucceed = 0;
|
||||||
|
|
||||||
private GpuResourceHandlerImpl gpuResourceHandler = null;
|
private GpuResourceHandlerImpl gpuResourceHandler = null;
|
||||||
private DockerCommandPlugin dockerCommandPlugin = null;
|
private DockerCommandPlugin dockerCommandPlugin = null;
|
||||||
|
|
||||||
|
@ -55,7 +59,8 @@ public class GpuResourcePlugin implements ResourcePlugin {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void initialize(Context context) throws YarnException {
|
public void initialize(Context context) throws YarnException {
|
||||||
this.gpuDiscoverer.initialize(context.getConf());
|
this.gpuDiscoverer.initialize(context.getConf(),
|
||||||
|
new NvidiaBinaryHelper());
|
||||||
this.dockerCommandPlugin =
|
this.dockerCommandPlugin =
|
||||||
GpuDockerCommandPluginFactory.createGpuDockerCommandPlugin(
|
GpuDockerCommandPluginFactory.createGpuDockerCommandPlugin(
|
||||||
context.getConf());
|
context.getConf());
|
||||||
|
@ -89,12 +94,21 @@ public class GpuResourcePlugin implements ResourcePlugin {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public synchronized NMResourceInfo getNMResourceInfo() throws YarnException {
|
public synchronized NMResourceInfo getNMResourceInfo() throws YarnException {
|
||||||
GpuDeviceInformation gpuDeviceInformation =
|
GpuDeviceInformation gpuDeviceInformation;
|
||||||
gpuDiscoverer.getGpuDeviceInformation();
|
|
||||||
|
|
||||||
//At this point the gpu plugin is already enabled
|
//At this point the gpu plugin is already enabled
|
||||||
checkGpuResourceHandler();
|
checkGpuResourceHandler();
|
||||||
|
|
||||||
|
checkErrorCount();
|
||||||
|
try{
|
||||||
|
gpuDeviceInformation = gpuDiscoverer.getGpuDeviceInformation();
|
||||||
|
numOfErrorExecutionSinceLastSucceed = 0;
|
||||||
|
} catch (YarnException e) {
|
||||||
|
LOG.error(e.getMessage(), e);
|
||||||
|
numOfErrorExecutionSinceLastSucceed++;
|
||||||
|
throw e;
|
||||||
|
}
|
||||||
|
|
||||||
GpuResourceAllocator gpuResourceAllocator =
|
GpuResourceAllocator gpuResourceAllocator =
|
||||||
gpuResourceHandler.getGpuAllocator();
|
gpuResourceHandler.getGpuAllocator();
|
||||||
List<GpuDevice> totalGpus = gpuResourceAllocator.getAllowedGpus();
|
List<GpuDevice> totalGpus = gpuResourceAllocator.getAllowedGpus();
|
||||||
|
@ -116,6 +130,17 @@ public class GpuResourcePlugin implements ResourcePlugin {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void checkErrorCount() throws YarnException {
|
||||||
|
if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) {
|
||||||
|
String msg =
|
||||||
|
"Failed to execute GPU device information detection script for "
|
||||||
|
+ MAX_REPEATED_ERROR_ALLOWED
|
||||||
|
+ " times, skip following executions.";
|
||||||
|
LOG.error(msg);
|
||||||
|
throw new YarnException(msg);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return GpuResourcePlugin.class.getName();
|
return GpuResourcePlugin.class.getName();
|
||||||
|
|
|
@ -0,0 +1,63 @@
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.HashMap;
|
||||||
|
|
||||||
|
import org.apache.hadoop.util.Shell;
|
||||||
|
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||||
|
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||||
|
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
|
||||||
|
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformationParser;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Executes the "nvidia-smi" command and returns an object
|
||||||
|
* based on its output.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public class NvidiaBinaryHelper {
|
||||||
|
/**
|
||||||
|
* command should not run more than 10 sec.
|
||||||
|
*/
|
||||||
|
private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param pathOfGpuBinary The path of the binary
|
||||||
|
* @return the GpuDeviceInformation parsed from the nvidia-smi output
|
||||||
|
* @throws IOException if the binary output is not readable
|
||||||
|
* @throws YarnException if the pathOfGpuBinary is null,
|
||||||
|
* or the output parse failed
|
||||||
|
*/
|
||||||
|
synchronized GpuDeviceInformation getGpuDeviceInformation(
|
||||||
|
String pathOfGpuBinary) throws IOException, YarnException {
|
||||||
|
GpuDeviceInformationParser parser = new GpuDeviceInformationParser();
|
||||||
|
|
||||||
|
if (pathOfGpuBinary == null) {
|
||||||
|
throw new YarnException(
|
||||||
|
"Failed to find GPU discovery executable, please double check "
|
||||||
|
+ YarnConfiguration.NM_GPU_PATH_TO_EXEC + " setting.");
|
||||||
|
}
|
||||||
|
|
||||||
|
String output = Shell.execCommand(new HashMap<>(),
|
||||||
|
new String[]{pathOfGpuBinary, "-x", "-q"}, MAX_EXEC_TIMEOUT_MS);
|
||||||
|
return parser.parseXml(output);
|
||||||
|
}
|
||||||
|
}
|
|
@ -41,6 +41,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resource
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDevice;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDevice;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer;
|
||||||
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.NvidiaBinaryHelper;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerRuntimeConstants;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerRuntimeConstants;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService;
|
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService;
|
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService;
|
||||||
|
@ -116,11 +117,13 @@ public class TestGpuResourceHandler {
|
||||||
@Rule
|
@Rule
|
||||||
public ExpectedException expected = ExpectedException.none();
|
public ExpectedException expected = ExpectedException.none();
|
||||||
|
|
||||||
|
private NvidiaBinaryHelper nvidiaBinaryHelper;
|
||||||
|
|
||||||
@Before
|
@Before
|
||||||
public void setup() throws IOException {
|
public void setup() throws IOException {
|
||||||
createTestDataDirectory();
|
createTestDataDirectory();
|
||||||
|
|
||||||
TestResourceUtils.addNewTypesToResources(ResourceInformation.GPU_URI);
|
TestResourceUtils.addNewTypesToResources(ResourceInformation.GPU_URI);
|
||||||
|
nvidiaBinaryHelper = new NvidiaBinaryHelper();
|
||||||
|
|
||||||
mockCGroupsHandler = mock(CGroupsHandler.class);
|
mockCGroupsHandler = mock(CGroupsHandler.class);
|
||||||
mockPrivilegedExecutor = mock(PrivilegedOperationExecutor.class);
|
mockPrivilegedExecutor = mock(PrivilegedOperationExecutor.class);
|
||||||
|
@ -146,13 +149,14 @@ public class TestGpuResourceHandler {
|
||||||
@After
|
@After
|
||||||
public void cleanupTestFiles() throws IOException {
|
public void cleanupTestFiles() throws IOException {
|
||||||
FileUtils.deleteDirectory(testDataDirectory);
|
FileUtils.deleteDirectory(testDataDirectory);
|
||||||
|
nvidiaBinaryHelper = new NvidiaBinaryHelper();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testBootstrapWithRealGpuDiscoverer() throws Exception {
|
public void testBootstrapWithRealGpuDiscoverer() throws Exception {
|
||||||
Configuration conf = createDefaultConfig();
|
Configuration conf = createDefaultConfig();
|
||||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0");
|
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0");
|
||||||
gpuDiscoverer.initialize(conf);
|
gpuDiscoverer.initialize(conf, nvidiaBinaryHelper);
|
||||||
|
|
||||||
gpuResourceHandler.bootstrap(conf);
|
gpuResourceHandler.bootstrap(conf);
|
||||||
|
|
||||||
|
@ -170,7 +174,7 @@ public class TestGpuResourceHandler {
|
||||||
public void testBootstrapWithMockGpuDiscoverer() throws Exception {
|
public void testBootstrapWithMockGpuDiscoverer() throws Exception {
|
||||||
GpuDiscoverer mockDiscoverer = mock(GpuDiscoverer.class);
|
GpuDiscoverer mockDiscoverer = mock(GpuDiscoverer.class);
|
||||||
Configuration conf = new YarnConfiguration();
|
Configuration conf = new YarnConfiguration();
|
||||||
mockDiscoverer.initialize(conf);
|
mockDiscoverer.initialize(conf, nvidiaBinaryHelper);
|
||||||
|
|
||||||
expected.expect(ResourceHandlerException.class);
|
expected.expect(ResourceHandlerException.class);
|
||||||
gpuResourceHandler.bootstrap(conf);
|
gpuResourceHandler.bootstrap(conf);
|
||||||
|
@ -270,7 +274,7 @@ public class TestGpuResourceHandler {
|
||||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
|
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
|
||||||
|
|
||||||
gpuDiscoverer = new GpuDiscoverer();
|
gpuDiscoverer = new GpuDiscoverer();
|
||||||
gpuDiscoverer.initialize(conf);
|
gpuDiscoverer.initialize(conf, nvidiaBinaryHelper);
|
||||||
Context nmContext = createMockNmContext(conf);
|
Context nmContext = createMockNmContext(conf);
|
||||||
gpuResourceHandler = new GpuResourceHandlerImpl(nmContext,
|
gpuResourceHandler = new GpuResourceHandlerImpl(nmContext,
|
||||||
mockCGroupsHandler, mockPrivilegedExecutor, gpuDiscoverer);
|
mockCGroupsHandler, mockPrivilegedExecutor, gpuDiscoverer);
|
||||||
|
@ -379,7 +383,7 @@ public class TestGpuResourceHandler {
|
||||||
public void testAllocationWithoutAllowedGpus() throws Exception {
|
public void testAllocationWithoutAllowedGpus() throws Exception {
|
||||||
Configuration conf = createDefaultConfig();
|
Configuration conf = createDefaultConfig();
|
||||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, " ");
|
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, " ");
|
||||||
gpuDiscoverer.initialize(conf);
|
gpuDiscoverer.initialize(conf, nvidiaBinaryHelper);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
gpuResourceHandler.bootstrap(conf);
|
gpuResourceHandler.bootstrap(conf);
|
||||||
|
@ -460,7 +464,7 @@ public class TestGpuResourceHandler {
|
||||||
new GpuResourceHandlerImpl(nmnctx, mockCGroupsHandler,
|
new GpuResourceHandlerImpl(nmnctx, mockCGroupsHandler,
|
||||||
mockPrivilegedExecutor, gpuDiscoverer);
|
mockPrivilegedExecutor, gpuDiscoverer);
|
||||||
|
|
||||||
gpuDiscoverer.initialize(conf);
|
gpuDiscoverer.initialize(conf, nvidiaBinaryHelper);
|
||||||
|
|
||||||
gpuNULLStateResourceHandler.bootstrap(conf);
|
gpuNULLStateResourceHandler.bootstrap(conf);
|
||||||
verifyNumberOfAvailableGpus(4, gpuNULLStateResourceHandler);
|
verifyNumberOfAvailableGpus(4, gpuNULLStateResourceHandler);
|
||||||
|
|
|
@ -64,6 +64,7 @@ public class TestGpuDiscoverer {
|
||||||
private static final String BASH_SHEBANG = "#!/bin/bash\n\n";
|
private static final String BASH_SHEBANG = "#!/bin/bash\n\n";
|
||||||
private static final String TEST_PARENT_DIR = new File("target/temp/" +
|
private static final String TEST_PARENT_DIR = new File("target/temp/" +
|
||||||
TestGpuDiscoverer.class.getName()).getAbsolutePath();
|
TestGpuDiscoverer.class.getName()).getAbsolutePath();
|
||||||
|
private NvidiaBinaryHelper binaryHelper = new NvidiaBinaryHelper();
|
||||||
|
|
||||||
@Rule
|
@Rule
|
||||||
public ExpectedException exception = ExpectedException.none();
|
public ExpectedException exception = ExpectedException.none();
|
||||||
|
@ -150,7 +151,7 @@ public class TestGpuDiscoverer {
|
||||||
Configuration conf) throws YarnException {
|
Configuration conf) throws YarnException {
|
||||||
conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, TEST_PARENT_DIR);
|
conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, TEST_PARENT_DIR);
|
||||||
GpuDiscoverer discoverer = new GpuDiscoverer();
|
GpuDiscoverer discoverer = new GpuDiscoverer();
|
||||||
discoverer.initialize(conf);
|
discoverer.initialize(conf, binaryHelper);
|
||||||
return discoverer;
|
return discoverer;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -163,14 +164,14 @@ public class TestGpuDiscoverer {
|
||||||
// test case 1, check default setting.
|
// test case 1, check default setting.
|
||||||
Configuration conf = new Configuration(false);
|
Configuration conf = new Configuration(false);
|
||||||
GpuDiscoverer discoverer = new GpuDiscoverer();
|
GpuDiscoverer discoverer = new GpuDiscoverer();
|
||||||
discoverer.initialize(conf);
|
discoverer.initialize(conf, binaryHelper);
|
||||||
assertEquals(DEFAULT_BINARY_NAME, discoverer.getPathOfGpuBinary());
|
assertEquals(DEFAULT_BINARY_NAME, discoverer.getPathOfGpuBinary());
|
||||||
assertNvidiaIsOnPath(discoverer);
|
assertNvidiaIsOnPath(discoverer);
|
||||||
|
|
||||||
// test case 2, check mandatory set path.
|
// test case 2, check mandatory set path.
|
||||||
File fakeBinary = setupFakeBinary(conf);
|
File fakeBinary = setupFakeBinary(conf);
|
||||||
discoverer = new GpuDiscoverer();
|
discoverer = new GpuDiscoverer();
|
||||||
discoverer.initialize(conf);
|
discoverer.initialize(conf, binaryHelper);
|
||||||
assertEquals(fakeBinary.getAbsolutePath(),
|
assertEquals(fakeBinary.getAbsolutePath(),
|
||||||
discoverer.getPathOfGpuBinary());
|
discoverer.getPathOfGpuBinary());
|
||||||
assertNull(discoverer.getEnvironmentToRunCommand().get(PATH));
|
assertNull(discoverer.getEnvironmentToRunCommand().get(PATH));
|
||||||
|
@ -179,7 +180,7 @@ public class TestGpuDiscoverer {
|
||||||
// but binary doesn't exist so default path will be used.
|
// but binary doesn't exist so default path will be used.
|
||||||
fakeBinary.delete();
|
fakeBinary.delete();
|
||||||
discoverer = new GpuDiscoverer();
|
discoverer = new GpuDiscoverer();
|
||||||
discoverer.initialize(conf);
|
discoverer.initialize(conf, binaryHelper);
|
||||||
assertEquals(DEFAULT_BINARY_NAME,
|
assertEquals(DEFAULT_BINARY_NAME,
|
||||||
discoverer.getPathOfGpuBinary());
|
discoverer.getPathOfGpuBinary());
|
||||||
assertNvidiaIsOnPath(discoverer);
|
assertNvidiaIsOnPath(discoverer);
|
||||||
|
@ -310,14 +311,14 @@ public class TestGpuDiscoverer {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testGpuDiscover() throws YarnException {
|
public void testGpuDiscover() throws YarnException, IOException {
|
||||||
// Since this is more of a performance unit test, only run if
|
// Since this is more of a performance unit test, only run if
|
||||||
// runGpuDiscoverUnitTest is set (-DrunGpuDiscoverUnitTest=true)
|
// runGpuDiscoverUnitTest is set (-DrunGpuDiscoverUnitTest=true)
|
||||||
Assume.assumeTrue(
|
Assume.assumeTrue(
|
||||||
Boolean.valueOf(System.getProperty("runGpuDiscoverUnitTest")));
|
Boolean.valueOf(System.getProperty("runGpuDiscoverUnitTest")));
|
||||||
Configuration conf = new Configuration(false);
|
Configuration conf = new Configuration(false);
|
||||||
GpuDiscoverer discoverer = new GpuDiscoverer();
|
GpuDiscoverer discoverer = new GpuDiscoverer();
|
||||||
discoverer.initialize(conf);
|
discoverer.initialize(conf, binaryHelper);
|
||||||
GpuDeviceInformation info = discoverer.getGpuDeviceInformation();
|
GpuDeviceInformation info = discoverer.getGpuDeviceInformation();
|
||||||
|
|
||||||
assertTrue(info.getGpus().size() > 0);
|
assertTrue(info.getGpus().size() > 0);
|
||||||
|
@ -331,7 +332,7 @@ public class TestGpuDiscoverer {
|
||||||
Configuration conf = createConfigWithAllowedDevices("1:2");
|
Configuration conf = createConfigWithAllowedDevices("1:2");
|
||||||
|
|
||||||
GpuDiscoverer discoverer = new GpuDiscoverer();
|
GpuDiscoverer discoverer = new GpuDiscoverer();
|
||||||
discoverer.initialize(conf);
|
discoverer.initialize(conf, binaryHelper);
|
||||||
List<GpuDevice> usableGpuDevices = discoverer.getGpusUsableByYarn();
|
List<GpuDevice> usableGpuDevices = discoverer.getGpusUsableByYarn();
|
||||||
assertEquals(1, usableGpuDevices.size());
|
assertEquals(1, usableGpuDevices.size());
|
||||||
|
|
||||||
|
@ -346,7 +347,7 @@ public class TestGpuDiscoverer {
|
||||||
|
|
||||||
exception.expect(GpuDeviceSpecificationException.class);
|
exception.expect(GpuDeviceSpecificationException.class);
|
||||||
GpuDiscoverer discoverer = new GpuDiscoverer();
|
GpuDiscoverer discoverer = new GpuDiscoverer();
|
||||||
discoverer.initialize(conf);
|
discoverer.initialize(conf, binaryHelper);
|
||||||
discoverer.getGpusUsableByYarn();
|
discoverer.getGpusUsableByYarn();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -354,7 +355,7 @@ public class TestGpuDiscoverer {
|
||||||
public void testGetNumberOfUsableGpusFromConfig() throws YarnException {
|
public void testGetNumberOfUsableGpusFromConfig() throws YarnException {
|
||||||
Configuration conf = createConfigWithAllowedDevices("0:0,1:1,2:2,3:4");
|
Configuration conf = createConfigWithAllowedDevices("0:0,1:1,2:2,3:4");
|
||||||
GpuDiscoverer discoverer = new GpuDiscoverer();
|
GpuDiscoverer discoverer = new GpuDiscoverer();
|
||||||
discoverer.initialize(conf);
|
discoverer.initialize(conf, binaryHelper);
|
||||||
|
|
||||||
List<GpuDevice> usableGpuDevices = discoverer.getGpusUsableByYarn();
|
List<GpuDevice> usableGpuDevices = discoverer.getGpusUsableByYarn();
|
||||||
assertEquals(4, usableGpuDevices.size());
|
assertEquals(4, usableGpuDevices.size());
|
||||||
|
@ -379,7 +380,7 @@ public class TestGpuDiscoverer {
|
||||||
|
|
||||||
exception.expect(GpuDeviceSpecificationException.class);
|
exception.expect(GpuDeviceSpecificationException.class);
|
||||||
GpuDiscoverer discoverer = new GpuDiscoverer();
|
GpuDiscoverer discoverer = new GpuDiscoverer();
|
||||||
discoverer.initialize(conf);
|
discoverer.initialize(conf, binaryHelper);
|
||||||
discoverer.getGpusUsableByYarn();
|
discoverer.getGpusUsableByYarn();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -390,7 +391,7 @@ public class TestGpuDiscoverer {
|
||||||
|
|
||||||
exception.expect(GpuDeviceSpecificationException.class);
|
exception.expect(GpuDeviceSpecificationException.class);
|
||||||
GpuDiscoverer discoverer = new GpuDiscoverer();
|
GpuDiscoverer discoverer = new GpuDiscoverer();
|
||||||
discoverer.initialize(conf);
|
discoverer.initialize(conf, binaryHelper);
|
||||||
discoverer.getGpusUsableByYarn();
|
discoverer.getGpusUsableByYarn();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -401,7 +402,7 @@ public class TestGpuDiscoverer {
|
||||||
|
|
||||||
exception.expect(GpuDeviceSpecificationException.class);
|
exception.expect(GpuDeviceSpecificationException.class);
|
||||||
GpuDiscoverer discoverer = new GpuDiscoverer();
|
GpuDiscoverer discoverer = new GpuDiscoverer();
|
||||||
discoverer.initialize(conf);
|
discoverer.initialize(conf, binaryHelper);
|
||||||
discoverer.getGpusUsableByYarn();
|
discoverer.getGpusUsableByYarn();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -412,7 +413,7 @@ public class TestGpuDiscoverer {
|
||||||
|
|
||||||
exception.expect(GpuDeviceSpecificationException.class);
|
exception.expect(GpuDeviceSpecificationException.class);
|
||||||
GpuDiscoverer discoverer = new GpuDiscoverer();
|
GpuDiscoverer discoverer = new GpuDiscoverer();
|
||||||
discoverer.initialize(conf);
|
discoverer.initialize(conf, binaryHelper);
|
||||||
discoverer.getGpusUsableByYarn();
|
discoverer.getGpusUsableByYarn();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -423,7 +424,7 @@ public class TestGpuDiscoverer {
|
||||||
|
|
||||||
exception.expect(GpuDeviceSpecificationException.class);
|
exception.expect(GpuDeviceSpecificationException.class);
|
||||||
GpuDiscoverer discoverer = new GpuDiscoverer();
|
GpuDiscoverer discoverer = new GpuDiscoverer();
|
||||||
discoverer.initialize(conf);
|
discoverer.initialize(conf, binaryHelper);
|
||||||
discoverer.getGpusUsableByYarn();
|
discoverer.getGpusUsableByYarn();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -434,7 +435,7 @@ public class TestGpuDiscoverer {
|
||||||
|
|
||||||
exception.expect(GpuDeviceSpecificationException.class);
|
exception.expect(GpuDeviceSpecificationException.class);
|
||||||
GpuDiscoverer discoverer = new GpuDiscoverer();
|
GpuDiscoverer discoverer = new GpuDiscoverer();
|
||||||
discoverer.initialize(conf);
|
discoverer.initialize(conf, binaryHelper);
|
||||||
discoverer.getGpusUsableByYarn();
|
discoverer.getGpusUsableByYarn();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -445,7 +446,7 @@ public class TestGpuDiscoverer {
|
||||||
|
|
||||||
exception.expect(GpuDeviceSpecificationException.class);
|
exception.expect(GpuDeviceSpecificationException.class);
|
||||||
GpuDiscoverer discoverer = new GpuDiscoverer();
|
GpuDiscoverer discoverer = new GpuDiscoverer();
|
||||||
discoverer.initialize(conf);
|
discoverer.initialize(conf, binaryHelper);
|
||||||
discoverer.getGpusUsableByYarn();
|
discoverer.getGpusUsableByYarn();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -456,7 +457,7 @@ public class TestGpuDiscoverer {
|
||||||
|
|
||||||
exception.expect(GpuDeviceSpecificationException.class);
|
exception.expect(GpuDeviceSpecificationException.class);
|
||||||
GpuDiscoverer discoverer = new GpuDiscoverer();
|
GpuDiscoverer discoverer = new GpuDiscoverer();
|
||||||
discoverer.initialize(conf);
|
discoverer.initialize(conf, binaryHelper);
|
||||||
discoverer.getGpusUsableByYarn();
|
discoverer.getGpusUsableByYarn();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -467,7 +468,7 @@ public class TestGpuDiscoverer {
|
||||||
|
|
||||||
exception.expect(GpuDeviceSpecificationException.class);
|
exception.expect(GpuDeviceSpecificationException.class);
|
||||||
GpuDiscoverer discoverer = new GpuDiscoverer();
|
GpuDiscoverer discoverer = new GpuDiscoverer();
|
||||||
discoverer.initialize(conf);
|
discoverer.initialize(conf, binaryHelper);
|
||||||
discoverer.getGpusUsableByYarn();
|
discoverer.getGpusUsableByYarn();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -478,7 +479,7 @@ public class TestGpuDiscoverer {
|
||||||
|
|
||||||
exception.expect(GpuDeviceSpecificationException.class);
|
exception.expect(GpuDeviceSpecificationException.class);
|
||||||
GpuDiscoverer discoverer = new GpuDiscoverer();
|
GpuDiscoverer discoverer = new GpuDiscoverer();
|
||||||
discoverer.initialize(conf);
|
discoverer.initialize(conf, binaryHelper);
|
||||||
discoverer.getGpusUsableByYarn();
|
discoverer.getGpusUsableByYarn();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -488,7 +489,7 @@ public class TestGpuDiscoverer {
|
||||||
conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, "/blabla");
|
conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, "/blabla");
|
||||||
GpuDiscoverer plugin = new GpuDiscoverer();
|
GpuDiscoverer plugin = new GpuDiscoverer();
|
||||||
try {
|
try {
|
||||||
plugin.initialize(conf);
|
plugin.initialize(conf, binaryHelper);
|
||||||
plugin.getGpusUsableByYarn();
|
plugin.getGpusUsableByYarn();
|
||||||
fail("Illegal format, should fail.");
|
fail("Illegal format, should fail.");
|
||||||
} catch (YarnException e) {
|
} catch (YarnException e) {
|
||||||
|
@ -501,13 +502,13 @@ public class TestGpuDiscoverer {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testScriptNotCalled() throws YarnException {
|
public void testScriptNotCalled() throws YarnException, IOException {
|
||||||
Configuration conf = new Configuration();
|
Configuration conf = new Configuration();
|
||||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:1,2:3");
|
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:1,2:3");
|
||||||
|
|
||||||
GpuDiscoverer gpuSpy = spy(new GpuDiscoverer());
|
GpuDiscoverer gpuSpy = spy(new GpuDiscoverer());
|
||||||
|
|
||||||
gpuSpy.initialize(conf);
|
gpuSpy.initialize(conf, binaryHelper);
|
||||||
gpuSpy.getGpusUsableByYarn();
|
gpuSpy.getGpusUsableByYarn();
|
||||||
|
|
||||||
verify(gpuSpy, never()).getGpuDeviceInformation();
|
verify(gpuSpy, never()).getGpuDeviceInformation();
|
||||||
|
|
Loading…
Reference in New Issue