YARN-9217. Nodemanager will fail to start if GPU is misconfigured on the node or GPU drivers missing. Contributed by Peter Bacsko

This commit is contained in:
Szilard Nemeth 2019-08-21 16:49:34 +02:00
parent 69255fa1b9
commit 6980f1740f
11 changed files with 278 additions and 97 deletions

View File

@ -1612,6 +1612,20 @@ public class YarnConfiguration extends Configuration {
public static final String NM_RESOURCE_PLUGINS =
NM_PREFIX + "resource-plugins";
/**
* Specifies whether the Node Manager should fail fast during initialization
* when a device (GPU, FPGA, etc) is missing from the system or an error
* occurs during device discovery. If set to "true", an exception is thrown
* in those cases; if set to "false", initialization is allowed to continue.
*/
@Private
public static final String NM_RESOURCE_PLUGINS_FAIL_FAST =
NM_RESOURCE_PLUGINS + ".fail-fast";
@Private
public static final boolean DEFAULT_NM_RESOURCE_PLUGINS_FAIL_FAST = true;
/**
* Prefix for gpu configurations. Work in progress: This configuration
* parameter may be changed/removed in the future.

View File

@ -3798,6 +3798,17 @@
<value></value>
</property>
<property>
<description>
Specifies whether the Node Manager should fail fast during initialization
when a device (GPU, FPGA, etc) is missing from the system or an error
occurs during device discovery. If set to "true", an exception is thrown
in those cases; if set to "false", initialization is allowed to continue.
</description>
<name>yarn.nodemanager.resource-plugins.fail-fast</name>
<value></value>
</property>
<property>
<description>
Specify GPU devices which can be managed by YARN NodeManager, split by comma

View File

@ -0,0 +1,42 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources;
import static org.apache.hadoop.yarn.conf.YarnConfiguration.DEFAULT_NM_RESOURCE_PLUGINS_FAIL_FAST;
import static org.apache.hadoop.yarn.conf.YarnConfiguration.NM_RESOURCE_PLUGINS_FAIL_FAST;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.exceptions.YarnException;
/**
 * Helper for resource-plugin code that decides whether a failure has to be
 * propagated, based on the NM_RESOURCE_PLUGINS_FAIL_FAST property.
 */
public final class ResourcesExceptionUtil {

  /** Only static helpers live here, so the class is never instantiated. */
  private ResourcesExceptionUtil() {
  }

  /**
   * Re-throws the supplied exception when fail-fast is switched on and
   * returns normally otherwise.
   *
   * @param e the exception to propagate if fail-fast is enabled
   * @param conf configuration from which NM_RESOURCE_PLUGINS_FAIL_FAST is
   *          read; DEFAULT_NM_RESOURCE_PLUGINS_FAIL_FAST is passed as the
   *          fallback value
   * @throws YarnException the very same {@code e}, if fail-fast is enabled
   */
  public static void throwIfNecessary(YarnException e, Configuration conf)
      throws YarnException {
    final boolean failFast = conf.getBoolean(
        NM_RESOURCE_PLUGINS_FAIL_FAST, DEFAULT_NM_RESOURCE_PLUGINS_FAIL_FAST);
    if (!failFast) {
      // Tolerant mode: returning lets the caller carry on after the problem.
      return;
    }
    throw e;
  }
}

View File

@ -18,6 +18,12 @@
package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu;
import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourcesExceptionUtil.throwIfNecessary;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
@ -36,10 +42,6 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDevice;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
public class GpuResourceHandlerImpl implements ResourceHandler {
final static Log LOG = LogFactory
.getLog(GpuResourceHandlerImpl.class);
@ -75,7 +77,8 @@ public class GpuResourceHandlerImpl implements ResourceHandler {
String message = "GPU is enabled on the NodeManager, but couldn't find "
+ "any usable GPU devices, please double check configuration!";
LOG.error(message);
throw new ResourceHandlerException(message);
throwIfNecessary(new ResourceHandlerException(message),
configuration);
}
} catch (YarnException e) {
LOG.error("Exception when trying to get usable GPU device", e);

View File

@ -60,7 +60,7 @@ public class ResourcePluginManager {
Map<String, ResourcePlugin> pluginMap = Maps.newHashMap();
if (plugins != null) {
pluginMap = initializePlugins(context, plugins);
pluginMap = initializePlugins(conf, context, plugins);
}
configuredPlugins = Collections.unmodifiableMap(pluginMap);
@ -77,8 +77,7 @@ public class ResourcePluginManager {
return plugins;
}
private Map<String, ResourcePlugin> initializePlugins(
private Map<String, ResourcePlugin> initializePlugins(Configuration conf,
Context context, String[] plugins) throws YarnException {
Map<String, ResourcePlugin> pluginMap = Maps.newHashMap();
@ -91,7 +90,7 @@ public class ResourcePluginManager {
if (resourceName.equals(GPU_URI)) {
final GpuDiscoverer gpuDiscoverer = new GpuDiscoverer();
final GpuNodeResourceUpdateHandler updateHandler =
new GpuNodeResourceUpdateHandler(gpuDiscoverer);
new GpuNodeResourceUpdateHandler(gpuDiscoverer, conf);
plugin = new GpuResourcePlugin(updateHandler, gpuDiscoverer);
} else if (resourceName.equals(FPGA_URI)) {
plugin = new FpgaResourcePlugin();

View File

@ -18,21 +18,7 @@
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.Shell;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformationParser;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.PerGpuDeviceInformation;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourcesExceptionUtil.throwIfNecessary;
import java.io.File;
import java.io.IOException;
@ -42,6 +28,22 @@ import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformationParser;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.PerGpuDeviceInformation;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
@InterfaceAudience.Private
@InterfaceStability.Unstable
@ -57,11 +59,10 @@ public class GpuDiscoverer {
private static final Set<String> DEFAULT_BINARY_SEARCH_DIRS = ImmutableSet.of(
"/usr/bin", "/bin", "/usr/local/nvidia/bin");
// command should not run more than 10 sec.
private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000;
private static final int MAX_REPEATED_ERROR_ALLOWED = 10;
private Configuration conf = null;
private NvidiaBinaryHelper nvidiaBinaryHelper;
private String pathOfGpuBinary = null;
private Map<String, String> environment = new HashMap<>();
@ -110,24 +111,17 @@ public class GpuDiscoverer {
* @return GpuDeviceInformation
* @throws YarnException when any error happens
*/
synchronized GpuDeviceInformation getGpuDeviceInformation()
public synchronized GpuDeviceInformation getGpuDeviceInformation()
throws YarnException {
validateConfOrThrowException();
if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) {
String msg = getErrorMessageOfScriptExecutionThresholdReached();
LOG.error(msg);
throw new YarnException(msg);
}
String output;
try {
output = Shell.execCommand(environment,
new String[] { pathOfGpuBinary, "-x", "-q" }, MAX_EXEC_TIMEOUT_MS);
GpuDeviceInformationParser parser = new GpuDeviceInformationParser();
lastDiscoveredGpuInformation = parser.parseXml(output);
numOfErrorExecutionSinceLastSucceed = 0;
return lastDiscoveredGpuInformation;
lastDiscoveredGpuInformation =
nvidiaBinaryHelper.getGpuDeviceInformation(pathOfGpuBinary);
} catch (IOException e) {
numOfErrorExecutionSinceLastSucceed++;
String msg = getErrorMessageOfScriptExecution(e.getMessage());
@ -138,17 +132,18 @@ public class GpuDiscoverer {
} catch (YarnException e) {
numOfErrorExecutionSinceLastSucceed++;
String msg = getFailedToParseErrorMessage(e.getMessage());
if (LOG.isDebugEnabled()) {
LOG.warn(msg, e);
}
LOG.debug(msg, e);
throw e;
}
return lastDiscoveredGpuInformation;
}
private boolean IsAutoDiscoveryEnabled() {
private boolean isAutoDiscoveryEnabled() {
String allowedDevicesStr = conf.get(
YarnConfiguration.NM_GPU_ALLOWED_DEVICES,
YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);
return allowedDevicesStr.equals(
YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);
}
@ -157,13 +152,12 @@ public class GpuDiscoverer {
* Get list of GPU devices usable by YARN.
*
* @return List of GPU devices
* @throws YarnException when any issue happens
*/
public synchronized List<GpuDevice> getGpusUsableByYarn()
throws YarnException {
validateConfOrThrowException();
if (IsAutoDiscoveryEnabled()) {
if (isAutoDiscoveryEnabled()) {
return parseGpuDevicesFromAutoDiscoveredGpuInfo();
} else {
if (gpuDevicesFromUser == null) {
@ -219,16 +213,27 @@ public class GpuDiscoverer {
if (device.trim().length() > 0) {
String[] splitByColon = device.trim().split(":");
if (splitByColon.length != 2) {
throw GpuDeviceSpecificationException.
createWithWrongValueSpecified(device, devices);
// In fail-fast mode this throws. Otherwise the malformed entry must really
// be skipped: parsing it below would either read splitByColon[1] on a
// one-element array or silently accept the first two fields of a longer one.
throwIfNecessary(GpuDeviceSpecificationException
    .createWithWrongValueSpecified(device, devices), conf);
LOG.warn("Wrong GPU specification string {}, ignored", device);
continue;
}
GpuDevice gpuDevice;
try {
gpuDevice = parseGpuDevice(splitByColon);
} catch (NumberFormatException e) {
throwIfNecessary(GpuDeviceSpecificationException
.createWithWrongValueSpecified(device, devices, e), conf);
LOG.warn("Cannot parse GPU device numbers: {}", device);
continue;
}
GpuDevice gpuDevice = parseGpuDevice(device, splitByColon, devices);
if (!gpuDevices.contains(gpuDevice)) {
gpuDevices.add(gpuDevice);
} else {
throw GpuDeviceSpecificationException
.createWithDuplicateValueSpecified(device, devices);
// A device listed twice is an error in fail-fast mode; otherwise the
// repeated entry is only reported (the first occurrence was already added).
throwIfNecessary(GpuDeviceSpecificationException
    .createWithDuplicateValueSpecified(device, devices), conf);
LOG.warn("GPU device is duplicated: {}", device);
}
}
}
@ -237,22 +242,18 @@ public class GpuDiscoverer {
return gpuDevices;
}
private GpuDevice parseGpuDevice(String device, String[] splitByColon,
String allowedDevicesStr) throws YarnException {
try {
private GpuDevice parseGpuDevice(String[] splitByColon) {
int index = Integer.parseInt(splitByColon[0]);
int minorNumber = Integer.parseInt(splitByColon[1]);
return new GpuDevice(index, minorNumber);
} catch (NumberFormatException e) {
throw GpuDeviceSpecificationException.
createWithWrongValueSpecified(device, allowedDevicesStr, e);
}
}
public synchronized void initialize(Configuration config)
throws YarnException {
public synchronized void initialize(Configuration config,
NvidiaBinaryHelper nvidiaHelper) throws YarnException {
this.conf = config;
if (IsAutoDiscoveryEnabled()) {
this.nvidiaBinaryHelper = nvidiaHelper;
if (isAutoDiscoveryEnabled()) {
numOfErrorExecutionSinceLastSucceed = 0;
lookUpAutoDiscoveryBinary(config);
@ -286,7 +287,18 @@ public class GpuDiscoverer {
binaryPath = handleConfiguredBinaryPathIsDirectory(configuredBinaryFile);
} else {
binaryPath = configuredBinaryFile;
// If path exists but file name is incorrect don't execute the file.
// The check must fire when the name DIFFERS from the expected binary name;
// a file that is correctly named is the valid case and must pass silently.
String fileName = binaryPath.getName();
if (!DEFAULT_BINARY_NAME.equals(fileName)) {
  String msg = String.format("Please check the configuration value of"
      +" %s. It should point to an %s binary.",
      YarnConfiguration.NM_GPU_PATH_TO_EXEC,
      DEFAULT_BINARY_NAME);
  // Throws in fail-fast mode; otherwise the problem is only logged.
  throwIfNecessary(new YarnException(msg), config);
  LOG.warn(msg);
}
}
pathOfGpuBinary = binaryPath.getAbsolutePath();
}

View File

@ -18,6 +18,9 @@
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourcesExceptionUtil.throwIfNecessary;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.api.records.ResourceInformation;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
@ -36,9 +39,12 @@ public class GpuNodeResourceUpdateHandler extends NodeResourceUpdaterPlugin {
private static final Logger LOG =
LoggerFactory.getLogger(GpuNodeResourceUpdateHandler.class);
private final GpuDiscoverer gpuDiscoverer;
private Configuration conf;
public GpuNodeResourceUpdateHandler(GpuDiscoverer gpuDiscoverer) {
public GpuNodeResourceUpdateHandler(GpuDiscoverer gpuDiscoverer,
Configuration conf) {
this.gpuDiscoverer = gpuDiscoverer;
this.conf = conf;
}
@Override
@ -51,7 +57,8 @@ public class GpuNodeResourceUpdateHandler extends NodeResourceUpdaterPlugin {
"but could not find any usable GPUs on the NodeManager!";
LOG.error(message);
// No gpu can be used by YARN.
throw new YarnException(message);
throwIfNecessary(new YarnException(message), conf);
return;
}
long nUsableGpus = usableGpus.size();
@ -59,7 +66,7 @@ public class GpuNodeResourceUpdateHandler extends NodeResourceUpdaterPlugin {
Map<String, ResourceInformation> configuredResourceTypes =
ResourceUtils.getResourceTypes();
if (!configuredResourceTypes.containsKey(GPU_URI)) {
throw new YarnException("Found " + nUsableGpus + " usable GPUs, however "
LOG.warn("Found " + nUsableGpus + " usable GPUs, however "
+ GPU_URI
+ " resource-type is not configured inside"
+ " resource-types.xml, please configure it to enable GPU feature or"

View File

@ -18,6 +18,8 @@
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
import java.util.List;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.server.nodemanager.Context;
@ -32,8 +34,6 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.NMResourceInfo;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.NMGpuResourceInfo;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -44,6 +44,10 @@ public class GpuResourcePlugin implements ResourcePlugin {
private final GpuNodeResourceUpdateHandler resourceDiscoverHandler;
private final GpuDiscoverer gpuDiscoverer;
public static final int MAX_REPEATED_ERROR_ALLOWED = 10;
private int numOfErrorExecutionSinceLastSucceed = 0;
private GpuResourceHandlerImpl gpuResourceHandler = null;
private DockerCommandPlugin dockerCommandPlugin = null;
@ -55,7 +59,8 @@ public class GpuResourcePlugin implements ResourcePlugin {
@Override
public void initialize(Context context) throws YarnException {
this.gpuDiscoverer.initialize(context.getConf());
this.gpuDiscoverer.initialize(context.getConf(),
new NvidiaBinaryHelper());
this.dockerCommandPlugin =
GpuDockerCommandPluginFactory.createGpuDockerCommandPlugin(
context.getConf());
@ -89,12 +94,21 @@ public class GpuResourcePlugin implements ResourcePlugin {
@Override
public synchronized NMResourceInfo getNMResourceInfo() throws YarnException {
GpuDeviceInformation gpuDeviceInformation =
gpuDiscoverer.getGpuDeviceInformation();
GpuDeviceInformation gpuDeviceInformation;
//At this point the gpu plugin is already enabled
checkGpuResourceHandler();
checkErrorCount();
try{
gpuDeviceInformation = gpuDiscoverer.getGpuDeviceInformation();
numOfErrorExecutionSinceLastSucceed = 0;
} catch (YarnException e) {
LOG.error(e.getMessage(), e);
numOfErrorExecutionSinceLastSucceed++;
throw e;
}
GpuResourceAllocator gpuResourceAllocator =
gpuResourceHandler.getGpuAllocator();
List<GpuDevice> totalGpus = gpuResourceAllocator.getAllowedGpus();
@ -116,6 +130,17 @@ public class GpuResourcePlugin implements ResourcePlugin {
}
}
/**
 * Refuses further GPU discovery once the number of failed executions since
 * the last success has reached MAX_REPEATED_ERROR_ALLOWED.
 *
 * @throws YarnException if the failure counter equals
 *           MAX_REPEATED_ERROR_ALLOWED
 */
private void checkErrorCount() throws YarnException {
  if (numOfErrorExecutionSinceLastSucceed != MAX_REPEATED_ERROR_ALLOWED) {
    // Limit not reached yet, discovery may be attempted.
    return;
  }

  final String errorMessage =
      "Failed to execute GPU device information detection script for "
      + MAX_REPEATED_ERROR_ALLOWED
      + " times, skip following executions.";
  LOG.error(errorMessage);
  throw new YarnException(errorMessage);
}
@Override
public String toString() {
return GpuResourcePlugin.class.getName();

View File

@ -0,0 +1,63 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
import java.io.IOException;
import java.util.HashMap;
import org.apache.hadoop.util.Shell;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformationParser;
/**
 * Runs the "nvidia-smi" binary and turns what it prints into a
 * {@link GpuDeviceInformation} object.
 */
public class NvidiaBinaryHelper {

  /** Timeout handed to the command execution: 10 seconds, in milliseconds. */
  private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000;

  /**
   * Executes the given binary with the "-x" and "-q" arguments, using an
   * empty environment map, and parses the captured output as XML.
   *
   * @param pathOfGpuBinary path of the binary to execute
   * @return the GpuDeviceInformation parsed from the command output
   * @throws IOException propagated from executing the command
   * @throws YarnException if pathOfGpuBinary is null, or propagated from
   *           parsing the output
   */
  synchronized GpuDeviceInformation getGpuDeviceInformation(
      String pathOfGpuBinary) throws IOException, YarnException {
    // The parser is created before the argument check, as it always was.
    GpuDeviceInformationParser xmlParser = new GpuDeviceInformationParser();

    if (pathOfGpuBinary == null) {
      String reason =
          "Failed to find GPU discovery executable, please double check "
          + YarnConfiguration.NM_GPU_PATH_TO_EXEC + " setting.";
      throw new YarnException(reason);
    }

    String[] command = {pathOfGpuBinary, "-x", "-q"};
    String commandOutput =
        Shell.execCommand(new HashMap<>(), command, MAX_EXEC_TIMEOUT_MS);
    return xmlParser.parseXml(commandOutput);
  }
}

View File

@ -41,6 +41,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resource
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDevice;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.NvidiaBinaryHelper;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerRuntimeConstants;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService;
@ -116,11 +117,13 @@ public class TestGpuResourceHandler {
@Rule
public ExpectedException expected = ExpectedException.none();
private NvidiaBinaryHelper nvidiaBinaryHelper;
@Before
public void setup() throws IOException {
createTestDataDirectory();
TestResourceUtils.addNewTypesToResources(ResourceInformation.GPU_URI);
nvidiaBinaryHelper = new NvidiaBinaryHelper();
mockCGroupsHandler = mock(CGroupsHandler.class);
mockPrivilegedExecutor = mock(PrivilegedOperationExecutor.class);
@ -146,13 +149,14 @@ public class TestGpuResourceHandler {
/**
 * Deletes the test data directory after each test and replaces the
 * NvidiaBinaryHelper field with a new instance.
 */
@After
public void cleanupTestFiles() throws IOException {
FileUtils.deleteDirectory(testDataDirectory);
// NOTE(review): setup() also assigns a new NvidiaBinaryHelper before each
// test, so this reassignment looks redundant - confirm before removing.
nvidiaBinaryHelper = new NvidiaBinaryHelper();
}
@Test
public void testBootstrapWithRealGpuDiscoverer() throws Exception {
Configuration conf = createDefaultConfig();
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0");
gpuDiscoverer.initialize(conf);
gpuDiscoverer.initialize(conf, nvidiaBinaryHelper);
gpuResourceHandler.bootstrap(conf);
@ -170,7 +174,7 @@ public class TestGpuResourceHandler {
public void testBootstrapWithMockGpuDiscoverer() throws Exception {
GpuDiscoverer mockDiscoverer = mock(GpuDiscoverer.class);
Configuration conf = new YarnConfiguration();
mockDiscoverer.initialize(conf);
mockDiscoverer.initialize(conf, nvidiaBinaryHelper);
expected.expect(ResourceHandlerException.class);
gpuResourceHandler.bootstrap(conf);
@ -270,7 +274,7 @@ public class TestGpuResourceHandler {
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
gpuDiscoverer = new GpuDiscoverer();
gpuDiscoverer.initialize(conf);
gpuDiscoverer.initialize(conf, nvidiaBinaryHelper);
Context nmContext = createMockNmContext(conf);
gpuResourceHandler = new GpuResourceHandlerImpl(nmContext,
mockCGroupsHandler, mockPrivilegedExecutor, gpuDiscoverer);
@ -379,7 +383,7 @@ public class TestGpuResourceHandler {
public void testAllocationWithoutAllowedGpus() throws Exception {
Configuration conf = createDefaultConfig();
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, " ");
gpuDiscoverer.initialize(conf);
gpuDiscoverer.initialize(conf, nvidiaBinaryHelper);
try {
gpuResourceHandler.bootstrap(conf);
@ -460,7 +464,7 @@ public class TestGpuResourceHandler {
new GpuResourceHandlerImpl(nmnctx, mockCGroupsHandler,
mockPrivilegedExecutor, gpuDiscoverer);
gpuDiscoverer.initialize(conf);
gpuDiscoverer.initialize(conf, nvidiaBinaryHelper);
gpuNULLStateResourceHandler.bootstrap(conf);
verifyNumberOfAvailableGpus(4, gpuNULLStateResourceHandler);

View File

@ -64,6 +64,7 @@ public class TestGpuDiscoverer {
private static final String BASH_SHEBANG = "#!/bin/bash\n\n";
private static final String TEST_PARENT_DIR = new File("target/temp/" +
TestGpuDiscoverer.class.getName()).getAbsolutePath();
private NvidiaBinaryHelper binaryHelper = new NvidiaBinaryHelper();
@Rule
public ExpectedException exception = ExpectedException.none();
@ -150,7 +151,7 @@ public class TestGpuDiscoverer {
Configuration conf) throws YarnException {
conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, TEST_PARENT_DIR);
GpuDiscoverer discoverer = new GpuDiscoverer();
discoverer.initialize(conf);
discoverer.initialize(conf, binaryHelper);
return discoverer;
}
@ -163,14 +164,14 @@ public class TestGpuDiscoverer {
// test case 1, check default setting.
Configuration conf = new Configuration(false);
GpuDiscoverer discoverer = new GpuDiscoverer();
discoverer.initialize(conf);
discoverer.initialize(conf, binaryHelper);
assertEquals(DEFAULT_BINARY_NAME, discoverer.getPathOfGpuBinary());
assertNvidiaIsOnPath(discoverer);
// test case 2, check mandatory set path.
File fakeBinary = setupFakeBinary(conf);
discoverer = new GpuDiscoverer();
discoverer.initialize(conf);
discoverer.initialize(conf, binaryHelper);
assertEquals(fakeBinary.getAbsolutePath(),
discoverer.getPathOfGpuBinary());
assertNull(discoverer.getEnvironmentToRunCommand().get(PATH));
@ -179,7 +180,7 @@ public class TestGpuDiscoverer {
// but binary doesn't exist so default path will be used.
fakeBinary.delete();
discoverer = new GpuDiscoverer();
discoverer.initialize(conf);
discoverer.initialize(conf, binaryHelper);
assertEquals(DEFAULT_BINARY_NAME,
discoverer.getPathOfGpuBinary());
assertNvidiaIsOnPath(discoverer);
@ -310,14 +311,14 @@ public class TestGpuDiscoverer {
}
@Test
public void testGpuDiscover() throws YarnException {
public void testGpuDiscover() throws YarnException, IOException {
// Since this is more of a performance unit test, only run if
// runGpuDiscoverUnitTest is set (-DrunGpuDiscoverUnitTest=true)
Assume.assumeTrue(
Boolean.valueOf(System.getProperty("runGpuDiscoverUnitTest")));
Configuration conf = new Configuration(false);
GpuDiscoverer discoverer = new GpuDiscoverer();
discoverer.initialize(conf);
discoverer.initialize(conf, binaryHelper);
GpuDeviceInformation info = discoverer.getGpuDeviceInformation();
assertTrue(info.getGpus().size() > 0);
@ -331,7 +332,7 @@ public class TestGpuDiscoverer {
Configuration conf = createConfigWithAllowedDevices("1:2");
GpuDiscoverer discoverer = new GpuDiscoverer();
discoverer.initialize(conf);
discoverer.initialize(conf, binaryHelper);
List<GpuDevice> usableGpuDevices = discoverer.getGpusUsableByYarn();
assertEquals(1, usableGpuDevices.size());
@ -346,7 +347,7 @@ public class TestGpuDiscoverer {
exception.expect(GpuDeviceSpecificationException.class);
GpuDiscoverer discoverer = new GpuDiscoverer();
discoverer.initialize(conf);
discoverer.initialize(conf, binaryHelper);
discoverer.getGpusUsableByYarn();
}
@ -354,7 +355,7 @@ public class TestGpuDiscoverer {
public void testGetNumberOfUsableGpusFromConfig() throws YarnException {
Configuration conf = createConfigWithAllowedDevices("0:0,1:1,2:2,3:4");
GpuDiscoverer discoverer = new GpuDiscoverer();
discoverer.initialize(conf);
discoverer.initialize(conf, binaryHelper);
List<GpuDevice> usableGpuDevices = discoverer.getGpusUsableByYarn();
assertEquals(4, usableGpuDevices.size());
@ -379,7 +380,7 @@ public class TestGpuDiscoverer {
exception.expect(GpuDeviceSpecificationException.class);
GpuDiscoverer discoverer = new GpuDiscoverer();
discoverer.initialize(conf);
discoverer.initialize(conf, binaryHelper);
discoverer.getGpusUsableByYarn();
}
@ -390,7 +391,7 @@ public class TestGpuDiscoverer {
exception.expect(GpuDeviceSpecificationException.class);
GpuDiscoverer discoverer = new GpuDiscoverer();
discoverer.initialize(conf);
discoverer.initialize(conf, binaryHelper);
discoverer.getGpusUsableByYarn();
}
@ -401,7 +402,7 @@ public class TestGpuDiscoverer {
exception.expect(GpuDeviceSpecificationException.class);
GpuDiscoverer discoverer = new GpuDiscoverer();
discoverer.initialize(conf);
discoverer.initialize(conf, binaryHelper);
discoverer.getGpusUsableByYarn();
}
@ -412,7 +413,7 @@ public class TestGpuDiscoverer {
exception.expect(GpuDeviceSpecificationException.class);
GpuDiscoverer discoverer = new GpuDiscoverer();
discoverer.initialize(conf);
discoverer.initialize(conf, binaryHelper);
discoverer.getGpusUsableByYarn();
}
@ -423,7 +424,7 @@ public class TestGpuDiscoverer {
exception.expect(GpuDeviceSpecificationException.class);
GpuDiscoverer discoverer = new GpuDiscoverer();
discoverer.initialize(conf);
discoverer.initialize(conf, binaryHelper);
discoverer.getGpusUsableByYarn();
}
@ -434,7 +435,7 @@ public class TestGpuDiscoverer {
exception.expect(GpuDeviceSpecificationException.class);
GpuDiscoverer discoverer = new GpuDiscoverer();
discoverer.initialize(conf);
discoverer.initialize(conf, binaryHelper);
discoverer.getGpusUsableByYarn();
}
@ -445,7 +446,7 @@ public class TestGpuDiscoverer {
exception.expect(GpuDeviceSpecificationException.class);
GpuDiscoverer discoverer = new GpuDiscoverer();
discoverer.initialize(conf);
discoverer.initialize(conf, binaryHelper);
discoverer.getGpusUsableByYarn();
}
@ -456,7 +457,7 @@ public class TestGpuDiscoverer {
exception.expect(GpuDeviceSpecificationException.class);
GpuDiscoverer discoverer = new GpuDiscoverer();
discoverer.initialize(conf);
discoverer.initialize(conf, binaryHelper);
discoverer.getGpusUsableByYarn();
}
@ -467,7 +468,7 @@ public class TestGpuDiscoverer {
exception.expect(GpuDeviceSpecificationException.class);
GpuDiscoverer discoverer = new GpuDiscoverer();
discoverer.initialize(conf);
discoverer.initialize(conf, binaryHelper);
discoverer.getGpusUsableByYarn();
}
@ -478,7 +479,7 @@ public class TestGpuDiscoverer {
exception.expect(GpuDeviceSpecificationException.class);
GpuDiscoverer discoverer = new GpuDiscoverer();
discoverer.initialize(conf);
discoverer.initialize(conf, binaryHelper);
discoverer.getGpusUsableByYarn();
}
@ -488,7 +489,7 @@ public class TestGpuDiscoverer {
conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, "/blabla");
GpuDiscoverer plugin = new GpuDiscoverer();
try {
plugin.initialize(conf);
plugin.initialize(conf, binaryHelper);
plugin.getGpusUsableByYarn();
fail("Illegal format, should fail.");
} catch (YarnException e) {
@ -501,13 +502,13 @@ public class TestGpuDiscoverer {
}
@Test
public void testScriptNotCalled() throws YarnException {
public void testScriptNotCalled() throws YarnException, IOException {
Configuration conf = new Configuration();
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:1,2:3");
GpuDiscoverer gpuSpy = spy(new GpuDiscoverer());
gpuSpy.initialize(conf);
gpuSpy.initialize(conf, binaryHelper);
gpuSpy.getGpusUsableByYarn();
verify(gpuSpy, never()).getGpuDeviceInformation();