YARN-8822. Nvidia-docker v2 support for YARN GPU feature. (Charo Zhang via Sunil Govindan)
This commit is contained in:
parent
af589262c8
commit
c968568c58
|
@ -16,6 +16,7 @@ feature.tc.enabled=false
|
||||||
# docker.privileged-containers.enabled=false
|
# docker.privileged-containers.enabled=false
|
||||||
# docker.allowed.volume-drivers=## comma separated list of allowed volume-drivers
|
# docker.allowed.volume-drivers=## comma separated list of allowed volume-drivers
|
||||||
# docker.no-new-privileges.enabled=## enable/disable the no-new-privileges flag for docker run. Set to "true" to enable, disabled by default
|
# docker.no-new-privileges.enabled=## enable/disable the no-new-privileges flag for docker run. Set to "true" to enable, disabled by default
|
||||||
|
# docker.allowed.runtimes=## comma separated runtimes that can be used.
|
||||||
|
|
||||||
# The configs below deal with settings for FPGA resource
|
# The configs below deal with settings for FPGA resource
|
||||||
#[fpga]
|
#[fpga]
|
||||||
|
|
|
@ -1635,6 +1635,9 @@ public class YarnConfiguration extends Configuration {
|
||||||
@Private
|
@Private
|
||||||
public static final String NVIDIA_DOCKER_V1 = "nvidia-docker-v1";
|
public static final String NVIDIA_DOCKER_V1 = "nvidia-docker-v1";
|
||||||
|
|
||||||
|
@Private
|
||||||
|
public static final String NVIDIA_DOCKER_V2 = "nvidia-docker-v2";
|
||||||
|
|
||||||
@Private
|
@Private
|
||||||
public static final String DEFAULT_NM_GPU_DOCKER_PLUGIN_IMPL =
|
public static final String DEFAULT_NM_GPU_DOCKER_PLUGIN_IMPL =
|
||||||
NVIDIA_DOCKER_V1;
|
NVIDIA_DOCKER_V1;
|
||||||
|
|
|
@ -159,6 +159,11 @@ public class DockerRunCommand extends DockerCommand {
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public DockerRunCommand addRuntime(String runtime) {
|
||||||
|
super.addCommandArguments("runtime", runtime);
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
public DockerRunCommand groupAdd(String[] groups) {
|
public DockerRunCommand groupAdd(String[] groups) {
|
||||||
super.addCommandArguments("group-add", String.join(",", groups));
|
super.addCommandArguments("group-add", String.join(",", groups));
|
||||||
return this;
|
return this;
|
||||||
|
|
|
@ -34,6 +34,10 @@ public class GpuDockerCommandPluginFactory {
|
||||||
if (impl.equals(YarnConfiguration.NVIDIA_DOCKER_V1)) {
|
if (impl.equals(YarnConfiguration.NVIDIA_DOCKER_V1)) {
|
||||||
return new NvidiaDockerV1CommandPlugin(conf);
|
return new NvidiaDockerV1CommandPlugin(conf);
|
||||||
}
|
}
|
||||||
|
// nvidia-docker2
|
||||||
|
if (impl.equals(YarnConfiguration.NVIDIA_DOCKER_V2)) {
|
||||||
|
return new NvidiaDockerV2CommandPlugin();
|
||||||
|
}
|
||||||
|
|
||||||
throw new YarnException(
|
throw new YarnException(
|
||||||
"Unkown implementation name for Gpu docker plugin, impl=" + impl);
|
"Unkown implementation name for Gpu docker plugin, impl=" + impl);
|
||||||
|
|
|
@ -0,0 +1,111 @@
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
* <p>
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* <p>
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
|
||||||
|
|
||||||
|
import com.google.common.annotations.VisibleForTesting;
|
||||||
|
import org.apache.commons.logging.Log;
|
||||||
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
import org.apache.hadoop.yarn.api.records.ResourceInformation;
|
||||||
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
|
||||||
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings;
|
||||||
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu.GpuResourceAllocator;
|
||||||
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.docker.DockerRunCommand;
|
||||||
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.docker.DockerVolumeCommand;
|
||||||
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.DockerCommandPlugin;
|
||||||
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerExecutionException;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Implementation to use nvidia-docker v2 as GPU docker command plugin.
|
||||||
|
*/
|
||||||
|
public class NvidiaDockerV2CommandPlugin implements DockerCommandPlugin {
|
||||||
|
final static Log LOG = LogFactory.getLog(NvidiaDockerV2CommandPlugin.class);
|
||||||
|
|
||||||
|
private String nvidiaRuntime = "nvidia";
|
||||||
|
private String nvidiaVisibleDevices = "NVIDIA_VISIBLE_DEVICES";
|
||||||
|
|
||||||
|
public NvidiaDockerV2CommandPlugin() {}
|
||||||
|
|
||||||
|
private Set<GpuDevice> getAssignedGpus(Container container) {
|
||||||
|
ResourceMappings resourceMappings = container.getResourceMappings();
|
||||||
|
|
||||||
|
// Copy of assigned Resources
|
||||||
|
Set<GpuDevice> assignedResources = null;
|
||||||
|
if (resourceMappings != null) {
|
||||||
|
assignedResources = new HashSet<>();
|
||||||
|
for (Serializable s : resourceMappings.getAssignedResources(
|
||||||
|
ResourceInformation.GPU_URI)) {
|
||||||
|
assignedResources.add((GpuDevice) s);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (assignedResources == null || assignedResources.isEmpty()) {
|
||||||
|
// When no GPU resource assigned, don't need to update docker command.
|
||||||
|
return Collections.emptySet();
|
||||||
|
}
|
||||||
|
return assignedResources;
|
||||||
|
}
|
||||||
|
|
||||||
|
@VisibleForTesting
|
||||||
|
protected boolean requestsGpu(Container container) {
|
||||||
|
return GpuResourceAllocator.getRequestedGpus(container.getResource()) > 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public synchronized void updateDockerRunCommand(
|
||||||
|
DockerRunCommand dockerRunCommand, Container container)
|
||||||
|
throws ContainerExecutionException {
|
||||||
|
if (!requestsGpu(container)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
Set<GpuDevice> assignedResources = getAssignedGpus(container);
|
||||||
|
if (assignedResources == null || assignedResources.isEmpty()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
Map<String, String> environment = new HashMap<>();
|
||||||
|
String gpuIndexList = "";
|
||||||
|
for (GpuDevice gpuDevice : assignedResources) {
|
||||||
|
gpuIndexList = gpuIndexList + gpuDevice.getIndex() + ",";
|
||||||
|
LOG.info("nvidia docker2 assigned gpu index: " + gpuDevice.getIndex());
|
||||||
|
}
|
||||||
|
dockerRunCommand.addRuntime(nvidiaRuntime);
|
||||||
|
environment.put(nvidiaVisibleDevices,
|
||||||
|
gpuIndexList.substring(0, gpuIndexList.length() - 1));
|
||||||
|
dockerRunCommand.addEnv(environment);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public DockerVolumeCommand getCreateDockerVolumeCommand(Container container)
|
||||||
|
throws ContainerExecutionException {
|
||||||
|
// No Volume needed for nvidia-docker2.
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public DockerVolumeCommand getCleanupDockerVolumesCommand(Container container)
|
||||||
|
throws ContainerExecutionException {
|
||||||
|
// No cleanup needed.
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
|
@ -342,6 +342,8 @@ const char *get_docker_error_message(const int error_code) {
|
||||||
return "Docker image is not trusted";
|
return "Docker image is not trusted";
|
||||||
case INVALID_DOCKER_TMPFS_MOUNT:
|
case INVALID_DOCKER_TMPFS_MOUNT:
|
||||||
return "Invalid docker tmpfs mount";
|
return "Invalid docker tmpfs mount";
|
||||||
|
case INVALID_DOCKER_RUNTIME:
|
||||||
|
return "Invalid docker runtime";
|
||||||
default:
|
default:
|
||||||
return "Unknown error";
|
return "Unknown error";
|
||||||
}
|
}
|
||||||
|
@ -883,6 +885,19 @@ static int set_network(const struct configuration *command_config,
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Append a "--runtime=" argument for the "runtime" key of the command file,
 * checked against the "docker.allowed.runtimes" setting.
 *
 * Returns 0 when add_param_to_command_if_allowed succeeds; otherwise logs to
 * ERRORFILE and returns INVALID_DOCKER_RUNTIME.
 */
static int set_runtime(const struct configuration *command_config,
                       const struct configuration *conf, args *args) {
  int status = add_param_to_command_if_allowed(command_config, conf, "runtime",
                                               "docker.allowed.runtimes",
                                               "--runtime=", 0, 0, args);
  if (status == 0) {
    return 0;
  }
  fprintf(ERRORFILE, "Could not find requested runtime in allowed runtimes\n");
  return INVALID_DOCKER_RUNTIME;
}
|
||||||
|
|
||||||
static int set_pid_namespace(const struct configuration *command_config,
|
static int set_pid_namespace(const struct configuration *command_config,
|
||||||
const struct configuration *conf, args *args) {
|
const struct configuration *conf, args *args) {
|
||||||
char *value = get_configuration_value("pid", DOCKER_COMMAND_FILE_SECTION,
|
char *value = get_configuration_value("pid", DOCKER_COMMAND_FILE_SECTION,
|
||||||
|
@ -1527,6 +1542,11 @@ int get_docker_run_command(const char *command_file, const struct configuration
|
||||||
goto free_and_exit;
|
goto free_and_exit;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ret = set_runtime(&command_config, conf, args);
|
||||||
|
if (ret != 0) {
|
||||||
|
goto free_and_exit;
|
||||||
|
}
|
||||||
|
|
||||||
ret = set_hostname(&command_config, args);
|
ret = set_hostname(&command_config, args);
|
||||||
if (ret != 0) {
|
if (ret != 0) {
|
||||||
goto free_and_exit;
|
goto free_and_exit;
|
||||||
|
|
|
@ -67,7 +67,8 @@ enum docker_error_codes {
|
||||||
PID_HOST_DISABLED,
|
PID_HOST_DISABLED,
|
||||||
INVALID_PID_NAMESPACE,
|
INVALID_PID_NAMESPACE,
|
||||||
INVALID_DOCKER_IMAGE_TRUST,
|
INVALID_DOCKER_IMAGE_TRUST,
|
||||||
INVALID_DOCKER_TMPFS_MOUNT
|
INVALID_DOCKER_TMPFS_MOUNT,
|
||||||
|
INVALID_DOCKER_RUNTIME
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -432,6 +432,68 @@ namespace ContainerExecutor {
|
||||||
run_docker_run_helper_function(file_cmd_vec, set_hostname);
|
run_docker_run_helper_function(file_cmd_vec, set_hostname);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST_F(TestDockerUtil, test_set_runtime) {
  // (command file contents, argument string set_runtime is expected to emit)
  typedef std::pair<std::string, std::string> cmd_expectation;

  struct configuration container_cfg;
  struct args buff = ARGS_INITIAL_VALUE;
  int ret = 0;

  // Executor config that allows the "lxc" and "nvidia" runtimes.
  std::string container_executor_cfg_contents = "[docker]\n"
      " docker.trusted.registries=hadoop\n"
      " docker.allowed.runtimes=lxc,nvidia";

  std::vector<cmd_expectation> file_cmd_vec;
  file_cmd_vec.push_back(cmd_expectation(
      "[docker-command-execution]\n docker-command=run\n image=hadoop/image\n runtime=lxc",
      "--runtime=lxc"));
  file_cmd_vec.push_back(cmd_expectation(
      "[docker-command-execution]\n docker-command=run\n image=hadoop/image\n runtime=nvidia",
      "--runtime=nvidia"));
  // No runtime key in the command file: nothing should be emitted.
  file_cmd_vec.push_back(cmd_expectation(
      "[docker-command-execution]\n docker-command=run", ""));

  write_container_executor_cfg(container_executor_cfg_contents);
  ret = read_config(container_executor_cfg_file.c_str(), &container_cfg);
  if (ret != 0) {
    FAIL();
  }

  // Accepted cases: set_runtime returns 0 and produces the expected argument.
  for (std::vector<cmd_expectation>::const_iterator it = file_cmd_vec.begin();
       it != file_cmd_vec.end(); ++it) {
    struct configuration cmd_cfg;
    write_command_file(it->first);
    ret = read_config(docker_command_file.c_str(), &cmd_cfg);
    if (ret != 0) {
      FAIL();
    }
    ret = set_runtime(&cmd_cfg, &container_cfg, &buff);
    char *actual = flatten(&buff);
    ASSERT_EQ(0, ret) << "error message: " << get_docker_error_message(ret) << " for input " << it->first;
    ASSERT_STREQ(it->second.c_str(), actual);
    reset_args(&buff);
    free(actual);
    free_configuration(&cmd_cfg);
  }

  // "nvidia1" is not in the allowed list: expect rejection and no arguments.
  struct configuration rejected_cmd_cfg;
  write_command_file("[docker-command-execution]\n docker-command=run\n runtime=nvidia1");
  ret = read_config(docker_command_file.c_str(), &rejected_cmd_cfg);
  if (ret != 0) {
    FAIL();
  }
  ret = set_runtime(&rejected_cmd_cfg, &container_cfg, &buff);
  ASSERT_EQ(INVALID_DOCKER_RUNTIME, ret);
  ASSERT_EQ(0, buff.length);
  reset_args(&buff);
  free_configuration(&container_cfg);

  // Same command against a config with an empty [docker] section: still
  // rejected, still no arguments.
  container_executor_cfg_contents = "[docker]\n";
  write_container_executor_cfg(container_executor_cfg_contents);
  ret = read_config(container_executor_cfg_file.c_str(), &container_cfg);
  if (ret != 0) {
    FAIL();
  }
  ret = set_runtime(&rejected_cmd_cfg, &container_cfg, &buff);
  ASSERT_EQ(INVALID_DOCKER_RUNTIME, ret);
  ASSERT_EQ(0, buff.length);
  reset_args(&buff);
  free_configuration(&rejected_cmd_cfg);
  free_configuration(&container_cfg);
}
|
||||||
|
|
||||||
TEST_F(TestDockerUtil, test_set_group_add) {
|
TEST_F(TestDockerUtil, test_set_group_add) {
|
||||||
std::vector<std::pair<std::string, std::string> > file_cmd_vec;
|
std::vector<std::pair<std::string, std::string> > file_cmd_vec;
|
||||||
file_cmd_vec.push_back(std::make_pair<std::string, std::string>(
|
file_cmd_vec.push_back(std::make_pair<std::string, std::string>(
|
||||||
|
|
|
@ -59,6 +59,7 @@ public class TestDockerRunCommand {
|
||||||
dockerRunCommand.setOverrideCommandWithArgs(commands);
|
dockerRunCommand.setOverrideCommandWithArgs(commands);
|
||||||
dockerRunCommand.removeContainerOnExit();
|
dockerRunCommand.removeContainerOnExit();
|
||||||
dockerRunCommand.addTmpfsMount("/run");
|
dockerRunCommand.addTmpfsMount("/run");
|
||||||
|
dockerRunCommand.addRuntime("nvidia");
|
||||||
|
|
||||||
assertEquals("run", StringUtils.join(",",
|
assertEquals("run", StringUtils.join(",",
|
||||||
dockerRunCommand.getDockerCommandWithArguments()
|
dockerRunCommand.getDockerCommandWithArguments()
|
||||||
|
@ -79,7 +80,9 @@ public class TestDockerRunCommand {
|
||||||
.get("launch-command")));
|
.get("launch-command")));
|
||||||
assertEquals("/run", StringUtils.join(",",
|
assertEquals("/run", StringUtils.join(",",
|
||||||
dockerRunCommand.getDockerCommandWithArguments().get("tmpfs")));
|
dockerRunCommand.getDockerCommandWithArguments().get("tmpfs")));
|
||||||
assertEquals(8, dockerRunCommand.getDockerCommandWithArguments().size());
|
assertEquals("nvidia", StringUtils.join(",",
|
||||||
|
dockerRunCommand.getDockerCommandWithArguments().get("runtime")));
|
||||||
|
assertEquals(9, dockerRunCommand.getDockerCommandWithArguments().size());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|
|
@ -0,0 +1,130 @@
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
* <p>
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* <p>
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
|
||||||
|
|
||||||
|
import com.google.common.collect.ImmutableList;
|
||||||
|
import com.google.common.collect.Sets;
|
||||||
|
import org.apache.hadoop.yarn.api.records.ResourceInformation;
|
||||||
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
|
||||||
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings;
|
||||||
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.docker.DockerRunCommand;
|
||||||
|
import org.junit.Assert;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import static org.mockito.Mockito.mock;
|
||||||
|
import static org.mockito.Mockito.when;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* test for NvidiaDockerV2CommandPlugin.
|
||||||
|
*/
|
||||||
|
public class TestNvidiaDockerV2CommandPlugin {
|
||||||
|
private Map<String, List<String>> copyCommandLine(
|
||||||
|
Map<String, List<String>> map) {
|
||||||
|
Map<String, List<String>> ret = new HashMap<>();
|
||||||
|
for (Map.Entry<String, List<String>> entry : map.entrySet()) {
|
||||||
|
ret.put(entry.getKey(), new ArrayList<>(entry.getValue()));
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean commandlinesEquals(Map<String, List<String>> cli1,
|
||||||
|
Map<String, List<String>> cli2) {
|
||||||
|
if (!Sets.symmetricDifference(cli1.keySet(), cli2.keySet()).isEmpty()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (String key : cli1.keySet()) {
|
||||||
|
List<String> value1 = cli1.get(key);
|
||||||
|
List<String> value2 = cli2.get(key);
|
||||||
|
if (!value1.equals(value2)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
static class MyNvidiaDockerV2CommandPlugin
|
||||||
|
extends NvidiaDockerV2CommandPlugin {
|
||||||
|
private boolean requestsGpu = false;
|
||||||
|
|
||||||
|
MyNvidiaDockerV2CommandPlugin() {}
|
||||||
|
|
||||||
|
public void setRequestsGpu(boolean r) {
|
||||||
|
requestsGpu = r;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected boolean requestsGpu(Container container) {
|
||||||
|
return requestsGpu;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testPlugin() throws Exception {
|
||||||
|
DockerRunCommand runCommand = new DockerRunCommand("container_1", "user",
|
||||||
|
"fakeimage");
|
||||||
|
|
||||||
|
Map<String, List<String>> originalCommandline = copyCommandLine(
|
||||||
|
runCommand.getDockerCommandWithArguments());
|
||||||
|
|
||||||
|
MyNvidiaDockerV2CommandPlugin
|
||||||
|
commandPlugin = new MyNvidiaDockerV2CommandPlugin();
|
||||||
|
|
||||||
|
Container nmContainer = mock(Container.class);
|
||||||
|
|
||||||
|
// getResourceMapping is null, so commandline won't be updated
|
||||||
|
commandPlugin.updateDockerRunCommand(runCommand, nmContainer);
|
||||||
|
Assert.assertTrue(commandlinesEquals(originalCommandline,
|
||||||
|
runCommand.getDockerCommandWithArguments()));
|
||||||
|
|
||||||
|
// no GPU resource assigned, so commandline won't be updated
|
||||||
|
ResourceMappings resourceMappings = new ResourceMappings();
|
||||||
|
when(nmContainer.getResourceMappings()).thenReturn(resourceMappings);
|
||||||
|
commandPlugin.updateDockerRunCommand(runCommand, nmContainer);
|
||||||
|
Assert.assertTrue(commandlinesEquals(originalCommandline,
|
||||||
|
runCommand.getDockerCommandWithArguments()));
|
||||||
|
|
||||||
|
// Assign GPU resource
|
||||||
|
ResourceMappings.AssignedResources assigned =
|
||||||
|
new ResourceMappings.AssignedResources();
|
||||||
|
assigned.updateAssignedResources(
|
||||||
|
ImmutableList.of(new GpuDevice(0, 0), new GpuDevice(1, 1)));
|
||||||
|
resourceMappings.addAssignedResources(ResourceInformation.GPU_URI,
|
||||||
|
assigned);
|
||||||
|
|
||||||
|
commandPlugin.setRequestsGpu(true);
|
||||||
|
commandPlugin.updateDockerRunCommand(runCommand, nmContainer);
|
||||||
|
Map<String, List<String>> newCommandLine =
|
||||||
|
runCommand.getDockerCommandWithArguments();
|
||||||
|
|
||||||
|
// Command line will be updated
|
||||||
|
Assert.assertFalse(commandlinesEquals(originalCommandline, newCommandLine));
|
||||||
|
// NVIDIA_VISIBLE_DEVICES will be set
|
||||||
|
Assert.assertTrue(
|
||||||
|
runCommand.getEnv().get("NVIDIA_VISIBLE_DEVICES").equals("0,1"));
|
||||||
|
// runtime should exist
|
||||||
|
Assert.assertTrue(newCommandLine.containsKey("runtime"));
|
||||||
|
}
|
||||||
|
}
|
|
@ -274,6 +274,7 @@ are allowed. It contains the following properties:
|
||||||
| `docker.trusted.registries` | Comma separated list of trusted docker registries for running trusted privileged docker containers. By default, no registries are defined. |
|
| `docker.trusted.registries` | Comma separated list of trusted docker registries for running trusted privileged docker containers. By default, no registries are defined. |
|
||||||
| `docker.inspect.max.retries` | Integer value to check docker container readiness. Each inspection is set with 3 seconds delay. Default value of 10 will wait 30 seconds for docker container to become ready before marked as container failed. |
|
| `docker.inspect.max.retries` | Integer value to check docker container readiness. Each inspection is set with 3 seconds delay. Default value of 10 will wait 30 seconds for docker container to become ready before marked as container failed. |
|
||||||
| `docker.no-new-privileges.enabled` | Enable/disable the no-new-privileges flag for docker run. Set to "true" to enable, disabled by default. |
|
| `docker.no-new-privileges.enabled` | Enable/disable the no-new-privileges flag for docker run. Set to "true" to enable, disabled by default. |
|
||||||
|
| `docker.allowed.runtimes` | Comma separated runtimes that containers are allowed to use. By default no runtimes are allowed to be added. |
|
||||||
|
|
||||||
Please note that if you wish to run Docker containers that require access to the YARN local directories, you must add them to the docker.allowed.rw-mounts list.
|
Please note that if you wish to run Docker containers that require access to the YARN local directories, you must add them to the docker.allowed.rw-mounts list.
|
||||||
|
|
||||||
|
|
|
@ -107,7 +107,7 @@ Following configs can be customized when user needs to run GPU applications insi
|
||||||
| --- | --- |
|
| --- | --- |
|
||||||
| yarn.nodemanager.resource-plugins.gpu.docker-plugin | nvidia-docker-v1 |
|
| yarn.nodemanager.resource-plugins.gpu.docker-plugin | nvidia-docker-v1 |
|
||||||
|
|
||||||
Specify docker command plugin for GPU. By default uses Nvidia docker V1.0.
|
Specify docker command plugin for GPU. By default uses Nvidia docker V1.0; `nvidia-docker-v2` is available for V2.x.
|
||||||
|
|
||||||
| Property | Default value |
|
| Property | Default value |
|
||||||
| --- | --- |
|
| --- | --- |
|
||||||
|
@ -169,6 +169,13 @@ docker.allowed.volume-drivers
|
||||||
...
|
...
|
||||||
docker.allowed.ro-mounts=nvidia_driver_375.66
|
docker.allowed.ro-mounts=nvidia_driver_375.66
|
||||||
```
|
```
|
||||||
|
**4) If using `nvidia-docker-v2` as the GPU docker plugin, add `nvidia` to the runtimes whitelist.**
|
||||||
|
|
||||||
|
```
|
||||||
|
[docker]
|
||||||
|
...
|
||||||
|
docker.allowed.runtimes=nvidia
|
||||||
|
```
|
||||||
|
|
||||||
# Use it
|
# Use it
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue