From 0a01d499175569ffac9d3f31d980820ecac7e60b Mon Sep 17 00:00:00 2001 From: Wangda Tan Date: Mon, 7 Jan 2019 12:07:26 -0800 Subject: [PATCH] YARN-8822. Nvidia-docker v2 support for YARN GPU feature. (Charo Zhang via wangda) Change-Id: Id8af27134d3286a7a10d85eda9be25df9689d0e7 --- .../hadoop-yarn/conf/container-executor.cfg | 1 + .../hadoop/yarn/conf/YarnConfiguration.java | 3 + .../runtime/docker/DockerRunCommand.java | 5 + .../gpu/GpuDockerCommandPluginFactory.java | 4 + .../gpu/NvidiaDockerV2CommandPlugin.java | 111 +++++++++++++++ .../impl/utils/docker-util.c | 20 +++ .../impl/utils/docker-util.h | 3 +- .../test/utils/test_docker_util.cc | 62 +++++++++ .../runtime/docker/TestDockerRunCommand.java | 5 +- .../gpu/TestNvidiaDockerV2CommandPlugin.java | 130 ++++++++++++++++++ .../src/site/markdown/DockerContainers.md | 1 + .../src/site/markdown/UsingGpus.md | 9 +- 12 files changed, 351 insertions(+), 3 deletions(-) create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaDockerV2CommandPlugin.java create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestNvidiaDockerV2CommandPlugin.java diff --git a/hadoop-yarn-project/hadoop-yarn/conf/container-executor.cfg b/hadoop-yarn-project/hadoop-yarn/conf/container-executor.cfg index d19874f2421..4df53df6892 100644 --- a/hadoop-yarn-project/hadoop-yarn/conf/container-executor.cfg +++ b/hadoop-yarn-project/hadoop-yarn/conf/container-executor.cfg @@ -16,6 +16,7 @@ feature.tc.enabled=false # docker.privileged-containers.enabled=false # docker.allowed.volume-drivers=## comma seperated list of allowed volume-drivers # docker.no-new-privileges.enabled=## enable/disable the no-new-privileges flag for docker run. 
Set to "true" to enable, disabled by default +# docker.allowed.runtimes=## comma separated runtimes that can be used. # The configs below deal with settings for FPGA resource #[fpga] diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index 08f7f1a18a9..c29707c82ee 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -1664,6 +1664,9 @@ public class YarnConfiguration extends Configuration { @Private public static final String NVIDIA_DOCKER_V1 = "nvidia-docker-v1"; + @Private + public static final String NVIDIA_DOCKER_V2 = "nvidia-docker-v2"; + @Private public static final String DEFAULT_NM_GPU_DOCKER_PLUGIN_IMPL = NVIDIA_DOCKER_V1; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/docker/DockerRunCommand.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/docker/DockerRunCommand.java index f4f3a9c4fac..b0603a3a220 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/docker/DockerRunCommand.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/docker/DockerRunCommand.java @@ -165,6 +165,11 @@ public class DockerRunCommand extends DockerCommand { return this; } + public DockerRunCommand 
addRuntime(String runtime) { + super.addCommandArguments("runtime", runtime); + return this; + } + public DockerRunCommand groupAdd(String[] groups) { super.addCommandArguments("group-add", String.join(",", groups)); return this; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDockerCommandPluginFactory.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDockerCommandPluginFactory.java index db4589a1fb5..051afd6c561 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDockerCommandPluginFactory.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDockerCommandPluginFactory.java @@ -34,6 +34,10 @@ public class GpuDockerCommandPluginFactory { if (impl.equals(YarnConfiguration.NVIDIA_DOCKER_V1)) { return new NvidiaDockerV1CommandPlugin(conf); } + // nvidia-docker2 + if (impl.equals(YarnConfiguration.NVIDIA_DOCKER_V2)) { + return new NvidiaDockerV2CommandPlugin(); + } throw new YarnException( "Unkown implementation name for Gpu docker plugin, impl=" + impl); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaDockerV2CommandPlugin.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaDockerV2CommandPlugin.java new file mode 100644 index 00000000000..ff25eb6ced6 --- 
/dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaDockerV2CommandPlugin.java @@ -0,0 +1,111 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu; + +import com.google.common.annotations.VisibleForTesting; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.yarn.api.records.ResourceInformation; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu.GpuResourceAllocator; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.docker.DockerRunCommand; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.docker.DockerVolumeCommand; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.DockerCommandPlugin; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerExecutionException; + +import java.io.Serializable; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +/** + * Implementation to use nvidia-docker v2 as GPU docker command plugin. 
+ */ +public class NvidiaDockerV2CommandPlugin implements DockerCommandPlugin { + final static Log LOG = LogFactory.getLog(NvidiaDockerV2CommandPlugin.class); + + private String nvidiaRuntime = "nvidia"; + private String nvidiaVisibleDevices = "NVIDIA_VISIBLE_DEVICES"; + + public NvidiaDockerV2CommandPlugin() {} + + private Set<GpuDevice> getAssignedGpus(Container container) { + ResourceMappings resourceMappings = container.getResourceMappings(); + + // Copy of assigned Resources + Set<GpuDevice> assignedResources = null; + if (resourceMappings != null) { + assignedResources = new HashSet<>(); + for (Serializable s : resourceMappings.getAssignedResources( + ResourceInformation.GPU_URI)) { + assignedResources.add((GpuDevice) s); + } + } + if (assignedResources == null || assignedResources.isEmpty()) { + // When no GPU resource assigned, don't need to update docker command. + return Collections.emptySet(); + } + return assignedResources; + } + + @VisibleForTesting + protected boolean requestsGpu(Container container) { + return GpuResourceAllocator.getRequestedGpus(container.getResource()) > 0; + } + + @Override + public synchronized void updateDockerRunCommand( + DockerRunCommand dockerRunCommand, Container container) + throws ContainerExecutionException { + if (!requestsGpu(container)) { + return; + } + Set<GpuDevice> assignedResources = getAssignedGpus(container); + if (assignedResources == null || assignedResources.isEmpty()) { + return; + } + Map<String, String> environment = new HashMap<>(); + String gpuIndexList = ""; + for (GpuDevice gpuDevice : assignedResources) { + gpuIndexList = gpuIndexList + gpuDevice.getIndex() + ","; + LOG.info("nvidia docker2 assigned gpu index: " + gpuDevice.getIndex()); + } + dockerRunCommand.addRuntime(nvidiaRuntime); // adds the "runtime" argument + environment.put(nvidiaVisibleDevices, + gpuIndexList.substring(0, gpuIndexList.length() - 1)); // trim last ',' + dockerRunCommand.addEnv(environment); + } + + @Override + public DockerVolumeCommand getCreateDockerVolumeCommand(Container container) + throws 
ContainerExecutionException { + // No Volume needed for nvidia-docker2. + return null; + } + + @Override + public DockerVolumeCommand getCleanupDockerVolumesCommand(Container container) + throws ContainerExecutionException { + // No cleanup needed. + return null; + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/utils/docker-util.c b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/utils/docker-util.c index ced7424e569..0a5d2edec90 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/utils/docker-util.c +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/utils/docker-util.c @@ -351,6 +351,8 @@ const char *get_docker_error_message(const int error_code) { return "Docker image is not trusted"; case INVALID_DOCKER_TMPFS_MOUNT: return "Invalid docker tmpfs mount"; + case INVALID_DOCKER_RUNTIME: + return "Invalid docker runtime"; default: return "Unknown error"; } @@ -947,6 +949,19 @@ static int set_network(const struct configuration *command_config, return ret; } +static int set_runtime(const struct configuration *command_config, + const struct configuration *conf, args *args) { /* adds --runtime=<value> for the command's "runtime" key when allowed */ + int ret = 0; + ret = add_param_to_command_if_allowed(command_config, conf, "runtime", + "docker.allowed.runtimes", "--runtime=", + 0, 0, args); + if (ret != 0) { + fprintf(ERRORFILE, "Could not find requested runtime in allowed runtimes\n"); + ret = INVALID_DOCKER_RUNTIME; /* report any helper failure as the runtime-specific error code */ + } + return ret; +} + static int add_ports_mapping_to_command(const struct configuration *command_config, args *args) { int i = 0, ret = 0; char *network_type = (char*) malloc(128); @@ -1654,6 +1669,11 @@ int get_docker_run_command(const char *command_file, const struct configuration goto free_and_exit; } + 
ret = set_runtime(&command_config, conf, args); + if (ret != 0) { + goto free_and_exit; + } + ret = set_hostname(&command_config, args); if (ret != 0) { goto free_and_exit; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/utils/docker-util.h b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/utils/docker-util.h index 3cff565b6c5..3b8922def34 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/utils/docker-util.h +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/utils/docker-util.h @@ -69,7 +69,8 @@ enum docker_error_codes { PID_HOST_DISABLED, INVALID_PID_NAMESPACE, INVALID_DOCKER_IMAGE_TRUST, - INVALID_DOCKER_TMPFS_MOUNT + INVALID_DOCKER_TMPFS_MOUNT, + INVALID_DOCKER_RUNTIME }; /** diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/utils/test_docker_util.cc b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/utils/test_docker_util.cc index 66e987ea30d..6c239d293a2 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/utils/test_docker_util.cc +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/utils/test_docker_util.cc @@ -444,6 +444,68 @@ namespace ContainerExecutor { run_docker_run_helper_function(file_cmd_vec, set_hostname); } + TEST_F(TestDockerUtil, test_set_runtime) { + struct configuration container_cfg; + struct args buff = ARGS_INITIAL_VALUE; + int ret = 0; + std::string container_executor_cfg_contents = "[docker]\n" + " 
docker.trusted.registries=hadoop\n" + " docker.allowed.runtimes=lxc,nvidia"; + std::vector > file_cmd_vec; + file_cmd_vec.push_back(std::make_pair( + "[docker-command-execution]\n docker-command=run\n image=hadoop/image\n runtime=lxc", "--runtime=lxc")); + file_cmd_vec.push_back(std::make_pair( + "[docker-command-execution]\n docker-command=run\n image=hadoop/image\n runtime=nvidia", "--runtime=nvidia")); + file_cmd_vec.push_back(std::make_pair( + "[docker-command-execution]\n docker-command=run", "")); + write_container_executor_cfg(container_executor_cfg_contents); + ret = read_config(container_executor_cfg_file.c_str(), &container_cfg); + + std::vector >::const_iterator itr; + if (ret != 0) { + FAIL(); + } + for (itr = file_cmd_vec.begin(); itr != file_cmd_vec.end(); ++itr) { + struct configuration cmd_cfg; + write_command_file(itr->first); + ret = read_config(docker_command_file.c_str(), &cmd_cfg); + if (ret != 0) { + FAIL(); + } + ret = set_runtime(&cmd_cfg, &container_cfg, &buff); + char *actual = flatten(&buff); + ASSERT_EQ(0, ret) << "error message: " << get_docker_error_message(ret) << " for input " << itr->first; + ASSERT_STREQ(itr->second.c_str(), actual); + reset_args(&buff); + free(actual); + free_configuration(&cmd_cfg); + } + struct configuration cmd_cfg_1; + write_command_file("[docker-command-execution]\n docker-command=run\n runtime=nvidia1"); + ret = read_config(docker_command_file.c_str(), &cmd_cfg_1); + if (ret != 0) { + FAIL(); + } + ret = set_runtime(&cmd_cfg_1, &container_cfg, &buff); + ASSERT_EQ(INVALID_DOCKER_RUNTIME, ret); + ASSERT_EQ(0, buff.length); + reset_args(&buff); + free_configuration(&container_cfg); + + container_executor_cfg_contents = "[docker]\n"; + write_container_executor_cfg(container_executor_cfg_contents); + ret = read_config(container_executor_cfg_file.c_str(), &container_cfg); + if (ret != 0) { + FAIL(); + } + ret = set_runtime(&cmd_cfg_1, &container_cfg, &buff); + ASSERT_EQ(INVALID_DOCKER_RUNTIME, ret); + ASSERT_EQ(0, 
buff.length); + reset_args(&buff); + free_configuration(&cmd_cfg_1); + free_configuration(&container_cfg); + } + TEST_F(TestDockerUtil, test_set_group_add) { std::vector > file_cmd_vec; file_cmd_vec.push_back(std::make_pair( diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/docker/TestDockerRunCommand.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/docker/TestDockerRunCommand.java index d01c184e35a..4ee76c7f876 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/docker/TestDockerRunCommand.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/docker/TestDockerRunCommand.java @@ -63,6 +63,7 @@ public class TestDockerRunCommand { for (String mapping:portsMapping.split(",")) { dockerRunCommand.addPortsMapping(mapping); } + dockerRunCommand.addRuntime("nvidia"); assertEquals("run", StringUtils.join(",", dockerRunCommand.getDockerCommandWithArguments() @@ -86,7 +87,9 @@ public class TestDockerRunCommand { assertEquals("127.0.0.1:8080:80,1234:1234,:2222", StringUtils.join(",", dockerRunCommand.getDockerCommandWithArguments() .get("ports-mapping"))); - assertEquals(9, dockerRunCommand.getDockerCommandWithArguments().size()); + assertEquals("nvidia", StringUtils.join(",", + dockerRunCommand.getDockerCommandWithArguments().get("runtime"))); + assertEquals(10, dockerRunCommand.getDockerCommandWithArguments().size()); } @Test diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestNvidiaDockerV2CommandPlugin.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestNvidiaDockerV2CommandPlugin.java new file mode 100644 index 00000000000..b0b523360ef --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestNvidiaDockerV2CommandPlugin.java @@ -0,0 +1,130 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Sets; +import org.apache.hadoop.yarn.api.records.ResourceInformation; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.docker.DockerRunCommand; +import org.junit.Assert; +import org.junit.Test; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +/** + * test for NvidiaDockerV2CommandPlugin. 
+ */ +public class TestNvidiaDockerV2CommandPlugin { + private Map> copyCommandLine( + Map> map) { + Map> ret = new HashMap<>(); + for (Map.Entry> entry : map.entrySet()) { + ret.put(entry.getKey(), new ArrayList<>(entry.getValue())); + } + return ret; + } + + private boolean commandlinesEquals(Map> cli1, + Map> cli2) { + if (!Sets.symmetricDifference(cli1.keySet(), cli2.keySet()).isEmpty()) { + return false; + } + + for (String key : cli1.keySet()) { + List value1 = cli1.get(key); + List value2 = cli2.get(key); + if (!value1.equals(value2)) { + return false; + } + } + + return true; + } + + static class MyNvidiaDockerV2CommandPlugin + extends NvidiaDockerV2CommandPlugin { + private boolean requestsGpu = false; + + MyNvidiaDockerV2CommandPlugin() {} + + public void setRequestsGpu(boolean r) { + requestsGpu = r; + } + + @Override + protected boolean requestsGpu(Container container) { + return requestsGpu; + } + } + + @Test + public void testPlugin() throws Exception { + DockerRunCommand runCommand = new DockerRunCommand("container_1", "user", + "fakeimage"); + + Map> originalCommandline = copyCommandLine( + runCommand.getDockerCommandWithArguments()); + + MyNvidiaDockerV2CommandPlugin + commandPlugin = new MyNvidiaDockerV2CommandPlugin(); + + Container nmContainer = mock(Container.class); + + // getResourceMapping is null, so commandline won't be updated + commandPlugin.updateDockerRunCommand(runCommand, nmContainer); + Assert.assertTrue(commandlinesEquals(originalCommandline, + runCommand.getDockerCommandWithArguments())); + + // no GPU resource assigned, so commandline won't be updated + ResourceMappings resourceMappings = new ResourceMappings(); + when(nmContainer.getResourceMappings()).thenReturn(resourceMappings); + commandPlugin.updateDockerRunCommand(runCommand, nmContainer); + Assert.assertTrue(commandlinesEquals(originalCommandline, + runCommand.getDockerCommandWithArguments())); + + // Assign GPU resource + ResourceMappings.AssignedResources assigned = + 
new ResourceMappings.AssignedResources(); + assigned.updateAssignedResources( + ImmutableList.of(new GpuDevice(0, 0), new GpuDevice(1, 1))); + resourceMappings.addAssignedResources(ResourceInformation.GPU_URI, + assigned); + + commandPlugin.setRequestsGpu(true); + commandPlugin.updateDockerRunCommand(runCommand, nmContainer); + Map> newCommandLine = + runCommand.getDockerCommandWithArguments(); + + // Command line will be updated + Assert.assertFalse(commandlinesEquals(originalCommandline, newCommandLine)); + // NVIDIA_VISIBLE_DEVICES will be set + Assert.assertTrue( + runCommand.getEnv().get("NVIDIA_VISIBLE_DEVICES").equals("0,1")); + // runtime should exist + Assert.assertTrue(newCommandLine.containsKey("runtime")); + } +} \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/DockerContainers.md b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/DockerContainers.md index 56202992f4e..2a893e4cde8 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/DockerContainers.md +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/DockerContainers.md @@ -274,6 +274,7 @@ are allowed. It contains the following properties: | `docker.trusted.registries` | Comma separated list of trusted docker registries for running trusted privileged docker containers. By default, no registries are defined. | | `docker.inspect.max.retries` | Integer value to check docker container readiness. Each inspection is set with 3 seconds delay. Default value of 10 will wait 30 seconds for docker container to become ready before marked as container failed. | | `docker.no-new-privileges.enabled` | Enable/disable the no-new-privileges flag for docker run. Set to "true" to enable, disabled by default. | +| `docker.allowed.runtimes` | Comma seperated runtimes that containers are allowed to use. 
By default no runtimes are allowed to be added. | Please note that if you wish to run Docker containers that require access to the YARN local directories, you must add them to the docker.allowed.rw-mounts list. diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/UsingGpus.md b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/UsingGpus.md index f6000e7c35e..85412af88e4 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/UsingGpus.md +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/UsingGpus.md @@ -107,7 +107,7 @@ Following configs can be customized when user needs to run GPU applications insi | --- | --- | | yarn.nodemanager.resource-plugins.gpu.docker-plugin | nvidia-docker-v1 | -Specify docker command plugin for GPU. By default uses Nvidia docker V1.0. +Specify docker command plugin for GPU. By default uses Nvidia docker V1.0; `nvidia-docker-v2` is available for V2.x. | Property | Default value | | --- | --- | @@ -169,6 +169,13 @@ docker.allowed.volume-drivers ... docker.allowed.ro-mounts=nvidia_driver_375.66 ``` +**4) If using `nvidia-docker-v2` as the GPU docker plugin, add `nvidia` to the runtimes whitelist.** + +``` +[docker] +... +docker.allowed.runtimes=nvidia +``` # Use it