From c968568c589ab0f87982b7e08b941acb15a9c9f6 Mon Sep 17 00:00:00 2001 From: Sunil G Date: Tue, 8 Jan 2019 07:12:11 +0530 Subject: [PATCH] YARN-8822. Nvidia-docker v2 support for YARN GPU feature. (Charo Zhang via Sunil Govindan) --- .../hadoop-yarn/conf/container-executor.cfg | 1 + .../hadoop/yarn/conf/YarnConfiguration.java | 3 + .../runtime/docker/DockerRunCommand.java | 5 + .../gpu/GpuDockerCommandPluginFactory.java | 4 + .../gpu/NvidiaDockerV2CommandPlugin.java | 111 +++++++++++++++ .../impl/utils/docker-util.c | 20 +++ .../impl/utils/docker-util.h | 3 +- .../test/utils/test_docker_util.cc | 62 +++++++++ .../runtime/docker/TestDockerRunCommand.java | 5 +- .../gpu/TestNvidiaDockerV2CommandPlugin.java | 130 ++++++++++++++++++ .../src/site/markdown/DockerContainers.md | 1 + .../src/site/markdown/UsingGpus.md | 9 +- 12 files changed, 351 insertions(+), 3 deletions(-) create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaDockerV2CommandPlugin.java create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestNvidiaDockerV2CommandPlugin.java diff --git a/hadoop-yarn-project/hadoop-yarn/conf/container-executor.cfg b/hadoop-yarn-project/hadoop-yarn/conf/container-executor.cfg index d19874f2421..4df53df6892 100644 --- a/hadoop-yarn-project/hadoop-yarn/conf/container-executor.cfg +++ b/hadoop-yarn-project/hadoop-yarn/conf/container-executor.cfg @@ -16,6 +16,7 @@ feature.tc.enabled=false # docker.privileged-containers.enabled=false # docker.allowed.volume-drivers=## comma seperated list of allowed volume-drivers # docker.no-new-privileges.enabled=## enable/disable the no-new-privileges flag for docker run. 
Set to "true" to enable, disabled by default +# docker.allowed.runtimes=## comma separated runtimes that can be used. # The configs below deal with settings for FPGA resource #[fpga] diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index 95861d7fbfb..5f0ad9a0058 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -1635,6 +1635,9 @@ public class YarnConfiguration extends Configuration { @Private public static final String NVIDIA_DOCKER_V1 = "nvidia-docker-v1"; + @Private + public static final String NVIDIA_DOCKER_V2 = "nvidia-docker-v2"; + @Private public static final String DEFAULT_NM_GPU_DOCKER_PLUGIN_IMPL = NVIDIA_DOCKER_V1; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/docker/DockerRunCommand.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/docker/DockerRunCommand.java index 395c1e173ee..061cab13a5a 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/docker/DockerRunCommand.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/docker/DockerRunCommand.java @@ -159,6 +159,11 @@ public class DockerRunCommand extends DockerCommand { return this; } + public DockerRunCommand
addRuntime(String runtime) { + super.addCommandArguments("runtime", runtime); + return this; + } + public DockerRunCommand groupAdd(String[] groups) { super.addCommandArguments("group-add", String.join(",", groups)); return this; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDockerCommandPluginFactory.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDockerCommandPluginFactory.java index db4589a1fb5..051afd6c561 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDockerCommandPluginFactory.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDockerCommandPluginFactory.java @@ -34,6 +34,10 @@ public class GpuDockerCommandPluginFactory { if (impl.equals(YarnConfiguration.NVIDIA_DOCKER_V1)) { return new NvidiaDockerV1CommandPlugin(conf); } + // nvidia-docker2 + if (impl.equals(YarnConfiguration.NVIDIA_DOCKER_V2)) { + return new NvidiaDockerV2CommandPlugin(); + } throw new YarnException( "Unkown implementation name for Gpu docker plugin, impl=" + impl); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaDockerV2CommandPlugin.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaDockerV2CommandPlugin.java new file mode 100644 index 00000000000..ff25eb6ced6 --- 
/dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaDockerV2CommandPlugin.java @@ -0,0 +1,111 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu; + +import com.google.common.annotations.VisibleForTesting; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.yarn.api.records.ResourceInformation; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu.GpuResourceAllocator; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.docker.DockerRunCommand; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.docker.DockerVolumeCommand; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.DockerCommandPlugin; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerExecutionException; + +import java.io.Serializable; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +/** + * Implementation to use nvidia-docker v2 as GPU docker command plugin. 
+ */ +public class NvidiaDockerV2CommandPlugin implements DockerCommandPlugin { + final static Log LOG = LogFactory.getLog(NvidiaDockerV2CommandPlugin.class); + + private String nvidiaRuntime = "nvidia"; + private String nvidiaVisibleDevices = "NVIDIA_VISIBLE_DEVICES"; + + public NvidiaDockerV2CommandPlugin() {} + + private Set<GpuDevice> getAssignedGpus(Container container) { + ResourceMappings resourceMappings = container.getResourceMappings(); + + // Copy of assigned Resources + Set<GpuDevice> assignedResources = null; + if (resourceMappings != null) { + assignedResources = new HashSet<>(); + for (Serializable s : resourceMappings.getAssignedResources( + ResourceInformation.GPU_URI)) { + assignedResources.add((GpuDevice) s); + } + } + if (assignedResources == null || assignedResources.isEmpty()) { + // When no GPU resource assigned, don't need to update docker command. + return Collections.emptySet(); + } + return assignedResources; + } + + @VisibleForTesting + protected boolean requestsGpu(Container container) { + return GpuResourceAllocator.getRequestedGpus(container.getResource()) > 0; + } + + @Override + public synchronized void updateDockerRunCommand( + DockerRunCommand dockerRunCommand, Container container) + throws ContainerExecutionException { + if (!requestsGpu(container)) { + return; + } + Set<GpuDevice> assignedResources = getAssignedGpus(container); + if (assignedResources == null || assignedResources.isEmpty()) { + return; + } + Map<String, String> environment = new HashMap<>(); + String gpuIndexList = ""; + for (GpuDevice gpuDevice : assignedResources) { + gpuIndexList = gpuIndexList + gpuDevice.getIndex() + ","; + LOG.info("nvidia docker2 assigned gpu index: " + gpuDevice.getIndex()); + } + dockerRunCommand.addRuntime(nvidiaRuntime); + environment.put(nvidiaVisibleDevices, + gpuIndexList.substring(0, gpuIndexList.length() - 1)); + dockerRunCommand.addEnv(environment); + } + + @Override + public DockerVolumeCommand getCreateDockerVolumeCommand(Container container) + throws
ContainerExecutionException { + // No Volume needed for nvidia-docker2. + return null; + } + + @Override + public DockerVolumeCommand getCleanupDockerVolumesCommand(Container container) + throws ContainerExecutionException { + // No cleanup needed. + return null; + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/utils/docker-util.c b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/utils/docker-util.c index 69f27ba661f..548430bf30d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/utils/docker-util.c +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/utils/docker-util.c @@ -342,6 +342,8 @@ const char *get_docker_error_message(const int error_code) { return "Docker image is not trusted"; case INVALID_DOCKER_TMPFS_MOUNT: return "Invalid docker tmpfs mount"; + case INVALID_DOCKER_RUNTIME: + return "Invalid docker runtime"; default: return "Unknown error"; } @@ -883,6 +885,19 @@ static int set_network(const struct configuration *command_config, return ret; } +static int set_runtime(const struct configuration *command_config, + const struct configuration *conf, args *args) { + int ret = 0; + ret = add_param_to_command_if_allowed(command_config, conf, "runtime", + "docker.allowed.runtimes", "--runtime=", + 0, 0, args); + if (ret != 0) { + fprintf(ERRORFILE, "Could not find requested runtime in allowed runtimes\n"); + ret = INVALID_DOCKER_RUNTIME; + } + return ret; +} + static int set_pid_namespace(const struct configuration *command_config, const struct configuration *conf, args *args) { char *value = get_configuration_value("pid", DOCKER_COMMAND_FILE_SECTION, @@ -1527,6 +1542,11 @@ int get_docker_run_command(const char *command_file, const struct 
configuration goto free_and_exit; } + ret = set_runtime(&command_config, conf, args); + if (ret != 0) { + goto free_and_exit; + } + ret = set_hostname(&command_config, args); if (ret != 0) { goto free_and_exit; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/utils/docker-util.h b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/utils/docker-util.h index 7b7322dc5d8..0b281cc791c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/utils/docker-util.h +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/utils/docker-util.h @@ -67,7 +67,8 @@ enum docker_error_codes { PID_HOST_DISABLED, INVALID_PID_NAMESPACE, INVALID_DOCKER_IMAGE_TRUST, - INVALID_DOCKER_TMPFS_MOUNT + INVALID_DOCKER_TMPFS_MOUNT, + INVALID_DOCKER_RUNTIME }; /** diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/utils/test_docker_util.cc b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/utils/test_docker_util.cc index b289857ed24..dba1947b537 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/utils/test_docker_util.cc +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/utils/test_docker_util.cc @@ -432,6 +432,68 @@ namespace ContainerExecutor { run_docker_run_helper_function(file_cmd_vec, set_hostname); } + TEST_F(TestDockerUtil, test_set_runtime) { + struct configuration container_cfg; + struct args buff = ARGS_INITIAL_VALUE; + int ret = 0; + std::string 
container_executor_cfg_contents = "[docker]\n" + " docker.trusted.registries=hadoop\n" + " docker.allowed.runtimes=lxc,nvidia"; + std::vector<std::pair<std::string, std::string> > file_cmd_vec; + file_cmd_vec.push_back(std::make_pair( + "[docker-command-execution]\n docker-command=run\n image=hadoop/image\n runtime=lxc", "--runtime=lxc")); + file_cmd_vec.push_back(std::make_pair( + "[docker-command-execution]\n docker-command=run\n image=hadoop/image\n runtime=nvidia", "--runtime=nvidia")); + file_cmd_vec.push_back(std::make_pair( + "[docker-command-execution]\n docker-command=run", "")); + write_container_executor_cfg(container_executor_cfg_contents); + ret = read_config(container_executor_cfg_file.c_str(), &container_cfg); + + std::vector<std::pair<std::string, std::string> >::const_iterator itr; + if (ret != 0) { + FAIL(); + } + for (itr = file_cmd_vec.begin(); itr != file_cmd_vec.end(); ++itr) { + struct configuration cmd_cfg; + write_command_file(itr->first); + ret = read_config(docker_command_file.c_str(), &cmd_cfg); + if (ret != 0) { + FAIL(); + } + ret = set_runtime(&cmd_cfg, &container_cfg, &buff); + char *actual = flatten(&buff); + ASSERT_EQ(0, ret) << "error message: " << get_docker_error_message(ret) << " for input " << itr->first; + ASSERT_STREQ(itr->second.c_str(), actual); + reset_args(&buff); + free(actual); + free_configuration(&cmd_cfg); + } + struct configuration cmd_cfg_1; + write_command_file("[docker-command-execution]\n docker-command=run\n runtime=nvidia1"); + ret = read_config(docker_command_file.c_str(), &cmd_cfg_1); + if (ret != 0) { + FAIL(); + } + ret = set_runtime(&cmd_cfg_1, &container_cfg, &buff); + ASSERT_EQ(INVALID_DOCKER_RUNTIME, ret); + ASSERT_EQ(0, buff.length); + reset_args(&buff); + free_configuration(&container_cfg); + + container_executor_cfg_contents = "[docker]\n"; + write_container_executor_cfg(container_executor_cfg_contents); + ret = read_config(container_executor_cfg_file.c_str(), &container_cfg); + if (ret != 0) { + FAIL(); + } + ret = set_runtime(&cmd_cfg_1, &container_cfg, &buff); +
ASSERT_EQ(INVALID_DOCKER_RUNTIME, ret); + ASSERT_EQ(0, buff.length); + reset_args(&buff); + free_configuration(&cmd_cfg_1); + free_configuration(&container_cfg); + } + TEST_F(TestDockerUtil, test_set_group_add) { std::vector<std::pair<std::string, std::string> > file_cmd_vec; file_cmd_vec.push_back(std::make_pair( diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/docker/TestDockerRunCommand.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/docker/TestDockerRunCommand.java index 8dc37d4b1f1..23483d37807 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/docker/TestDockerRunCommand.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/docker/TestDockerRunCommand.java @@ -59,6 +59,7 @@ public class TestDockerRunCommand { dockerRunCommand.setOverrideCommandWithArgs(commands); dockerRunCommand.removeContainerOnExit(); dockerRunCommand.addTmpfsMount("/run"); + dockerRunCommand.addRuntime("nvidia"); assertEquals("run", StringUtils.join(",", dockerRunCommand.getDockerCommandWithArguments() @@ -79,7 +80,9 @@ public class TestDockerRunCommand { .get("launch-command"))); assertEquals("/run", StringUtils.join(",", dockerRunCommand.getDockerCommandWithArguments().get("tmpfs"))); - assertEquals(8, dockerRunCommand.getDockerCommandWithArguments().size()); + assertEquals("nvidia", StringUtils.join(",", + dockerRunCommand.getDockerCommandWithArguments().get("runtime"))); + assertEquals(9, dockerRunCommand.getDockerCommandWithArguments().size()); } @Test diff --git
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestNvidiaDockerV2CommandPlugin.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestNvidiaDockerV2CommandPlugin.java new file mode 100644 index 00000000000..b0b523360ef --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestNvidiaDockerV2CommandPlugin.java @@ -0,0 +1,130 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Sets; +import org.apache.hadoop.yarn.api.records.ResourceInformation; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.docker.DockerRunCommand; +import org.junit.Assert; +import org.junit.Test; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +/** + * test for NvidiaDockerV2CommandPlugin. 
+ */ +public class TestNvidiaDockerV2CommandPlugin { + private Map<String, List<String>> copyCommandLine( + Map<String, List<String>> map) { + Map<String, List<String>> ret = new HashMap<>(); + for (Map.Entry<String, List<String>> entry : map.entrySet()) { + ret.put(entry.getKey(), new ArrayList<>(entry.getValue())); + } + return ret; + } + + private boolean commandlinesEquals(Map<String, List<String>> cli1, + Map<String, List<String>> cli2) { + if (!Sets.symmetricDifference(cli1.keySet(), cli2.keySet()).isEmpty()) { + return false; + } + + for (String key : cli1.keySet()) { + List<String> value1 = cli1.get(key); + List<String> value2 = cli2.get(key); + if (!value1.equals(value2)) { + return false; + } + } + + return true; + } + + static class MyNvidiaDockerV2CommandPlugin + extends NvidiaDockerV2CommandPlugin { + private boolean requestsGpu = false; + + MyNvidiaDockerV2CommandPlugin() {} + + public void setRequestsGpu(boolean r) { + requestsGpu = r; + } + + @Override + protected boolean requestsGpu(Container container) { + return requestsGpu; + } + } + + @Test + public void testPlugin() throws Exception { + DockerRunCommand runCommand = new DockerRunCommand("container_1", "user", + "fakeimage"); + + Map<String, List<String>> originalCommandline = copyCommandLine( + runCommand.getDockerCommandWithArguments()); + + MyNvidiaDockerV2CommandPlugin + commandPlugin = new MyNvidiaDockerV2CommandPlugin(); + + Container nmContainer = mock(Container.class); + + // getResourceMapping is null, so commandline won't be updated + commandPlugin.updateDockerRunCommand(runCommand, nmContainer); + Assert.assertTrue(commandlinesEquals(originalCommandline, + runCommand.getDockerCommandWithArguments())); + + // no GPU resource assigned, so commandline won't be updated + ResourceMappings resourceMappings = new ResourceMappings(); + when(nmContainer.getResourceMappings()).thenReturn(resourceMappings); + commandPlugin.updateDockerRunCommand(runCommand, nmContainer); + Assert.assertTrue(commandlinesEquals(originalCommandline, + runCommand.getDockerCommandWithArguments())); + + // Assign GPU resource + ResourceMappings.AssignedResources assigned = +
new ResourceMappings.AssignedResources(); + assigned.updateAssignedResources( + ImmutableList.of(new GpuDevice(0, 0), new GpuDevice(1, 1))); + resourceMappings.addAssignedResources(ResourceInformation.GPU_URI, + assigned); + + commandPlugin.setRequestsGpu(true); + commandPlugin.updateDockerRunCommand(runCommand, nmContainer); + Map<String, List<String>> newCommandLine = + runCommand.getDockerCommandWithArguments(); + + // Command line will be updated + Assert.assertFalse(commandlinesEquals(originalCommandline, newCommandLine)); + // NVIDIA_VISIBLE_DEVICES will be set + Assert.assertTrue( + runCommand.getEnv().get("NVIDIA_VISIBLE_DEVICES").equals("0,1")); + // runtime should exist + Assert.assertTrue(newCommandLine.containsKey("runtime")); + } +} \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/DockerContainers.md b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/DockerContainers.md index 17a335ef422..acdf04dce8c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/DockerContainers.md +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/DockerContainers.md @@ -274,6 +274,7 @@ are allowed. It contains the following properties: | `docker.trusted.registries` | Comma separated list of trusted docker registries for running trusted privileged docker containers. By default, no registries are defined. | | `docker.inspect.max.retries` | Integer value to check docker container readiness. Each inspection is set with 3 seconds delay. Default value of 10 will wait 30 seconds for docker container to become ready before marked as container failed. | | `docker.no-new-privileges.enabled` | Enable/disable the no-new-privileges flag for docker run. Set to "true" to enable, disabled by default. | +| `docker.allowed.runtimes` | Comma separated runtimes that containers are allowed to use. 
By default no runtimes are allowed to be added.| Please note that if you wish to run Docker containers that require access to the YARN local directories, you must add them to the docker.allowed.rw-mounts list. diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/UsingGpus.md b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/UsingGpus.md index f6000e7c35e..85412af88e4 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/UsingGpus.md +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/UsingGpus.md @@ -107,7 +107,7 @@ Following configs can be customized when user needs to run GPU applications insi | --- | --- | | yarn.nodemanager.resource-plugins.gpu.docker-plugin | nvidia-docker-v1 | -Specify docker command plugin for GPU. By default uses Nvidia docker V1.0. +Specify docker command plugin for GPU. By default uses Nvidia docker V1.0; `nvidia-docker-v2` is available for V2.x. | Property | Default value | | --- | --- | @@ -169,6 +169,13 @@ docker.allowed.volume-drivers ... docker.allowed.ro-mounts=nvidia_driver_375.66 ``` +**4) If using `nvidia-docker-v2` as the GPU docker plugin, add `nvidia` to the runtimes whitelist.** + +``` +[docker] +... +docker.allowed.runtimes=nvidia +``` # Use it