From 6980f1740fe4037653a4095ed42dfe5b84d24850 Mon Sep 17 00:00:00 2001 From: Szilard Nemeth Date: Wed, 21 Aug 2019 16:49:34 +0200 Subject: [PATCH] YARN-9217. Nodemanager will fail to start if GPU is misconfigured on the node or GPU drivers missing. Contributed by Peter Bacsko --- .../hadoop/yarn/conf/YarnConfiguration.java | 14 +++ .../src/main/resources/yarn-default.xml | 11 ++ .../resources/ResourcesExceptionUtil.java | 42 +++++++ .../resources/gpu/GpuResourceHandlerImpl.java | 13 +- .../resourceplugin/ResourcePluginManager.java | 7 +- .../resourceplugin/gpu/GpuDiscoverer.java | 114 ++++++++++-------- .../gpu/GpuNodeResourceUpdateHandler.java | 13 +- .../resourceplugin/gpu/GpuResourcePlugin.java | 35 +++++- .../gpu/NvidiaBinaryHelper.java | 63 ++++++++++ .../resources/gpu/TestGpuResourceHandler.java | 16 ++- .../resourceplugin/gpu/TestGpuDiscoverer.java | 47 ++++---- 11 files changed, 278 insertions(+), 97 deletions(-) create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourcesExceptionUtil.java create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaBinaryHelper.java diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index 79593eae120..04a70030cdd 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -1612,6 +1612,20 @@ public class YarnConfiguration extends Configuration { public static final String NM_RESOURCE_PLUGINS = NM_PREFIX + "resource-plugins"; + + /** + * Specifies whether the initialization of the Node Manager should continue + * if a certain device (GPU, FPGA, etc) was not found in the system. If set + * to "true", then an exception will be thrown if a device is missing or + * an error occurred during discovery. + */ + @Private + public static final String NM_RESOURCE_PLUGINS_FAIL_FAST = + NM_RESOURCE_PLUGINS + ".fail-fast"; + + @Private + public static final boolean DEFAULT_NM_RESOURCE_PLUGINS_FAIL_FAST = true; + /** * Prefix for gpu configurations. Work in progress: This configuration * parameter may be changed/removed in the future. diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index 887a7c37584..f99977e23b1 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -3798,6 +3798,17 @@ + + + Specifies whether the initialization of the Node Manager should continue + if a certain device (GPU, FPGA, etc) was not found in the system. If set + to "true", then an exception will be thrown if a device is missing or + an error occurred during discovery. + + yarn.nodemanager.resource-plugins.fail-fast + + + Specify GPU devices which can be managed by YARN NodeManager, split by comma diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourcesExceptionUtil.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourcesExceptionUtil.java new file mode 100644 index 00000000000..f270f42440c --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourcesExceptionUtil.java @@ -0,0 +1,42 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources; + +import static org.apache.hadoop.yarn.conf.YarnConfiguration.DEFAULT_NM_RESOURCE_PLUGINS_FAIL_FAST; +import static org.apache.hadoop.yarn.conf.YarnConfiguration.NM_RESOURCE_PLUGINS_FAIL_FAST; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.yarn.exceptions.YarnException; + +/** + * Small utility class which only re-throws YarnException if + * NM_RESOURCE_PLUGINS_FAIL_FAST property is true. + * + */ +public final class ResourcesExceptionUtil { + private ResourcesExceptionUtil() {} + + public static void throwIfNecessary(YarnException e, Configuration conf) + throws YarnException { + if (conf.getBoolean(NM_RESOURCE_PLUGINS_FAIL_FAST, + DEFAULT_NM_RESOURCE_PLUGINS_FAIL_FAST)) { + throw e; + } + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java index bcade9ead4e..00c8a85d2ae 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java @@ -18,6 +18,12 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu; +import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourcesExceptionUtil.throwIfNecessary; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; @@ -36,10 +42,6 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime. import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDevice; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - public class GpuResourceHandlerImpl implements ResourceHandler { final static Log LOG = LogFactory .getLog(GpuResourceHandlerImpl.class); @@ -75,7 +77,8 @@ public class GpuResourceHandlerImpl implements ResourceHandler { String message = "GPU is enabled on the NodeManager, but couldn't find " + "any usable GPU devices, please double check configuration!"; LOG.error(message); - throw new ResourceHandlerException(message); + throwIfNecessary(new ResourceHandlerException(message), + configuration); } } catch (YarnException e) { LOG.error("Exception when trying to get usable GPU device", e); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java index 4ace3ae05a8..9e7652c6fab 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java @@ -60,7 +60,7 @@ public class ResourcePluginManager { Map pluginMap = Maps.newHashMap(); if (plugins != null) { - pluginMap = initializePlugins(context, plugins); + pluginMap = initializePlugins(conf, context, plugins); } configuredPlugins = Collections.unmodifiableMap(pluginMap); @@ -77,8 +77,7 @@ public class ResourcePluginManager { return plugins; } - - private Map initializePlugins( + private Map initializePlugins(Configuration conf, Context context, String[] plugins) throws YarnException { Map pluginMap = Maps.newHashMap(); @@ -91,7 +90,7 @@ public class ResourcePluginManager { if (resourceName.equals(GPU_URI)) { final GpuDiscoverer gpuDiscoverer = new GpuDiscoverer(); final GpuNodeResourceUpdateHandler updateHandler = - new GpuNodeResourceUpdateHandler(gpuDiscoverer); + new GpuNodeResourceUpdateHandler(gpuDiscoverer, conf); plugin = new GpuResourcePlugin(updateHandler, gpuDiscoverer); } else if (resourceName.equals(FPGA_URI)) { plugin = new FpgaResourcePlugin(); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java index ce767229e50..f710ff0bccd 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java @@ -18,21 +18,7 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu; -import com.google.common.annotations.VisibleForTesting; -import com.google.common.collect.ImmutableSet; -import com.google.common.collect.Lists; -import com.google.common.collect.Sets; -import org.apache.hadoop.classification.InterfaceAudience; -import org.apache.hadoop.classification.InterfaceStability; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.util.Shell; -import org.apache.hadoop.yarn.conf.YarnConfiguration; -import org.apache.hadoop.yarn.exceptions.YarnException; -import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation; -import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformationParser; -import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.PerGpuDeviceInformation; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourcesExceptionUtil.throwIfNecessary; import java.io.File; import java.io.IOException; @@ -42,6 +28,22 @@ import java.util.List; import java.util.Map; import java.util.Set; +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.exceptions.YarnException; +import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation; +import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformationParser; +import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.PerGpuDeviceInformation; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Lists; +import com.google.common.collect.Sets; + @InterfaceAudience.Private @InterfaceStability.Unstable @@ -57,11 +59,10 @@ public class GpuDiscoverer { private static final Set DEFAULT_BINARY_SEARCH_DIRS = ImmutableSet.of( "/usr/bin", "/bin", "/usr/local/nvidia/bin"); - // command should not run more than 10 sec. - private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000; private static final int MAX_REPEATED_ERROR_ALLOWED = 10; private Configuration conf = null; + private NvidiaBinaryHelper nvidiaBinaryHelper; private String pathOfGpuBinary = null; private Map environment = new HashMap<>(); @@ -110,24 +111,17 @@ public class GpuDiscoverer { * @return GpuDeviceInformation * @throws YarnException when any error happens */ - synchronized GpuDeviceInformation getGpuDeviceInformation() + public synchronized GpuDeviceInformation getGpuDeviceInformation() throws YarnException { - validateConfOrThrowException(); - if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) { String msg = getErrorMessageOfScriptExecutionThresholdReached(); LOG.error(msg); throw new YarnException(msg); } - String output; try { - output = Shell.execCommand(environment, - new String[] { pathOfGpuBinary, "-x", "-q" }, MAX_EXEC_TIMEOUT_MS); - GpuDeviceInformationParser parser = new GpuDeviceInformationParser(); - lastDiscoveredGpuInformation = parser.parseXml(output); - numOfErrorExecutionSinceLastSucceed = 0; - return lastDiscoveredGpuInformation; + lastDiscoveredGpuInformation = + nvidiaBinaryHelper.getGpuDeviceInformation(pathOfGpuBinary); } catch (IOException e) { numOfErrorExecutionSinceLastSucceed++; String msg = getErrorMessageOfScriptExecution(e.getMessage()); @@ -138,17 +132,18 @@ public class GpuDiscoverer { } catch (YarnException e) { numOfErrorExecutionSinceLastSucceed++; String msg = getFailedToParseErrorMessage(e.getMessage()); - if (LOG.isDebugEnabled()) { - LOG.warn(msg, e); - } + LOG.debug(msg, e); throw e; } + + return lastDiscoveredGpuInformation; } - private boolean IsAutoDiscoveryEnabled() { + private boolean isAutoDiscoveryEnabled() { String allowedDevicesStr = conf.get( YarnConfiguration.NM_GPU_ALLOWED_DEVICES, YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES); + return allowedDevicesStr.equals( YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES); } @@ -157,13 +152,12 @@ public class GpuDiscoverer { * Get list of GPU devices usable by YARN. * * @return List of GPU devices - * @throws YarnException when any issue happens */ public synchronized List getGpusUsableByYarn() throws YarnException { validateConfOrThrowException(); - if (IsAutoDiscoveryEnabled()) { + if (isAutoDiscoveryEnabled()) { return parseGpuDevicesFromAutoDiscoveredGpuInfo(); } else { if (gpuDevicesFromUser == null) { @@ -219,16 +213,27 @@ public class GpuDiscoverer { if (device.trim().length() > 0) { String[] splitByColon = device.trim().split(":"); if (splitByColon.length != 2) { - throw GpuDeviceSpecificationException. - createWithWrongValueSpecified(device, devices); + throwIfNecessary(GpuDeviceSpecificationException + .createWithWrongValueSpecified(device, devices), conf); + LOG.warn("Wrong GPU specification string {}, ignored", device); + } + + GpuDevice gpuDevice; + try { + gpuDevice = parseGpuDevice(splitByColon); + } catch (NumberFormatException e) { + throwIfNecessary(GpuDeviceSpecificationException + .createWithWrongValueSpecified(device, devices, e), conf); + LOG.warn("Cannot parse GPU device numbers: {}", device); + continue; } - GpuDevice gpuDevice = parseGpuDevice(device, splitByColon, devices); if (!gpuDevices.contains(gpuDevice)) { gpuDevices.add(gpuDevice); } else { - throw GpuDeviceSpecificationException - .createWithDuplicateValueSpecified(device, devices); + throwIfNecessary(GpuDeviceSpecificationException + .createWithDuplicateValueSpecified(device, devices), conf); + LOG.warn("CPU device is duplicated: {}", device); } } } @@ -237,22 +242,18 @@ public class GpuDiscoverer { return gpuDevices; } - private GpuDevice parseGpuDevice(String device, String[] splitByColon, - String allowedDevicesStr) throws YarnException { - try { - int index = Integer.parseInt(splitByColon[0]); - int minorNumber = Integer.parseInt(splitByColon[1]); - return new GpuDevice(index, minorNumber); - } catch (NumberFormatException e) { - throw GpuDeviceSpecificationException. - createWithWrongValueSpecified(device, allowedDevicesStr, e); - } + private GpuDevice parseGpuDevice(String[] splitByColon) { + int index = Integer.parseInt(splitByColon[0]); + int minorNumber = Integer.parseInt(splitByColon[1]); + return new GpuDevice(index, minorNumber); } - public synchronized void initialize(Configuration config) - throws YarnException { + + public synchronized void initialize(Configuration config, + NvidiaBinaryHelper nvidiaHelper) throws YarnException { this.conf = config; - if (IsAutoDiscoveryEnabled()) { + this.nvidiaBinaryHelper = nvidiaHelper; + if (isAutoDiscoveryEnabled()) { numOfErrorExecutionSinceLastSucceed = 0; lookUpAutoDiscoveryBinary(config); @@ -286,7 +287,18 @@ public class GpuDiscoverer { binaryPath = handleConfiguredBinaryPathIsDirectory(configuredBinaryFile); } else { binaryPath = configuredBinaryFile; + // If path exists but file name is incorrect don't execute the file + String fileName = binaryPath.getName(); + if (DEFAULT_BINARY_NAME.equals(fileName)) { + String msg = String.format("Please check the configuration value of" + +" %s. It should point to an %s binary.", + YarnConfiguration.NM_GPU_PATH_TO_EXEC, + DEFAULT_BINARY_NAME); + throwIfNecessary(new YarnException(msg), config); + LOG.warn(msg); + } } + pathOfGpuBinary = binaryPath.getAbsolutePath(); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java index 4b2258d557f..afb0d7eda23 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java @@ -18,6 +18,9 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu; +import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourcesExceptionUtil.throwIfNecessary; + +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.api.records.ResourceInformation; import org.apache.hadoop.yarn.conf.YarnConfiguration; @@ -36,9 +39,12 @@ public class GpuNodeResourceUpdateHandler extends NodeResourceUpdaterPlugin { private static final Logger LOG = LoggerFactory.getLogger(GpuNodeResourceUpdateHandler.class); private final GpuDiscoverer gpuDiscoverer; + private Configuration conf; - public GpuNodeResourceUpdateHandler(GpuDiscoverer gpuDiscoverer) { + public GpuNodeResourceUpdateHandler(GpuDiscoverer gpuDiscoverer, + Configuration conf) { this.gpuDiscoverer = gpuDiscoverer; + this.conf = conf; } @Override @@ -51,7 +57,8 @@ public class GpuNodeResourceUpdateHandler extends NodeResourceUpdaterPlugin { "but could not find any usable GPUs on the NodeManager!"; LOG.error(message); // No gpu can be used by YARN. - throw new YarnException(message); + throwIfNecessary(new YarnException(message), conf); + return; } long nUsableGpus = usableGpus.size(); @@ -59,7 +66,7 @@ public class GpuNodeResourceUpdateHandler extends NodeResourceUpdaterPlugin { Map configuredResourceTypes = ResourceUtils.getResourceTypes(); if (!configuredResourceTypes.containsKey(GPU_URI)) { - throw new YarnException("Found " + nUsableGpus + " usable GPUs, however " + LOG.warn("Found " + nUsableGpus + " usable GPUs, however " + GPU_URI + " resource-type is not configured inside" + " resource-types.xml, please configure it to enable GPU feature or" diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java index 2b06f31f37b..d44160e8271 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java @@ -18,6 +18,8 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu; +import java.util.List; + import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.server.nodemanager.Context; @@ -32,8 +34,6 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.NMResourceInfo; import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation; import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.NMGpuResourceInfo; - -import java.util.List; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -44,6 +44,10 @@ public class GpuResourcePlugin implements ResourcePlugin { private final GpuNodeResourceUpdateHandler resourceDiscoverHandler; private final GpuDiscoverer gpuDiscoverer; + public static final int MAX_REPEATED_ERROR_ALLOWED = 10; + + private int numOfErrorExecutionSinceLastSucceed = 0; + private GpuResourceHandlerImpl gpuResourceHandler = null; private DockerCommandPlugin dockerCommandPlugin = null; @@ -55,7 +59,8 @@ public class GpuResourcePlugin implements ResourcePlugin { @Override public void initialize(Context context) throws YarnException { - this.gpuDiscoverer.initialize(context.getConf()); + this.gpuDiscoverer.initialize(context.getConf(), + new NvidiaBinaryHelper()); this.dockerCommandPlugin = GpuDockerCommandPluginFactory.createGpuDockerCommandPlugin( context.getConf()); @@ -89,12 +94,21 @@ public class GpuResourcePlugin implements ResourcePlugin { @Override public synchronized NMResourceInfo getNMResourceInfo() throws YarnException { - GpuDeviceInformation gpuDeviceInformation = - gpuDiscoverer.getGpuDeviceInformation(); + GpuDeviceInformation gpuDeviceInformation; //At this point the gpu plugin is already enabled checkGpuResourceHandler(); + checkErrorCount(); + try{ + gpuDeviceInformation = gpuDiscoverer.getGpuDeviceInformation(); + numOfErrorExecutionSinceLastSucceed = 0; + } catch (YarnException e) { + LOG.error(e.getMessage(), e); + numOfErrorExecutionSinceLastSucceed++; + throw e; + } + GpuResourceAllocator gpuResourceAllocator = gpuResourceHandler.getGpuAllocator(); List totalGpus = gpuResourceAllocator.getAllowedGpus(); @@ -116,6 +130,17 @@ public class GpuResourcePlugin implements ResourcePlugin { } } + private void checkErrorCount() throws YarnException { + if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) { + String msg = + "Failed to execute GPU device information detection script for " + + MAX_REPEATED_ERROR_ALLOWED + + " times, skip following executions."; + LOG.error(msg); + throw new YarnException(msg); + } + } + @Override public String toString() { return GpuResourcePlugin.class.getName(); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaBinaryHelper.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaBinaryHelper.java new file mode 100644 index 00000000000..8efc32a8b13 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaBinaryHelper.java @@ -0,0 +1,63 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu; + +import java.io.IOException; +import java.util.HashMap; + +import org.apache.hadoop.util.Shell; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.exceptions.YarnException; +import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation; +import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformationParser; + +/** + * Executes the "nvidia-smi" command and returns an object + * based on its output. + * + */ +public class NvidiaBinaryHelper { + /** + * command should not run more than 10 sec. + */ + private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000; + + /** + * @param pathOfGpuBinary The path of the binary + * @return the GpuDeviceInformation parsed from the nvidia-smi output + * @throws IOException if the binary output is not readable + * @throws YarnException if the pathOfGpuBinary is null, + * or the output parse failed + */ + synchronized GpuDeviceInformation getGpuDeviceInformation( + String pathOfGpuBinary) throws IOException, YarnException { + GpuDeviceInformationParser parser = new GpuDeviceInformationParser(); + + if (pathOfGpuBinary == null) { + throw new YarnException( + "Failed to find GPU discovery executable, please double check " + + YarnConfiguration.NM_GPU_PATH_TO_EXEC + " setting."); + } + + String output = Shell.execCommand(new HashMap<>(), + new String[]{pathOfGpuBinary, "-x", "-q"}, MAX_EXEC_TIMEOUT_MS); + return parser.parseXml(output); + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java index efd28eed357..5cb508af4ad 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java @@ -41,6 +41,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resource import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDevice; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.NvidiaBinaryHelper; import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerRuntimeConstants; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService; @@ -116,11 +117,13 @@ public class TestGpuResourceHandler { @Rule public ExpectedException expected = ExpectedException.none(); + private NvidiaBinaryHelper nvidiaBinaryHelper; + @Before public void setup() throws IOException { createTestDataDirectory(); - TestResourceUtils.addNewTypesToResources(ResourceInformation.GPU_URI); + nvidiaBinaryHelper = new NvidiaBinaryHelper(); mockCGroupsHandler = mock(CGroupsHandler.class); mockPrivilegedExecutor = mock(PrivilegedOperationExecutor.class); @@ -146,13 +149,14 @@ public class TestGpuResourceHandler { @After public void cleanupTestFiles() throws IOException { FileUtils.deleteDirectory(testDataDirectory); + nvidiaBinaryHelper = new NvidiaBinaryHelper(); } @Test public void testBootstrapWithRealGpuDiscoverer() throws Exception { Configuration conf = createDefaultConfig(); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0"); - gpuDiscoverer.initialize(conf); + gpuDiscoverer.initialize(conf, nvidiaBinaryHelper); gpuResourceHandler.bootstrap(conf); @@ -170,7 +174,7 @@ public class TestGpuResourceHandler { public void testBootstrapWithMockGpuDiscoverer() throws Exception { GpuDiscoverer mockDiscoverer = mock(GpuDiscoverer.class); Configuration conf = new YarnConfiguration(); - mockDiscoverer.initialize(conf); + mockDiscoverer.initialize(conf, nvidiaBinaryHelper); expected.expect(ResourceHandlerException.class); gpuResourceHandler.bootstrap(conf); @@ -270,7 +274,7 @@ public class TestGpuResourceHandler { conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4"); gpuDiscoverer = new GpuDiscoverer(); - gpuDiscoverer.initialize(conf); + gpuDiscoverer.initialize(conf, nvidiaBinaryHelper); Context nmContext = createMockNmContext(conf); gpuResourceHandler = new GpuResourceHandlerImpl(nmContext, mockCGroupsHandler, mockPrivilegedExecutor, gpuDiscoverer); @@ -379,7 +383,7 @@ public class TestGpuResourceHandler { public void testAllocationWithoutAllowedGpus() throws Exception { Configuration conf = createDefaultConfig(); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, " "); - gpuDiscoverer.initialize(conf); + gpuDiscoverer.initialize(conf, nvidiaBinaryHelper); try { gpuResourceHandler.bootstrap(conf); @@ -460,7 +464,7 @@ public class TestGpuResourceHandler { new GpuResourceHandlerImpl(nmnctx, mockCGroupsHandler, mockPrivilegedExecutor, gpuDiscoverer); - gpuDiscoverer.initialize(conf); + gpuDiscoverer.initialize(conf, nvidiaBinaryHelper); gpuNULLStateResourceHandler.bootstrap(conf); verifyNumberOfAvailableGpus(4, gpuNULLStateResourceHandler); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java index f0f100c1f8b..8261895b2a9 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java @@ -64,6 +64,7 @@ public class TestGpuDiscoverer { private static final String BASH_SHEBANG = "#!/bin/bash\n\n"; private static final String TEST_PARENT_DIR = new File("target/temp/" + TestGpuDiscoverer.class.getName()).getAbsolutePath(); + private NvidiaBinaryHelper binaryHelper = new NvidiaBinaryHelper(); @Rule public ExpectedException exception = ExpectedException.none(); @@ -150,7 +151,7 @@ public class TestGpuDiscoverer { Configuration conf) throws YarnException { conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, TEST_PARENT_DIR); GpuDiscoverer discoverer = new GpuDiscoverer(); - discoverer.initialize(conf); + discoverer.initialize(conf, binaryHelper); return discoverer; } @@ -163,14 +164,14 @@ public class TestGpuDiscoverer { // test case 1, check default setting. Configuration conf = new Configuration(false); GpuDiscoverer discoverer = new GpuDiscoverer(); - discoverer.initialize(conf); + discoverer.initialize(conf, binaryHelper); assertEquals(DEFAULT_BINARY_NAME, discoverer.getPathOfGpuBinary()); assertNvidiaIsOnPath(discoverer); // test case 2, check mandatory set path. File fakeBinary = setupFakeBinary(conf); discoverer = new GpuDiscoverer(); - discoverer.initialize(conf); + discoverer.initialize(conf, binaryHelper); assertEquals(fakeBinary.getAbsolutePath(), discoverer.getPathOfGpuBinary()); assertNull(discoverer.getEnvironmentToRunCommand().get(PATH)); @@ -179,7 +180,7 @@ public class TestGpuDiscoverer { // but binary doesn't exist so default path will be used. fakeBinary.delete(); discoverer = new GpuDiscoverer(); - discoverer.initialize(conf); + discoverer.initialize(conf, binaryHelper); assertEquals(DEFAULT_BINARY_NAME, discoverer.getPathOfGpuBinary()); assertNvidiaIsOnPath(discoverer); @@ -310,14 +311,14 @@ public class TestGpuDiscoverer { } @Test - public void testGpuDiscover() throws YarnException { + public void testGpuDiscover() throws YarnException, IOException { // Since this is more of a performance unit test, only run if // RunUserLimitThroughput is set (-DRunUserLimitThroughput=true) Assume.assumeTrue( Boolean.valueOf(System.getProperty("runGpuDiscoverUnitTest"))); Configuration conf = new Configuration(false); GpuDiscoverer discoverer = new GpuDiscoverer(); - discoverer.initialize(conf); + discoverer.initialize(conf, binaryHelper); GpuDeviceInformation info = discoverer.getGpuDeviceInformation(); assertTrue(info.getGpus().size() > 0); @@ -331,7 +332,7 @@ public class TestGpuDiscoverer { Configuration conf = createConfigWithAllowedDevices("1:2"); GpuDiscoverer discoverer = new GpuDiscoverer(); - discoverer.initialize(conf); + discoverer.initialize(conf, binaryHelper); List usableGpuDevices = discoverer.getGpusUsableByYarn(); assertEquals(1, usableGpuDevices.size()); @@ -346,7 +347,7 @@ public class TestGpuDiscoverer { exception.expect(GpuDeviceSpecificationException.class); GpuDiscoverer discoverer = new GpuDiscoverer(); - discoverer.initialize(conf); + discoverer.initialize(conf, binaryHelper); discoverer.getGpusUsableByYarn(); } @@ -354,7 +355,7 @@ public class TestGpuDiscoverer { public void testGetNumberOfUsableGpusFromConfig() throws YarnException { Configuration conf = createConfigWithAllowedDevices("0:0,1:1,2:2,3:4"); GpuDiscoverer discoverer = new GpuDiscoverer(); - discoverer.initialize(conf); + discoverer.initialize(conf, binaryHelper); List usableGpuDevices = discoverer.getGpusUsableByYarn(); assertEquals(4, usableGpuDevices.size()); @@ -379,7 +380,7 @@ public class TestGpuDiscoverer { exception.expect(GpuDeviceSpecificationException.class); GpuDiscoverer discoverer = new GpuDiscoverer(); - discoverer.initialize(conf); + discoverer.initialize(conf, binaryHelper); discoverer.getGpusUsableByYarn(); } @@ -390,7 +391,7 @@ public class TestGpuDiscoverer { exception.expect(GpuDeviceSpecificationException.class); GpuDiscoverer discoverer = new GpuDiscoverer(); - discoverer.initialize(conf); + discoverer.initialize(conf, binaryHelper); discoverer.getGpusUsableByYarn(); } @@ -401,7 +402,7 @@ public class TestGpuDiscoverer { exception.expect(GpuDeviceSpecificationException.class); GpuDiscoverer discoverer = new GpuDiscoverer(); - discoverer.initialize(conf); + discoverer.initialize(conf, binaryHelper); discoverer.getGpusUsableByYarn(); } @@ -412,7 +413,7 @@ public class TestGpuDiscoverer { exception.expect(GpuDeviceSpecificationException.class); GpuDiscoverer discoverer = new GpuDiscoverer(); - discoverer.initialize(conf); + discoverer.initialize(conf, binaryHelper); discoverer.getGpusUsableByYarn(); } @@ -423,7 +424,7 @@ public class TestGpuDiscoverer { exception.expect(GpuDeviceSpecificationException.class); GpuDiscoverer discoverer = new GpuDiscoverer(); - discoverer.initialize(conf); + discoverer.initialize(conf, binaryHelper); discoverer.getGpusUsableByYarn(); } @@ -434,7 +435,7 @@ public class TestGpuDiscoverer { exception.expect(GpuDeviceSpecificationException.class); GpuDiscoverer discoverer = new GpuDiscoverer(); - discoverer.initialize(conf); + discoverer.initialize(conf, binaryHelper); discoverer.getGpusUsableByYarn(); } @@ -445,7 +446,7 @@ public class TestGpuDiscoverer { exception.expect(GpuDeviceSpecificationException.class); GpuDiscoverer discoverer = new GpuDiscoverer(); - discoverer.initialize(conf); + discoverer.initialize(conf, binaryHelper); discoverer.getGpusUsableByYarn(); } @@ -456,7 +457,7 @@ public class TestGpuDiscoverer { exception.expect(GpuDeviceSpecificationException.class); GpuDiscoverer discoverer = new GpuDiscoverer(); - discoverer.initialize(conf); + discoverer.initialize(conf, binaryHelper); discoverer.getGpusUsableByYarn(); } @@ -467,7 +468,7 @@ public class TestGpuDiscoverer { exception.expect(GpuDeviceSpecificationException.class); GpuDiscoverer discoverer = new GpuDiscoverer(); - discoverer.initialize(conf); + discoverer.initialize(conf, binaryHelper); discoverer.getGpusUsableByYarn(); } @@ -478,7 +479,7 @@ public class TestGpuDiscoverer { exception.expect(GpuDeviceSpecificationException.class); GpuDiscoverer discoverer = new GpuDiscoverer(); - discoverer.initialize(conf); + discoverer.initialize(conf, binaryHelper); discoverer.getGpusUsableByYarn(); } @@ -488,7 +489,7 @@ public class TestGpuDiscoverer { conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, "/blabla"); GpuDiscoverer plugin = new GpuDiscoverer(); try { - plugin.initialize(conf); + plugin.initialize(conf, binaryHelper); plugin.getGpusUsableByYarn(); fail("Illegal format, should fail."); } catch (YarnException e) { @@ -501,15 +502,15 @@ public class TestGpuDiscoverer { } @Test - public void testScriptNotCalled() throws YarnException { + public void testScriptNotCalled() throws YarnException, IOException { Configuration conf = new Configuration(); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:1,2:3"); GpuDiscoverer gpuSpy = spy(new GpuDiscoverer()); - gpuSpy.initialize(conf); + gpuSpy.initialize(conf, binaryHelper); gpuSpy.getGpusUsableByYarn(); verify(gpuSpy, never()).getGpuDeviceInformation(); } -} +} \ No newline at end of file