diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index 36fafefdbc4..c0ca23130c4 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -1581,9 +1581,6 @@ public class YarnConfiguration extends Configuration { public static final String NM_GPU_PATH_TO_EXEC = NM_GPU_RESOURCE_PREFIX + "path-to-discovery-executables"; - @Private - public static final String DEFAULT_NM_GPU_PATH_TO_EXEC = ""; - /** * Settings to control which implementation of docker plugin for GPU will be * used. diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java index 334a86c2c82..95e51e52183 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java @@ -21,6 +21,7 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugi import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Lists; +import com.google.common.collect.Sets; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; @@ -88,12 +89,6 @@ public class GpuDiscoverer { throws YarnException { validateConfOrThrowException(); - if (null == pathOfGpuBinary) { - throw new YarnException( - "Failed to find GPU discovery executable, please double check " - + YarnConfiguration.NM_GPU_PATH_TO_EXEC + " setting."); - } - if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) { String msg = "Failed to execute GPU device information detection script for " @@ -227,50 +222,17 @@ public class GpuDiscoverer { } } - public synchronized void initialize(Configuration conf) { - this.conf = conf; + public synchronized void initialize(Configuration config) + throws YarnException { + this.conf = config; numOfErrorExecutionSinceLastSucceed = 0; - String pathToExecutable = conf.get(YarnConfiguration.NM_GPU_PATH_TO_EXEC, - YarnConfiguration.DEFAULT_NM_GPU_PATH_TO_EXEC); - if (pathToExecutable.isEmpty()) { - pathToExecutable = DEFAULT_BINARY_NAME; - } - - File binaryPath = new File(pathToExecutable); - if (!binaryPath.exists()) { - // When binary not exist, use default setting. - boolean found = false; - for (String dir : DEFAULT_BINARY_SEARCH_DIRS) { - binaryPath = new File(dir, DEFAULT_BINARY_NAME); - if (binaryPath.exists()) { - found = true; - pathOfGpuBinary = binaryPath.getAbsolutePath(); - break; - } - } - - if (!found) { - LOG.warn("Failed to locate binary at:" + binaryPath.getAbsolutePath() - + ", please double check [" + YarnConfiguration.NM_GPU_PATH_TO_EXEC - + "] setting. Now use " + "default binary:" + DEFAULT_BINARY_NAME); - } - } else{ - // If path specified by user is a directory, use - if (binaryPath.isDirectory()) { - binaryPath = new File(binaryPath, DEFAULT_BINARY_NAME); - LOG.warn("Specified path is a directory, use " + DEFAULT_BINARY_NAME - + " under the directory, updated path-to-executable:" + binaryPath - .getAbsolutePath()); - } - // Validated - pathOfGpuBinary = binaryPath.getAbsolutePath(); - } + lookUpAutoDiscoveryBinary(config); // Try to discover GPU information once and print try { LOG.info("Trying to discover GPU information ..."); GpuDeviceInformation info = getGpuDeviceInformation(); - LOG.info(info.toString()); + LOG.info("Discovered GPU information: " + info.toString()); } catch (YarnException e) { String msg = "Failed to discover GPU information from system, exception message:" @@ -279,6 +241,71 @@ public class GpuDiscoverer { } } + private void lookUpAutoDiscoveryBinary(Configuration config) + throws YarnException { + String configuredBinaryPath = config.get( + YarnConfiguration.NM_GPU_PATH_TO_EXEC, DEFAULT_BINARY_NAME); + if (configuredBinaryPath.isEmpty()) { + configuredBinaryPath = DEFAULT_BINARY_NAME; + } + + File binaryPath; + File configuredBinaryFile = new File(configuredBinaryPath); + if (!configuredBinaryFile.exists()) { + binaryPath = lookupBinaryInDefaultDirs(); + } else if (configuredBinaryFile.isDirectory()) { + binaryPath = handleConfiguredBinaryPathIsDirectory(configuredBinaryFile); + } else { + binaryPath = configuredBinaryFile; + } + pathOfGpuBinary = binaryPath.getAbsolutePath(); + } + + private File handleConfiguredBinaryPathIsDirectory(File configuredBinaryFile) + throws YarnException { + File binaryPath = new File(configuredBinaryFile, DEFAULT_BINARY_NAME); + if (!binaryPath.exists()) { + throw new YarnException("Failed to find GPU discovery executable, " + + "please double check "+ YarnConfiguration.NM_GPU_PATH_TO_EXEC + + " setting. The setting points to a directory but " + + "no file found in the directory with name:" + DEFAULT_BINARY_NAME); + } else { + LOG.warn("Specified path is a directory, use " + DEFAULT_BINARY_NAME + + " under the directory, updated path-to-executable:" + + binaryPath.getAbsolutePath()); + } + return binaryPath; + } + + private File lookupBinaryInDefaultDirs() throws YarnException { + final File lookedUpBinary = lookupBinaryInDefaultDirsInternal(); + if (lookedUpBinary == null) { + throw new YarnException("Failed to find GPU discovery executable, " + + "please double check " + YarnConfiguration.NM_GPU_PATH_TO_EXEC + + " setting. Also tried to find the executable " + + "in the default directories: " + DEFAULT_BINARY_SEARCH_DIRS); + } + return lookedUpBinary; + } + + private File lookupBinaryInDefaultDirsInternal() { + Set triedBinaryPaths = Sets.newHashSet(); + for (String dir : DEFAULT_BINARY_SEARCH_DIRS) { + File binaryPath = new File(dir, DEFAULT_BINARY_NAME); + if (binaryPath.exists()) { + return binaryPath; + } else { + triedBinaryPaths.add(binaryPath.getAbsolutePath()); + } + } + LOG.warn("Failed to locate GPU device discovery binary, tried paths: " + + triedBinaryPaths + "! Please double check the value of config " + + YarnConfiguration.NM_GPU_PATH_TO_EXEC + + ". Using default binary: " + DEFAULT_BINARY_NAME); + + return null; + } + @VisibleForTesting Map getEnvironmentToRunCommand() { return environment; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java index 0141c72af90..d5aae946d03 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java @@ -18,6 +18,7 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu; +import org.apache.commons.io.FileUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; @@ -40,11 +41,14 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerRuntimeConstants; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService; +import org.junit.After; import org.apache.hadoop.yarn.util.resource.TestResourceUtils; import org.junit.Assert; import org.junit.Before; import org.junit.Test; +import java.io.File; +import java.io.FileOutputStream; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; @@ -72,9 +76,42 @@ public class TestGpuResourceHandler { private NMStateStoreService mockNMStateStore; private ConcurrentHashMap runningContainersMap; private GpuDiscoverer gpuDiscoverer; + private File testDataDirectory; + + public void createTestDataDirectory() throws IOException { + String testDirectoryPath = getTestParentDirectory(); + testDataDirectory = new File(testDirectoryPath); + FileUtils.deleteDirectory(testDataDirectory); + testDataDirectory.mkdirs(); + } + + private String getTestParentDirectory() { + File f = new File("target/temp/" + TestGpuResourceHandler.class.getName()); + return f.getAbsolutePath(); + } + + private void touchFile(File f) throws IOException { + new FileOutputStream(f).close(); + } + + private Configuration createDefaultConfig() throws IOException { + Configuration conf = new YarnConfiguration(); + File fakeBinary = setupFakeGpuDiscoveryBinary(); + conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, + fakeBinary.getAbsolutePath()); + return conf; + } + + private File setupFakeGpuDiscoveryBinary() throws IOException { + File fakeBinary = new File(getTestParentDirectory() + "/fake-nvidia-smi"); + touchFile(fakeBinary); + return fakeBinary; + } @Before - public void setup() { + public void setup() throws IOException { + createTestDataDirectory(); + TestResourceUtils.addNewTypesToResources(ResourceInformation.GPU_URI); mockCGroupsHandler = mock(CGroupsHandler.class); @@ -91,9 +128,14 @@ public class TestGpuResourceHandler { mockPrivilegedExecutor, gpuDiscoverer); } + @After + public void cleanupTestFiles() throws IOException { + FileUtils.deleteDirectory(testDataDirectory); + } + @Test public void testBootStrap() throws Exception { - Configuration conf = new YarnConfiguration(); + Configuration conf = createDefaultConfig(); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0"); gpuDiscoverer.initialize(conf); @@ -157,7 +199,7 @@ public class TestGpuResourceHandler { private void commonTestAllocation(boolean dockerContainerEnabled) throws Exception { - Configuration conf = new YarnConfiguration(); + Configuration conf = createDefaultConfig(); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4"); gpuDiscoverer.initialize(conf); @@ -246,7 +288,7 @@ public class TestGpuResourceHandler { @Test public void testAssignedGpuWillBeCleanedupWhenStoreOpFails() throws Exception { - Configuration conf = new YarnConfiguration(); + Configuration conf = createDefaultConfig(); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4"); gpuDiscoverer.initialize(conf); @@ -275,7 +317,7 @@ public class TestGpuResourceHandler { @Test public void testAllocationWithoutAllowedGpus() throws Exception { - Configuration conf = new YarnConfiguration(); + Configuration conf = createDefaultConfig(); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, " "); gpuDiscoverer.initialize(conf); @@ -310,7 +352,7 @@ public class TestGpuResourceHandler { @Test public void testAllocationStored() throws Exception { - Configuration conf = new YarnConfiguration(); + Configuration conf = createDefaultConfig(); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4"); gpuDiscoverer.initialize(conf); @@ -356,7 +398,7 @@ public class TestGpuResourceHandler { new GpuResourceHandlerImpl(nmnctx, mockCGroupsHandler, mockPrivilegedExecutor, gpuDiscoverer); - Configuration conf = new YarnConfiguration(); + Configuration conf = createDefaultConfig(); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4"); gpuDiscoverer.initialize(conf); @@ -376,7 +418,7 @@ public class TestGpuResourceHandler { @Test public void testRecoverResourceAllocation() throws Exception { - Configuration conf = new YarnConfiguration(); + Configuration conf = createDefaultConfig(); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4"); gpuDiscoverer.initialize(conf); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java index cbbfded374d..ecc9c7bbf27 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java @@ -38,6 +38,7 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; public class TestGpuDiscoverer { @Rule @@ -52,6 +53,19 @@ public class TestGpuDiscoverer { new FileOutputStream(f).close(); } + private File setupFakeBinary(Configuration conf) { + File fakeBinary; + try { + fakeBinary = new File(getTestParentFolder(), + GpuDiscoverer.DEFAULT_BINARY_NAME); + touchFile(fakeBinary); + conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, getTestParentFolder()); + } catch (Exception e) { + throw new RuntimeException("Failed to init fake binary", e); + } + return fakeBinary; + } + @Before public void before() throws IOException { String folder = getTestParentFolder(); @@ -63,6 +77,7 @@ public class TestGpuDiscoverer { private Configuration createConfigWithAllowedDevices(String s) { Configuration conf = new Configuration(false); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, s); + setupFakeBinary(conf); return conf; } @@ -83,10 +98,7 @@ public class TestGpuDiscoverer { plugin.getEnvironmentToRunCommand().get("PATH").contains("nvidia")); // test case 2, check mandatory set path. - File fakeBinary = new File(getTestParentFolder(), - GpuDiscoverer.DEFAULT_BINARY_NAME); - touchFile(fakeBinary); - conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, getTestParentFolder()); + File fakeBinary = setupFakeBinary(conf); plugin = new GpuDiscoverer(); plugin.initialize(conf); assertEquals(fakeBinary.getAbsolutePath(), @@ -276,4 +288,22 @@ public class TestGpuDiscoverer { plugin.initialize(conf); plugin.getGpusUsableByYarn(); } + + @Test + public void testGpuBinaryIsANotExistingFile() { + Configuration conf = new Configuration(false); + conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, "/blabla"); + GpuDiscoverer plugin = new GpuDiscoverer(); + try { + plugin.initialize(conf); + plugin.getGpusUsableByYarn(); + fail("Illegal format, should fail."); + } catch (YarnException e) { + String message = e.getMessage(); + assertTrue(message.startsWith("Failed to find GPU discovery " + + "executable, please double check")); + assertTrue(message.contains("Also tried to find the " + + "executable in the default directories:")); + } + } }