YARN-9139. Simplify initializer code of GpuDiscoverer. Contributed by Szilard Nemeth.

This commit is contained in:
Sunil G 2019-03-01 19:27:03 +05:30
parent 3f3548b66a
commit d045f02a8d
4 changed files with 155 additions and 59 deletions

View File

@ -1620,9 +1620,6 @@ public class YarnConfiguration extends Configuration {
public static final String NM_GPU_PATH_TO_EXEC =
NM_GPU_RESOURCE_PREFIX + "path-to-discovery-executables";
@Private
public static final String DEFAULT_NM_GPU_PATH_TO_EXEC = "";
/**
* Settings to control which implementation of docker plugin for GPU will be
* used.

View File

@ -21,6 +21,7 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugi
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
@ -88,12 +89,6 @@ public class GpuDiscoverer {
throws YarnException {
validateConfOrThrowException();
if (null == pathOfGpuBinary) {
throw new YarnException(
"Failed to find GPU discovery executable, please double check "
+ YarnConfiguration.NM_GPU_PATH_TO_EXEC + " setting.");
}
if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) {
String msg =
"Failed to execute GPU device information detection script for "
@ -227,50 +222,17 @@ public class GpuDiscoverer {
}
}
public synchronized void initialize(Configuration conf) {
this.conf = conf;
public synchronized void initialize(Configuration config)
throws YarnException {
this.conf = config;
numOfErrorExecutionSinceLastSucceed = 0;
String pathToExecutable = conf.get(YarnConfiguration.NM_GPU_PATH_TO_EXEC,
YarnConfiguration.DEFAULT_NM_GPU_PATH_TO_EXEC);
if (pathToExecutable.isEmpty()) {
pathToExecutable = DEFAULT_BINARY_NAME;
}
File binaryPath = new File(pathToExecutable);
if (!binaryPath.exists()) {
// When binary not exist, use default setting.
boolean found = false;
for (String dir : DEFAULT_BINARY_SEARCH_DIRS) {
binaryPath = new File(dir, DEFAULT_BINARY_NAME);
if (binaryPath.exists()) {
found = true;
pathOfGpuBinary = binaryPath.getAbsolutePath();
break;
}
}
if (!found) {
LOG.warn("Failed to locate binary at:" + binaryPath.getAbsolutePath()
+ ", please double check [" + YarnConfiguration.NM_GPU_PATH_TO_EXEC
+ "] setting. Now use " + "default binary:" + DEFAULT_BINARY_NAME);
}
} else{
// If path specified by user is a directory, use
if (binaryPath.isDirectory()) {
binaryPath = new File(binaryPath, DEFAULT_BINARY_NAME);
LOG.warn("Specified path is a directory, use " + DEFAULT_BINARY_NAME
+ " under the directory, updated path-to-executable:" + binaryPath
.getAbsolutePath());
}
// Validated
pathOfGpuBinary = binaryPath.getAbsolutePath();
}
lookUpAutoDiscoveryBinary(config);
// Try to discover GPU information once and print
try {
LOG.info("Trying to discover GPU information ...");
GpuDeviceInformation info = getGpuDeviceInformation();
LOG.info(info.toString());
LOG.info("Discovered GPU information: " + info.toString());
} catch (YarnException e) {
String msg =
"Failed to discover GPU information from system, exception message:"
@ -279,6 +241,71 @@ public class GpuDiscoverer {
}
}
/**
 * Resolves the GPU discovery executable and stores its absolute path in
 * {@code pathOfGpuBinary}.
 *
 * The path is read from {@link YarnConfiguration#NM_GPU_PATH_TO_EXEC};
 * an unset or empty value falls back to {@code DEFAULT_BINARY_NAME}.
 * An existing regular file is used as is, an existing directory is
 * searched for the default binary name, and a non-existing path triggers
 * a lookup in the default search directories.
 *
 * @param config configuration to read the executable path from
 * @throws YarnException if no executable could be resolved
 */
private void lookUpAutoDiscoveryBinary(Configuration config)
    throws YarnException {
  final String rawPath = config.get(
      YarnConfiguration.NM_GPU_PATH_TO_EXEC, DEFAULT_BINARY_NAME);
  final File configuredFile =
      new File(rawPath.isEmpty() ? DEFAULT_BINARY_NAME : rawPath);

  final File resolved;
  if (configuredFile.exists()) {
    resolved = configuredFile.isDirectory()
        ? handleConfiguredBinaryPathIsDirectory(configuredFile)
        : configuredFile;
  } else {
    resolved = lookupBinaryInDefaultDirs();
  }

  pathOfGpuBinary = resolved.getAbsolutePath();
}
/**
 * Handles the case where the configured executable path is a directory:
 * the binary is then expected inside it under {@code DEFAULT_BINARY_NAME}.
 *
 * @param configuredBinaryFile the configured path, known to be a directory
 * @return the binary located inside the given directory
 * @throws YarnException if the directory has no file with the default
 *     binary name
 */
private File handleConfiguredBinaryPathIsDirectory(File configuredBinaryFile)
    throws YarnException {
  final File candidate = new File(configuredBinaryFile, DEFAULT_BINARY_NAME);
  if (candidate.exists()) {
    LOG.warn("Specified path is a directory, use " + DEFAULT_BINARY_NAME
        + " under the directory, updated path-to-executable:"
        + candidate.getAbsolutePath());
    return candidate;
  }
  throw new YarnException("Failed to find GPU discovery executable, "
      + "please double check " + YarnConfiguration.NM_GPU_PATH_TO_EXEC
      + " setting. The setting points to a directory but "
      + "no file found in the directory with name:" + DEFAULT_BINARY_NAME);
}
/**
 * Searches the default directories for the discovery binary and fails
 * if it is found in none of them.
 *
 * @return the binary found in one of the default search directories
 * @throws YarnException if the internal lookup returned {@code null}
 */
private File lookupBinaryInDefaultDirs() throws YarnException {
  final File found = lookupBinaryInDefaultDirsInternal();
  if (found != null) {
    return found;
  }
  throw new YarnException("Failed to find GPU discovery executable, "
      + "please double check " + YarnConfiguration.NM_GPU_PATH_TO_EXEC
      + " setting. Also tried to find the executable "
      + "in the default directories: " + DEFAULT_BINARY_SEARCH_DIRS);
}
/**
 * Looks for a file named {@code DEFAULT_BINARY_NAME} in each of
 * {@code DEFAULT_BINARY_SEARCH_DIRS} and returns the first one that exists.
 *
 * When nothing is found, a warning listing every path that was tried is
 * logged. The warning no longer claims that a default binary will be used:
 * the caller turns a {@code null} result into an exception, so no fallback
 * takes place.
 *
 * @return the first existing binary, or {@code null} if none exists
 */
private File lookupBinaryInDefaultDirsInternal() {
  // Insertion-ordered so the warning lists paths in the order they were
  // probed.
  Set<String> triedBinaryPaths = Sets.newLinkedHashSet();
  for (String dir : DEFAULT_BINARY_SEARCH_DIRS) {
    File binaryPath = new File(dir, DEFAULT_BINARY_NAME);
    if (binaryPath.exists()) {
      return binaryPath;
    }
    triedBinaryPaths.add(binaryPath.getAbsolutePath());
  }

  LOG.warn("Failed to locate GPU device discovery binary, tried paths: "
      + triedBinaryPaths + "! Please double check the value of config "
      + YarnConfiguration.NM_GPU_PATH_TO_EXEC + ".");
  return null;
}
@VisibleForTesting
Map<String, String> getEnvironmentToRunCommand() {
return environment;

View File

@ -18,6 +18,7 @@
package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
@ -40,11 +41,14 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin
import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerRuntimeConstants;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService;
import org.junit.After;
import org.apache.hadoop.yarn.util.resource.TestResourceUtils;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
@ -72,9 +76,42 @@ public class TestGpuResourceHandler {
private NMStateStoreService mockNMStateStore;
private ConcurrentHashMap<ContainerId, Container> runningContainersMap;
private GpuDiscoverer gpuDiscoverer;
private File testDataDirectory;
/**
 * Recreates an empty test data directory: any previous content is deleted
 * and the directory is created again.
 *
 * @throws IOException if the old directory cannot be deleted or the new
 *     one cannot be created
 */
public void createTestDataDirectory() throws IOException {
  String testDirectoryPath = getTestParentDirectory();
  testDataDirectory = new File(testDirectoryPath);
  FileUtils.deleteDirectory(testDataDirectory);
  // mkdirs() reports failure only through its return value; surface it
  // here instead of letting a later file creation fail with a less
  // obvious error.
  if (!testDataDirectory.mkdirs() && !testDataDirectory.isDirectory()) {
    throw new IOException("Failed to create test data directory: "
        + testDataDirectory.getAbsolutePath());
  }
}
/**
 * Returns the absolute path of the directory used for this test class's
 * files, located under {@code target/temp/} and named after the class.
 */
private String getTestParentDirectory() {
  final String relativePath =
      "target/temp/" + TestGpuResourceHandler.class.getName();
  return new File(relativePath).getAbsolutePath();
}
/**
 * Opens the given file for writing and closes it right away without
 * writing anything.
 *
 * @param f the file to open and close
 * @throws IOException if the file cannot be opened or closed
 */
private void touchFile(File f) throws IOException {
  try (FileOutputStream out = new FileOutputStream(f)) {
    // Nothing to write: opening the stream is the whole point.
  }
}
/**
 * Builds a {@link YarnConfiguration} whose GPU discovery executable path
 * points at a freshly created fake binary.
 *
 * @return the prepared configuration
 * @throws IOException if the fake binary cannot be created
 */
private Configuration createDefaultConfig() throws IOException {
  final Configuration config = new YarnConfiguration();
  final String fakeBinaryPath =
      setupFakeGpuDiscoveryBinary().getAbsolutePath();
  config.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, fakeBinaryPath);
  return config;
}
/**
 * Creates an empty file named {@code fake-nvidia-smi} in the test parent
 * directory to stand in for the GPU discovery binary.
 *
 * @return the created file
 * @throws IOException if the file cannot be created
 */
private File setupFakeGpuDiscoveryBinary() throws IOException {
  final String fakeBinaryPath = getTestParentDirectory() + "/fake-nvidia-smi";
  final File placeholder = new File(fakeBinaryPath);
  touchFile(placeholder);
  return placeholder;
}
@Before
public void setup() {
public void setup() throws IOException {
createTestDataDirectory();
TestResourceUtils.addNewTypesToResources(ResourceInformation.GPU_URI);
mockCGroupsHandler = mock(CGroupsHandler.class);
@ -94,9 +131,14 @@ public class TestGpuResourceHandler {
mockPrivilegedExecutor, gpuDiscoverer);
}
/**
 * Deletes the test data directory after each test so that files created
 * by one test do not remain on disk for the next one.
 *
 * @throws IOException if the directory cannot be deleted
 */
@After
public void cleanupTestFiles() throws IOException {
FileUtils.deleteDirectory(testDataDirectory);
}
@Test
public void testBootStrap() throws Exception {
Configuration conf = new YarnConfiguration();
Configuration conf = createDefaultConfig();
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0");
gpuDiscoverer.initialize(conf);
@ -161,7 +203,7 @@ public class TestGpuResourceHandler {
private void commonTestAllocation(boolean dockerContainerEnabled)
throws Exception {
Configuration conf = new YarnConfiguration();
Configuration conf = createDefaultConfig();
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
gpuDiscoverer.initialize(conf);
@ -250,7 +292,7 @@ public class TestGpuResourceHandler {
@Test
public void testAssignedGpuWillBeCleanedupWhenStoreOpFails()
throws Exception {
Configuration conf = new YarnConfiguration();
Configuration conf = createDefaultConfig();
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
gpuDiscoverer.initialize(conf);
@ -279,7 +321,7 @@ public class TestGpuResourceHandler {
@Test
public void testAllocationWithoutAllowedGpus() throws Exception {
Configuration conf = new YarnConfiguration();
Configuration conf = createDefaultConfig();
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, " ");
gpuDiscoverer.initialize(conf);
@ -314,7 +356,7 @@ public class TestGpuResourceHandler {
@Test
public void testAllocationStored() throws Exception {
Configuration conf = new YarnConfiguration();
Configuration conf = createDefaultConfig();
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
gpuDiscoverer.initialize(conf);
@ -353,7 +395,7 @@ public class TestGpuResourceHandler {
public void testAllocationStoredWithNULLStateStore() throws Exception {
NMNullStateStoreService mockNMNULLStateStore = mock(NMNullStateStoreService.class);
Configuration conf = new YarnConfiguration();
Configuration conf = createDefaultConfig();
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
Context nmnctx = mock(Context.class);
@ -382,7 +424,7 @@ public class TestGpuResourceHandler {
@Test
public void testRecoverResourceAllocation() throws Exception {
Configuration conf = new YarnConfiguration();
Configuration conf = createDefaultConfig();
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
gpuDiscoverer.initialize(conf);

View File

@ -38,6 +38,7 @@ import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
public class TestGpuDiscoverer {
@Rule
@ -52,6 +53,19 @@ public class TestGpuDiscoverer {
new FileOutputStream(f).close();
}
/**
 * Creates an empty file named {@code GpuDiscoverer.DEFAULT_BINARY_NAME} in
 * the test parent folder and sets that folder as the configured GPU
 * discovery executable path.
 *
 * @param conf configuration that receives the executable path setting
 * @return the fake binary file
 * @throws RuntimeException wrapping any failure during setup
 */
private File setupFakeBinary(Configuration conf) {
  try {
    final File placeholder = new File(getTestParentFolder(),
        GpuDiscoverer.DEFAULT_BINARY_NAME);
    touchFile(placeholder);
    conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, getTestParentFolder());
    return placeholder;
  } catch (Exception e) {
    throw new RuntimeException("Failed to init fake binary", e);
  }
}
@Before
public void before() throws IOException {
String folder = getTestParentFolder();
@ -63,6 +77,7 @@ public class TestGpuDiscoverer {
/**
 * Creates a configuration (without default resources) that has the given
 * allowed-devices value set and a fake discovery binary configured.
 *
 * @param s value for {@link YarnConfiguration#NM_GPU_ALLOWED_DEVICES}
 * @return the prepared configuration
 */
private Configuration createConfigWithAllowedDevices(String s) {
  final Configuration config = new Configuration(false);
  config.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, s);
  // Return value (the fake binary file) is not needed here.
  setupFakeBinary(config);
  return config;
}
@ -83,10 +98,7 @@ public class TestGpuDiscoverer {
plugin.getEnvironmentToRunCommand().get("PATH").contains("nvidia"));
// test case 2, check mandatory set path.
File fakeBinary = new File(getTestParentFolder(),
GpuDiscoverer.DEFAULT_BINARY_NAME);
touchFile(fakeBinary);
conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, getTestParentFolder());
File fakeBinary = setupFakeBinary(conf);
plugin = new GpuDiscoverer();
plugin.initialize(conf);
assertEquals(fakeBinary.getAbsolutePath(),
@ -276,4 +288,22 @@ public class TestGpuDiscoverer {
plugin.initialize(conf);
plugin.getGpusUsableByYarn();
}
/**
 * Checks that a {@link YarnException} with the expected message is raised
 * when the configured discovery executable path does not exist.
 *
 * NOTE(review): this presumably relies on the discovery binary also being
 * absent from the default search directories of the machine running the
 * test; confirm for hosts that have the real binary installed.
 */
@Test
public void testGpuBinaryIsANotExistingFile() {
  Configuration conf = new Configuration(false);
  conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, "/blabla");

  GpuDiscoverer plugin = new GpuDiscoverer();
  try {
    plugin.initialize(conf);
    plugin.getGpusUsableByYarn();
    // Message describes this scenario instead of the copy-pasted
    // "Illegal format" text.
    fail("GPU discovery binary does not exist, should fail.");
  } catch (YarnException e) {
    String message = e.getMessage();
    assertTrue(message.startsWith("Failed to find GPU discovery " +
        "executable, please double check"));
    assertTrue(message.contains("Also tried to find the " +
        "executable in the default directories:"));
  }
}
}