YARN-9139. Simplify initializer code of GpuDiscoverer. Contributed by Szilard Nemeth.
This commit is contained in:
parent
3f3548b66a
commit
d045f02a8d
|
@ -1620,9 +1620,6 @@ public class YarnConfiguration extends Configuration {
|
|||
public static final String NM_GPU_PATH_TO_EXEC =
|
||||
NM_GPU_RESOURCE_PREFIX + "path-to-discovery-executables";
|
||||
|
||||
@Private
|
||||
public static final String DEFAULT_NM_GPU_PATH_TO_EXEC = "";
|
||||
|
||||
/**
|
||||
* Settings to control which implementation of docker plugin for GPU will be
|
||||
* used.
|
||||
|
|
|
@ -21,6 +21,7 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugi
|
|||
import com.google.common.annotations.VisibleForTesting;
|
||||
import com.google.common.collect.ImmutableSet;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Sets;
|
||||
import org.apache.hadoop.classification.InterfaceAudience;
|
||||
import org.apache.hadoop.classification.InterfaceStability;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
|
@ -88,12 +89,6 @@ public class GpuDiscoverer {
|
|||
throws YarnException {
|
||||
validateConfOrThrowException();
|
||||
|
||||
if (null == pathOfGpuBinary) {
|
||||
throw new YarnException(
|
||||
"Failed to find GPU discovery executable, please double check "
|
||||
+ YarnConfiguration.NM_GPU_PATH_TO_EXEC + " setting.");
|
||||
}
|
||||
|
||||
if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) {
|
||||
String msg =
|
||||
"Failed to execute GPU device information detection script for "
|
||||
|
@ -227,50 +222,17 @@ public class GpuDiscoverer {
|
|||
}
|
||||
}
|
||||
|
||||
public synchronized void initialize(Configuration conf) {
|
||||
this.conf = conf;
|
||||
public synchronized void initialize(Configuration config)
|
||||
throws YarnException {
|
||||
this.conf = config;
|
||||
numOfErrorExecutionSinceLastSucceed = 0;
|
||||
String pathToExecutable = conf.get(YarnConfiguration.NM_GPU_PATH_TO_EXEC,
|
||||
YarnConfiguration.DEFAULT_NM_GPU_PATH_TO_EXEC);
|
||||
if (pathToExecutable.isEmpty()) {
|
||||
pathToExecutable = DEFAULT_BINARY_NAME;
|
||||
}
|
||||
|
||||
File binaryPath = new File(pathToExecutable);
|
||||
if (!binaryPath.exists()) {
|
||||
// When binary not exist, use default setting.
|
||||
boolean found = false;
|
||||
for (String dir : DEFAULT_BINARY_SEARCH_DIRS) {
|
||||
binaryPath = new File(dir, DEFAULT_BINARY_NAME);
|
||||
if (binaryPath.exists()) {
|
||||
found = true;
|
||||
pathOfGpuBinary = binaryPath.getAbsolutePath();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!found) {
|
||||
LOG.warn("Failed to locate binary at:" + binaryPath.getAbsolutePath()
|
||||
+ ", please double check [" + YarnConfiguration.NM_GPU_PATH_TO_EXEC
|
||||
+ "] setting. Now use " + "default binary:" + DEFAULT_BINARY_NAME);
|
||||
}
|
||||
} else{
|
||||
// If path specified by user is a directory, use
|
||||
if (binaryPath.isDirectory()) {
|
||||
binaryPath = new File(binaryPath, DEFAULT_BINARY_NAME);
|
||||
LOG.warn("Specified path is a directory, use " + DEFAULT_BINARY_NAME
|
||||
+ " under the directory, updated path-to-executable:" + binaryPath
|
||||
.getAbsolutePath());
|
||||
}
|
||||
// Validated
|
||||
pathOfGpuBinary = binaryPath.getAbsolutePath();
|
||||
}
|
||||
lookUpAutoDiscoveryBinary(config);
|
||||
|
||||
// Try to discover GPU information once and print
|
||||
try {
|
||||
LOG.info("Trying to discover GPU information ...");
|
||||
GpuDeviceInformation info = getGpuDeviceInformation();
|
||||
LOG.info(info.toString());
|
||||
LOG.info("Discovered GPU information: " + info.toString());
|
||||
} catch (YarnException e) {
|
||||
String msg =
|
||||
"Failed to discover GPU information from system, exception message:"
|
||||
|
@ -279,6 +241,71 @@ public class GpuDiscoverer {
|
|||
}
|
||||
}
|
||||
|
||||
private void lookUpAutoDiscoveryBinary(Configuration config)
|
||||
throws YarnException {
|
||||
String configuredBinaryPath = config.get(
|
||||
YarnConfiguration.NM_GPU_PATH_TO_EXEC, DEFAULT_BINARY_NAME);
|
||||
if (configuredBinaryPath.isEmpty()) {
|
||||
configuredBinaryPath = DEFAULT_BINARY_NAME;
|
||||
}
|
||||
|
||||
File binaryPath;
|
||||
File configuredBinaryFile = new File(configuredBinaryPath);
|
||||
if (!configuredBinaryFile.exists()) {
|
||||
binaryPath = lookupBinaryInDefaultDirs();
|
||||
} else if (configuredBinaryFile.isDirectory()) {
|
||||
binaryPath = handleConfiguredBinaryPathIsDirectory(configuredBinaryFile);
|
||||
} else {
|
||||
binaryPath = configuredBinaryFile;
|
||||
}
|
||||
pathOfGpuBinary = binaryPath.getAbsolutePath();
|
||||
}
|
||||
|
||||
private File handleConfiguredBinaryPathIsDirectory(File configuredBinaryFile)
|
||||
throws YarnException {
|
||||
File binaryPath = new File(configuredBinaryFile, DEFAULT_BINARY_NAME);
|
||||
if (!binaryPath.exists()) {
|
||||
throw new YarnException("Failed to find GPU discovery executable, " +
|
||||
"please double check "+ YarnConfiguration.NM_GPU_PATH_TO_EXEC +
|
||||
" setting. The setting points to a directory but " +
|
||||
"no file found in the directory with name:" + DEFAULT_BINARY_NAME);
|
||||
} else {
|
||||
LOG.warn("Specified path is a directory, use " + DEFAULT_BINARY_NAME
|
||||
+ " under the directory, updated path-to-executable:"
|
||||
+ binaryPath.getAbsolutePath());
|
||||
}
|
||||
return binaryPath;
|
||||
}
|
||||
|
||||
private File lookupBinaryInDefaultDirs() throws YarnException {
|
||||
final File lookedUpBinary = lookupBinaryInDefaultDirsInternal();
|
||||
if (lookedUpBinary == null) {
|
||||
throw new YarnException("Failed to find GPU discovery executable, " +
|
||||
"please double check " + YarnConfiguration.NM_GPU_PATH_TO_EXEC +
|
||||
" setting. Also tried to find the executable " +
|
||||
"in the default directories: " + DEFAULT_BINARY_SEARCH_DIRS);
|
||||
}
|
||||
return lookedUpBinary;
|
||||
}
|
||||
|
||||
private File lookupBinaryInDefaultDirsInternal() {
|
||||
Set<String> triedBinaryPaths = Sets.newHashSet();
|
||||
for (String dir : DEFAULT_BINARY_SEARCH_DIRS) {
|
||||
File binaryPath = new File(dir, DEFAULT_BINARY_NAME);
|
||||
if (binaryPath.exists()) {
|
||||
return binaryPath;
|
||||
} else {
|
||||
triedBinaryPaths.add(binaryPath.getAbsolutePath());
|
||||
}
|
||||
}
|
||||
LOG.warn("Failed to locate GPU device discovery binary, tried paths: "
|
||||
+ triedBinaryPaths + "! Please double check the value of config "
|
||||
+ YarnConfiguration.NM_GPU_PATH_TO_EXEC +
|
||||
". Using default binary: " + DEFAULT_BINARY_NAME);
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
Map<String, String> getEnvironmentToRunCommand() {
|
||||
return environment;
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
|
||||
package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.util.StringUtils;
|
||||
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
|
||||
|
@ -40,11 +41,14 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin
|
|||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerRuntimeConstants;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService;
|
||||
import org.junit.After;
|
||||
import org.apache.hadoop.yarn.util.resource.TestResourceUtils;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
|
@ -72,9 +76,42 @@ public class TestGpuResourceHandler {
|
|||
private NMStateStoreService mockNMStateStore;
|
||||
private ConcurrentHashMap<ContainerId, Container> runningContainersMap;
|
||||
private GpuDiscoverer gpuDiscoverer;
|
||||
private File testDataDirectory;
|
||||
|
||||
public void createTestDataDirectory() throws IOException {
|
||||
String testDirectoryPath = getTestParentDirectory();
|
||||
testDataDirectory = new File(testDirectoryPath);
|
||||
FileUtils.deleteDirectory(testDataDirectory);
|
||||
testDataDirectory.mkdirs();
|
||||
}
|
||||
|
||||
private String getTestParentDirectory() {
|
||||
File f = new File("target/temp/" + TestGpuResourceHandler.class.getName());
|
||||
return f.getAbsolutePath();
|
||||
}
|
||||
|
||||
private void touchFile(File f) throws IOException {
|
||||
new FileOutputStream(f).close();
|
||||
}
|
||||
|
||||
private Configuration createDefaultConfig() throws IOException {
|
||||
Configuration conf = new YarnConfiguration();
|
||||
File fakeBinary = setupFakeGpuDiscoveryBinary();
|
||||
conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC,
|
||||
fakeBinary.getAbsolutePath());
|
||||
return conf;
|
||||
}
|
||||
|
||||
private File setupFakeGpuDiscoveryBinary() throws IOException {
|
||||
File fakeBinary = new File(getTestParentDirectory() + "/fake-nvidia-smi");
|
||||
touchFile(fakeBinary);
|
||||
return fakeBinary;
|
||||
}
|
||||
|
||||
@Before
|
||||
public void setup() {
|
||||
public void setup() throws IOException {
|
||||
createTestDataDirectory();
|
||||
|
||||
TestResourceUtils.addNewTypesToResources(ResourceInformation.GPU_URI);
|
||||
|
||||
mockCGroupsHandler = mock(CGroupsHandler.class);
|
||||
|
@ -94,9 +131,14 @@ public class TestGpuResourceHandler {
|
|||
mockPrivilegedExecutor, gpuDiscoverer);
|
||||
}
|
||||
|
||||
@After
|
||||
public void cleanupTestFiles() throws IOException {
|
||||
FileUtils.deleteDirectory(testDataDirectory);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBootStrap() throws Exception {
|
||||
Configuration conf = new YarnConfiguration();
|
||||
Configuration conf = createDefaultConfig();
|
||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0");
|
||||
|
||||
gpuDiscoverer.initialize(conf);
|
||||
|
@ -161,7 +203,7 @@ public class TestGpuResourceHandler {
|
|||
|
||||
private void commonTestAllocation(boolean dockerContainerEnabled)
|
||||
throws Exception {
|
||||
Configuration conf = new YarnConfiguration();
|
||||
Configuration conf = createDefaultConfig();
|
||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
|
||||
gpuDiscoverer.initialize(conf);
|
||||
|
||||
|
@ -250,7 +292,7 @@ public class TestGpuResourceHandler {
|
|||
@Test
|
||||
public void testAssignedGpuWillBeCleanedupWhenStoreOpFails()
|
||||
throws Exception {
|
||||
Configuration conf = new YarnConfiguration();
|
||||
Configuration conf = createDefaultConfig();
|
||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
|
||||
gpuDiscoverer.initialize(conf);
|
||||
|
||||
|
@ -279,7 +321,7 @@ public class TestGpuResourceHandler {
|
|||
|
||||
@Test
|
||||
public void testAllocationWithoutAllowedGpus() throws Exception {
|
||||
Configuration conf = new YarnConfiguration();
|
||||
Configuration conf = createDefaultConfig();
|
||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, " ");
|
||||
gpuDiscoverer.initialize(conf);
|
||||
|
||||
|
@ -314,7 +356,7 @@ public class TestGpuResourceHandler {
|
|||
|
||||
@Test
|
||||
public void testAllocationStored() throws Exception {
|
||||
Configuration conf = new YarnConfiguration();
|
||||
Configuration conf = createDefaultConfig();
|
||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
|
||||
gpuDiscoverer.initialize(conf);
|
||||
|
||||
|
@ -353,7 +395,7 @@ public class TestGpuResourceHandler {
|
|||
public void testAllocationStoredWithNULLStateStore() throws Exception {
|
||||
NMNullStateStoreService mockNMNULLStateStore = mock(NMNullStateStoreService.class);
|
||||
|
||||
Configuration conf = new YarnConfiguration();
|
||||
Configuration conf = createDefaultConfig();
|
||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
|
||||
|
||||
Context nmnctx = mock(Context.class);
|
||||
|
@ -382,7 +424,7 @@ public class TestGpuResourceHandler {
|
|||
|
||||
@Test
|
||||
public void testRecoverResourceAllocation() throws Exception {
|
||||
Configuration conf = new YarnConfiguration();
|
||||
Configuration conf = createDefaultConfig();
|
||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
|
||||
gpuDiscoverer.initialize(conf);
|
||||
|
||||
|
|
|
@ -38,6 +38,7 @@ import static org.junit.Assert.assertEquals;
|
|||
import static org.junit.Assert.assertNotNull;
|
||||
import static org.junit.Assert.assertNull;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
import static org.junit.Assert.fail;
|
||||
|
||||
public class TestGpuDiscoverer {
|
||||
@Rule
|
||||
|
@ -52,6 +53,19 @@ public class TestGpuDiscoverer {
|
|||
new FileOutputStream(f).close();
|
||||
}
|
||||
|
||||
private File setupFakeBinary(Configuration conf) {
|
||||
File fakeBinary;
|
||||
try {
|
||||
fakeBinary = new File(getTestParentFolder(),
|
||||
GpuDiscoverer.DEFAULT_BINARY_NAME);
|
||||
touchFile(fakeBinary);
|
||||
conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, getTestParentFolder());
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException("Failed to init fake binary", e);
|
||||
}
|
||||
return fakeBinary;
|
||||
}
|
||||
|
||||
@Before
|
||||
public void before() throws IOException {
|
||||
String folder = getTestParentFolder();
|
||||
|
@ -63,6 +77,7 @@ public class TestGpuDiscoverer {
|
|||
private Configuration createConfigWithAllowedDevices(String s) {
|
||||
Configuration conf = new Configuration(false);
|
||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, s);
|
||||
setupFakeBinary(conf);
|
||||
return conf;
|
||||
}
|
||||
|
||||
|
@ -83,10 +98,7 @@ public class TestGpuDiscoverer {
|
|||
plugin.getEnvironmentToRunCommand().get("PATH").contains("nvidia"));
|
||||
|
||||
// test case 2, check mandatory set path.
|
||||
File fakeBinary = new File(getTestParentFolder(),
|
||||
GpuDiscoverer.DEFAULT_BINARY_NAME);
|
||||
touchFile(fakeBinary);
|
||||
conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, getTestParentFolder());
|
||||
File fakeBinary = setupFakeBinary(conf);
|
||||
plugin = new GpuDiscoverer();
|
||||
plugin.initialize(conf);
|
||||
assertEquals(fakeBinary.getAbsolutePath(),
|
||||
|
@ -276,4 +288,22 @@ public class TestGpuDiscoverer {
|
|||
plugin.initialize(conf);
|
||||
plugin.getGpusUsableByYarn();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGpuBinaryIsANotExistingFile() {
|
||||
Configuration conf = new Configuration(false);
|
||||
conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, "/blabla");
|
||||
GpuDiscoverer plugin = new GpuDiscoverer();
|
||||
try {
|
||||
plugin.initialize(conf);
|
||||
plugin.getGpusUsableByYarn();
|
||||
fail("Illegal format, should fail.");
|
||||
} catch (YarnException e) {
|
||||
String message = e.getMessage();
|
||||
assertTrue(message.startsWith("Failed to find GPU discovery " +
|
||||
"executable, please double check"));
|
||||
assertTrue(message.contains("Also tried to find the " +
|
||||
"executable in the default directories:"));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue