YARN-9139. Simplify initializer code of GpuDiscoverer. Contributed by Szilard Nemeth.

This commit is contained in:
Sunil G 2019-03-01 19:27:03 +05:30
parent 3f3548b66a
commit d045f02a8d
4 changed files with 155 additions and 59 deletions

View File

@ -1620,9 +1620,6 @@ public class YarnConfiguration extends Configuration {
public static final String NM_GPU_PATH_TO_EXEC = public static final String NM_GPU_PATH_TO_EXEC =
NM_GPU_RESOURCE_PREFIX + "path-to-discovery-executables"; NM_GPU_RESOURCE_PREFIX + "path-to-discovery-executables";
@Private
public static final String DEFAULT_NM_GPU_PATH_TO_EXEC = "";
/** /**
* Settings to control which implementation of docker plugin for GPU will be * Settings to control which implementation of docker plugin for GPU will be
* used. * used.

View File

@ -21,6 +21,7 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugi
import com.google.common.annotations.VisibleForTesting; import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableSet; import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
@ -88,12 +89,6 @@ public class GpuDiscoverer {
throws YarnException { throws YarnException {
validateConfOrThrowException(); validateConfOrThrowException();
if (null == pathOfGpuBinary) {
throw new YarnException(
"Failed to find GPU discovery executable, please double check "
+ YarnConfiguration.NM_GPU_PATH_TO_EXEC + " setting.");
}
if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) { if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) {
String msg = String msg =
"Failed to execute GPU device information detection script for " "Failed to execute GPU device information detection script for "
@ -227,50 +222,17 @@ public class GpuDiscoverer {
} }
} }
public synchronized void initialize(Configuration conf) { public synchronized void initialize(Configuration config)
this.conf = conf; throws YarnException {
this.conf = config;
numOfErrorExecutionSinceLastSucceed = 0; numOfErrorExecutionSinceLastSucceed = 0;
String pathToExecutable = conf.get(YarnConfiguration.NM_GPU_PATH_TO_EXEC, lookUpAutoDiscoveryBinary(config);
YarnConfiguration.DEFAULT_NM_GPU_PATH_TO_EXEC);
if (pathToExecutable.isEmpty()) {
pathToExecutable = DEFAULT_BINARY_NAME;
}
File binaryPath = new File(pathToExecutable);
if (!binaryPath.exists()) {
// When binary not exist, use default setting.
boolean found = false;
for (String dir : DEFAULT_BINARY_SEARCH_DIRS) {
binaryPath = new File(dir, DEFAULT_BINARY_NAME);
if (binaryPath.exists()) {
found = true;
pathOfGpuBinary = binaryPath.getAbsolutePath();
break;
}
}
if (!found) {
LOG.warn("Failed to locate binary at:" + binaryPath.getAbsolutePath()
+ ", please double check [" + YarnConfiguration.NM_GPU_PATH_TO_EXEC
+ "] setting. Now use " + "default binary:" + DEFAULT_BINARY_NAME);
}
} else{
// If path specified by user is a directory, use
if (binaryPath.isDirectory()) {
binaryPath = new File(binaryPath, DEFAULT_BINARY_NAME);
LOG.warn("Specified path is a directory, use " + DEFAULT_BINARY_NAME
+ " under the directory, updated path-to-executable:" + binaryPath
.getAbsolutePath());
}
// Validated
pathOfGpuBinary = binaryPath.getAbsolutePath();
}
// Try to discover GPU information once and print // Try to discover GPU information once and print
try { try {
LOG.info("Trying to discover GPU information ..."); LOG.info("Trying to discover GPU information ...");
GpuDeviceInformation info = getGpuDeviceInformation(); GpuDeviceInformation info = getGpuDeviceInformation();
LOG.info(info.toString()); LOG.info("Discovered GPU information: " + info.toString());
} catch (YarnException e) { } catch (YarnException e) {
String msg = String msg =
"Failed to discover GPU information from system, exception message:" "Failed to discover GPU information from system, exception message:"
@ -279,6 +241,71 @@ public class GpuDiscoverer {
} }
} }
/**
 * Resolves the GPU discovery executable and stores its absolute path in
 * {@code pathOfGpuBinary}.
 *
 * The value of {@link YarnConfiguration#NM_GPU_PATH_TO_EXEC} is read from
 * the given configuration; a missing or empty value is replaced with
 * {@code DEFAULT_BINARY_NAME}. Resolution then proceeds as follows:
 * <ul>
 *   <li>path does not exist: search the default directories,</li>
 *   <li>path is a directory: use {@code DEFAULT_BINARY_NAME} inside it,</li>
 *   <li>otherwise: use the configured path as is.</li>
 * </ul>
 *
 * @param config configuration to read the executable path from
 * @throws YarnException if no executable could be located
 */
private void lookUpAutoDiscoveryBinary(Configuration config)
    throws YarnException {
  final String rawSetting = config.get(
      YarnConfiguration.NM_GPU_PATH_TO_EXEC, DEFAULT_BINARY_NAME);
  final String effectivePath =
      rawSetting.isEmpty() ? DEFAULT_BINARY_NAME : rawSetting;
  final File candidate = new File(effectivePath);

  final File resolved;
  if (candidate.exists()) {
    // An existing directory is only a hint where the binary lives.
    resolved = candidate.isDirectory()
        ? handleConfiguredBinaryPathIsDirectory(candidate)
        : candidate;
  } else {
    resolved = lookupBinaryInDefaultDirs();
  }
  pathOfGpuBinary = resolved.getAbsolutePath();
}
/**
 * Handles the case where the configured executable path is a directory by
 * looking for {@code DEFAULT_BINARY_NAME} directly inside that directory.
 *
 * @param configuredBinaryFile the configured path, expected to be a directory
 * @return the file named {@code DEFAULT_BINARY_NAME} under the directory
 * @throws YarnException if no such file exists in the directory
 */
private File handleConfiguredBinaryPathIsDirectory(File configuredBinaryFile)
    throws YarnException {
  final File binaryInDir =
      new File(configuredBinaryFile, DEFAULT_BINARY_NAME);
  if (binaryInDir.exists()) {
    LOG.warn("Specified path is a directory, use " + DEFAULT_BINARY_NAME
        + " under the directory, updated path-to-executable:"
        + binaryInDir.getAbsolutePath());
    return binaryInDir;
  }
  throw new YarnException("Failed to find GPU discovery executable, "
      + "please double check " + YarnConfiguration.NM_GPU_PATH_TO_EXEC
      + " setting. The setting points to a directory but "
      + "no file found in the directory with name:" + DEFAULT_BINARY_NAME);
}
/**
 * Searches the default directories for the discovery executable and turns
 * a failed search into an exception.
 *
 * @return the executable found in one of the default directories
 * @throws YarnException if the executable is in none of the directories
 *         listed in {@code DEFAULT_BINARY_SEARCH_DIRS}
 */
private File lookupBinaryInDefaultDirs() throws YarnException {
  final File found = lookupBinaryInDefaultDirsInternal();
  if (found != null) {
    return found;
  }
  throw new YarnException("Failed to find GPU discovery executable, "
      + "please double check " + YarnConfiguration.NM_GPU_PATH_TO_EXEC
      + " setting. Also tried to find the executable "
      + "in the default directories: " + DEFAULT_BINARY_SEARCH_DIRS);
}
/**
 * Probes every directory in {@code DEFAULT_BINARY_SEARCH_DIRS} for a file
 * named {@code DEFAULT_BINARY_NAME}.
 *
 * @return the first candidate that exists, or {@code null} if none of the
 *         probed paths exists (a warning listing the probed paths is
 *         logged in that case)
 */
private File lookupBinaryInDefaultDirsInternal() {
  // Insertion-ordered so the warning lists paths in the order probed.
  Set<String> triedBinaryPaths = Sets.newLinkedHashSet();
  for (String dir : DEFAULT_BINARY_SEARCH_DIRS) {
    File binaryPath = new File(dir, DEFAULT_BINARY_NAME);
    if (binaryPath.exists()) {
      return binaryPath;
    }
    triedBinaryPaths.add(binaryPath.getAbsolutePath());
  }
  // No fallback to a bare default binary happens after this point: the
  // caller converts the null result into a YarnException, so the message
  // must not claim that a default binary is going to be used.
  LOG.warn("Failed to locate GPU device discovery binary, tried paths: "
      + triedBinaryPaths + "! Please double check the value of config "
      + YarnConfiguration.NM_GPU_PATH_TO_EXEC + ".");
  return null;
}
@VisibleForTesting @VisibleForTesting
Map<String, String> getEnvironmentToRunCommand() { Map<String, String> getEnvironmentToRunCommand() {
return environment; return environment;

View File

@ -18,6 +18,7 @@
package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu; package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
@ -40,11 +41,14 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin
import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerRuntimeConstants; import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerRuntimeConstants;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService;
import org.junit.After;
import org.apache.hadoop.yarn.util.resource.TestResourceUtils; import org.apache.hadoop.yarn.util.resource.TestResourceUtils;
import org.junit.Assert; import org.junit.Assert;
import org.junit.Before; import org.junit.Before;
import org.junit.Test; import org.junit.Test;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
@ -72,9 +76,42 @@ public class TestGpuResourceHandler {
private NMStateStoreService mockNMStateStore; private NMStateStoreService mockNMStateStore;
private ConcurrentHashMap<ContainerId, Container> runningContainersMap; private ConcurrentHashMap<ContainerId, Container> runningContainersMap;
private GpuDiscoverer gpuDiscoverer; private GpuDiscoverer gpuDiscoverer;
private File testDataDirectory;
/**
 * Recreates an empty test data directory: any existing directory at the
 * test parent location is deleted recursively and then created again.
 *
 * @throws IOException if the old directory cannot be deleted or the new
 *         one cannot be created
 */
public void createTestDataDirectory() throws IOException {
  String testDirectoryPath = getTestParentDirectory();
  testDataDirectory = new File(testDirectoryPath);
  FileUtils.deleteDirectory(testDataDirectory);
  // mkdirs() reports failure only through its return value; surface it
  // here instead of failing later with a less obvious error.
  if (!testDataDirectory.mkdirs() && !testDataDirectory.isDirectory()) {
    throw new IOException("Failed to create test data directory: "
        + testDataDirectory.getAbsolutePath());
  }
}
/**
 * @return absolute path of the per-class scratch directory, located at
 *         {@code target/temp/<fully qualified test class name>} relative
 *         to the working directory
 */
private String getTestParentDirectory() {
  final String relativePath =
      "target/temp/" + TestGpuResourceHandler.class.getName();
  return new File(relativePath).getAbsolutePath();
}
/**
 * Creates the given file (or truncates it if it already exists) by
 * opening an output stream on it and closing it right away.
 *
 * @param f file to create
 * @throws IOException if the file cannot be opened or closed
 */
private void touchFile(File f) throws IOException {
  FileOutputStream stream = new FileOutputStream(f);
  stream.close();
}
/**
 * Builds a fresh {@link YarnConfiguration} whose
 * {@link YarnConfiguration#NM_GPU_PATH_TO_EXEC} points at a newly created
 * fake discovery binary.
 *
 * @return the prepared configuration
 * @throws IOException if the fake binary cannot be created
 */
private Configuration createDefaultConfig() throws IOException {
  Configuration yarnConf = new YarnConfiguration();
  yarnConf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC,
      setupFakeGpuDiscoveryBinary().getAbsolutePath());
  return yarnConf;
}
/**
 * Creates an empty file named {@code fake-nvidia-smi} in the test parent
 * directory to stand in for the GPU discovery executable.
 *
 * @return the created file
 * @throws IOException if the file cannot be created
 */
private File setupFakeGpuDiscoveryBinary() throws IOException {
  final String binaryLocation = getTestParentDirectory() + "/fake-nvidia-smi";
  final File binary = new File(binaryLocation);
  touchFile(binary);
  return binary;
}
@Before @Before
public void setup() { public void setup() throws IOException {
createTestDataDirectory();
TestResourceUtils.addNewTypesToResources(ResourceInformation.GPU_URI); TestResourceUtils.addNewTypesToResources(ResourceInformation.GPU_URI);
mockCGroupsHandler = mock(CGroupsHandler.class); mockCGroupsHandler = mock(CGroupsHandler.class);
@ -94,9 +131,14 @@ public class TestGpuResourceHandler {
mockPrivilegedExecutor, gpuDiscoverer); mockPrivilegedExecutor, gpuDiscoverer);
} }
/**
 * Removes the test data directory, including everything created in it,
 * after each test.
 *
 * @throws IOException if the directory cannot be deleted
 */
@After
public void cleanupTestFiles() throws IOException {
  final File directoryToRemove = testDataDirectory;
  FileUtils.deleteDirectory(directoryToRemove);
}
@Test @Test
public void testBootStrap() throws Exception { public void testBootStrap() throws Exception {
Configuration conf = new YarnConfiguration(); Configuration conf = createDefaultConfig();
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0"); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0");
gpuDiscoverer.initialize(conf); gpuDiscoverer.initialize(conf);
@ -161,7 +203,7 @@ public class TestGpuResourceHandler {
private void commonTestAllocation(boolean dockerContainerEnabled) private void commonTestAllocation(boolean dockerContainerEnabled)
throws Exception { throws Exception {
Configuration conf = new YarnConfiguration(); Configuration conf = createDefaultConfig();
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4"); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
gpuDiscoverer.initialize(conf); gpuDiscoverer.initialize(conf);
@ -250,7 +292,7 @@ public class TestGpuResourceHandler {
@Test @Test
public void testAssignedGpuWillBeCleanedupWhenStoreOpFails() public void testAssignedGpuWillBeCleanedupWhenStoreOpFails()
throws Exception { throws Exception {
Configuration conf = new YarnConfiguration(); Configuration conf = createDefaultConfig();
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4"); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
gpuDiscoverer.initialize(conf); gpuDiscoverer.initialize(conf);
@ -279,7 +321,7 @@ public class TestGpuResourceHandler {
@Test @Test
public void testAllocationWithoutAllowedGpus() throws Exception { public void testAllocationWithoutAllowedGpus() throws Exception {
Configuration conf = new YarnConfiguration(); Configuration conf = createDefaultConfig();
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, " "); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, " ");
gpuDiscoverer.initialize(conf); gpuDiscoverer.initialize(conf);
@ -314,7 +356,7 @@ public class TestGpuResourceHandler {
@Test @Test
public void testAllocationStored() throws Exception { public void testAllocationStored() throws Exception {
Configuration conf = new YarnConfiguration(); Configuration conf = createDefaultConfig();
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4"); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
gpuDiscoverer.initialize(conf); gpuDiscoverer.initialize(conf);
@ -353,7 +395,7 @@ public class TestGpuResourceHandler {
public void testAllocationStoredWithNULLStateStore() throws Exception { public void testAllocationStoredWithNULLStateStore() throws Exception {
NMNullStateStoreService mockNMNULLStateStore = mock(NMNullStateStoreService.class); NMNullStateStoreService mockNMNULLStateStore = mock(NMNullStateStoreService.class);
Configuration conf = new YarnConfiguration(); Configuration conf = createDefaultConfig();
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4"); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
Context nmnctx = mock(Context.class); Context nmnctx = mock(Context.class);
@ -382,7 +424,7 @@ public class TestGpuResourceHandler {
@Test @Test
public void testRecoverResourceAllocation() throws Exception { public void testRecoverResourceAllocation() throws Exception {
Configuration conf = new YarnConfiguration(); Configuration conf = createDefaultConfig();
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4"); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
gpuDiscoverer.initialize(conf); gpuDiscoverer.initialize(conf);

View File

@ -38,6 +38,7 @@ import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull; import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue; import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
public class TestGpuDiscoverer { public class TestGpuDiscoverer {
@Rule @Rule
@ -52,6 +53,19 @@ public class TestGpuDiscoverer {
new FileOutputStream(f).close(); new FileOutputStream(f).close();
} }
/**
 * Creates an empty file named {@code GpuDiscoverer.DEFAULT_BINARY_NAME} in
 * the test parent folder and sets
 * {@link YarnConfiguration#NM_GPU_PATH_TO_EXEC} in the given configuration
 * to that folder (the directory, not the file itself).
 *
 * @param conf configuration to update
 * @return the created fake binary
 * @throws RuntimeException wrapping any failure during the setup
 */
private File setupFakeBinary(Configuration conf) {
  try {
    final File binary = new File(getTestParentFolder(),
        GpuDiscoverer.DEFAULT_BINARY_NAME);
    touchFile(binary);
    conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, getTestParentFolder());
    return binary;
  } catch (Exception e) {
    throw new RuntimeException("Failed to init fake binary", e);
  }
}
@Before @Before
public void before() throws IOException { public void before() throws IOException {
String folder = getTestParentFolder(); String folder = getTestParentFolder();
@ -63,6 +77,7 @@ public class TestGpuDiscoverer {
private Configuration createConfigWithAllowedDevices(String s) { private Configuration createConfigWithAllowedDevices(String s) {
Configuration conf = new Configuration(false); Configuration conf = new Configuration(false);
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, s); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, s);
setupFakeBinary(conf);
return conf; return conf;
} }
@ -83,10 +98,7 @@ public class TestGpuDiscoverer {
plugin.getEnvironmentToRunCommand().get("PATH").contains("nvidia")); plugin.getEnvironmentToRunCommand().get("PATH").contains("nvidia"));
// test case 2, check mandatory set path. // test case 2, check mandatory set path.
File fakeBinary = new File(getTestParentFolder(), File fakeBinary = setupFakeBinary(conf);
GpuDiscoverer.DEFAULT_BINARY_NAME);
touchFile(fakeBinary);
conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, getTestParentFolder());
plugin = new GpuDiscoverer(); plugin = new GpuDiscoverer();
plugin.initialize(conf); plugin.initialize(conf);
assertEquals(fakeBinary.getAbsolutePath(), assertEquals(fakeBinary.getAbsolutePath(),
@ -276,4 +288,22 @@ public class TestGpuDiscoverer {
plugin.initialize(conf); plugin.initialize(conf);
plugin.getGpusUsableByYarn(); plugin.getGpusUsableByYarn();
} }
/**
 * Verifies that pointing {@link YarnConfiguration#NM_GPU_PATH_TO_EXEC} at
 * a path that does not exist makes the discoverer fail with a
 * {@link YarnException} that mentions the default-directory lookup.
 *
 * NOTE(review): the expected message is only produced when the default
 * search directories do not contain the discovery binary either, so this
 * presumably relies on the test host having no such binary installed
 * there - confirm for GPU-equipped build machines.
 */
@Test
public void testGpuBinaryIsANotExistingFile() {
  Configuration conf = new Configuration(false);
  conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, "/blabla");
  GpuDiscoverer plugin = new GpuDiscoverer();
  try {
    plugin.initialize(conf);
    plugin.getGpusUsableByYarn();
    fail("Expected a YarnException because the configured GPU "
        + "discovery binary does not exist.");
  } catch (YarnException e) {
    String message = e.getMessage();
    assertTrue(message.startsWith("Failed to find GPU discovery " +
        "executable, please double check"));
    assertTrue(message.contains("Also tried to find the " +
        "executable in the default directories:"));
  }
}
} }