From d6377c8b681d92252363a4de263771b2c77ca404 Mon Sep 17 00:00:00 2001 From: Sunil G Date: Fri, 22 Feb 2019 20:22:17 +0530 Subject: [PATCH] YARN-9118. Handle exceptions with parsing user defined GPU devices in GpuDiscoverer. Contributed by Szilard Nemeth. (cherry picked from commit 95fbbfed75dd309b5d56032ece64996165572287) --- .../resources/gpu/GpuResourceHandlerImpl.java | 5 +- .../gpu/GpuDeviceSpecificationException.java | 82 +++++++ .../resourceplugin/gpu/GpuDiscoverer.java | 136 +++++++----- .../gpu/GpuNodeResourceUpdateHandler.java | 10 +- .../resourceplugin/gpu/package-info.java | 20 ++ .../resourceplugin/gpu/TestGpuDiscoverer.java | 204 +++++++++++++++--- 6 files changed, 366 insertions(+), 91 deletions(-) create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDeviceSpecificationException.java create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/package-info.java diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java index e466742c6eb..6d158be71a8 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java @@ -65,11 +65,10 @@ public class GpuResourceHandlerImpl implements ResourceHandler { throws ResourceHandlerException { List usableGpus; try { - usableGpus = GpuDiscoverer.getInstance() - .getGpusUsableByYarn(); + usableGpus = GpuDiscoverer.getInstance().getGpusUsableByYarn(); if (usableGpus == null || usableGpus.isEmpty()) { String message = "GPU is enabled on the NodeManager, but couldn't find " - + "any usable GPU devices, please double check configuration."; + + "any usable GPU devices, please double check configuration!"; LOG.error(message); throw new ResourceHandlerException(message); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDeviceSpecificationException.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDeviceSpecificationException.java new file mode 100644 index 00000000000..9d61b91a1f2 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDeviceSpecificationException.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu; + +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.exceptions.YarnException; + +/** + * This exception is to be thrown when allowed GPU device specification + * is empty or invalid. + */ +public final class GpuDeviceSpecificationException extends YarnException { + private static final String VALID_FORMAT_MESSAGE = "The valid format " + + "should be: index:minor_number"; + + private GpuDeviceSpecificationException(String message) { + super(message); + } + + private GpuDeviceSpecificationException(String message, Exception cause) { + super(message, cause); + } + + public static GpuDeviceSpecificationException createWithEmptyValueSpecified() { + return new GpuDeviceSpecificationException( + YarnConfiguration.NM_GPU_ALLOWED_DEVICES + + " is set to an empty value! Please specify " + + YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES + + " to enable auto-discovery or " + + "please enter the GPU device IDs manually! " + + VALID_FORMAT_MESSAGE); + } + + public static GpuDeviceSpecificationException createWithWrongValueSpecified( + String device, String configValue, Exception cause) { + final String message = createIllegalFormatMessage(device, configValue); + return new GpuDeviceSpecificationException(message, cause); + } + + public static GpuDeviceSpecificationException createWithWrongValueSpecified( + String device, String configValue) { + final String message = createIllegalFormatMessage(device, configValue); + return new GpuDeviceSpecificationException(message); + } + + public static GpuDeviceSpecificationException createWithDuplicateValueSpecified( + String device, String configValue) { + final String message = createDuplicateFormatMessage(device, configValue); + return new GpuDeviceSpecificationException(message); + } + + private static String createIllegalFormatMessage(String device, + String configValue) { + return String.format("Illegal format of individual GPU device: %s, " + + "the whole config value was: '%s'! " + VALID_FORMAT_MESSAGE, + device, configValue); + } + + private static String createDuplicateFormatMessage(String device, + String configValue) { + return String.format("GPU device %s" + + " has a duplicate definition! " + + "Please double-check the configuration " + + YarnConfiguration.NM_GPU_ALLOWED_DEVICES + + "! Current value of the configuration is: %s", + device, configValue); + } +} \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java index 6e3cf1315ce..92792b7ba64 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java @@ -19,8 +19,8 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu; import com.google.common.annotations.VisibleForTesting; -import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Lists; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; @@ -47,7 +47,7 @@ public class GpuDiscoverer { public static final Logger LOG = LoggerFactory.getLogger( GpuDiscoverer.class); @VisibleForTesting - protected static final String DEFAULT_BINARY_NAME = "nvidia-smi"; + static final String DEFAULT_BINARY_NAME = "nvidia-smi"; // When executable path not set, try to search default dirs // By default search /usr/bin, /bin, and /usr/local/nvidia/bin (when @@ -70,7 +70,7 @@ public class GpuDiscoverer { private GpuDeviceInformationParser parser = new GpuDeviceInformationParser(); private int numOfErrorExecutionSinceLastSucceed = 0; - GpuDeviceInformation lastDiscoveredGpuInformation = null; + private GpuDeviceInformation lastDiscoveredGpuInformation = null; private void validateConfOrThrowException() throws YarnException { if (conf == null) { @@ -89,7 +89,7 @@ public class GpuDiscoverer { * @return GpuDeviceInformation * @throws YarnException when any error happens */ - public synchronized GpuDeviceInformation getGpuDeviceInformation() + synchronized GpuDeviceInformation getGpuDeviceInformation() throws YarnException { validateConfOrThrowException(); @@ -112,10 +112,9 @@ public class GpuDiscoverer { try { output = Shell.execCommand(environment, new String[] { pathOfGpuBinary, "-x", "-q" }, MAX_EXEC_TIMEOUT_MS); - GpuDeviceInformation info = parser.parseXml(output); + lastDiscoveredGpuInformation = parser.parseXml(output); numOfErrorExecutionSinceLastSucceed = 0; - lastDiscoveredGpuInformation = info; - return info; + return lastDiscoveredGpuInformation; } catch (IOException e) { numOfErrorExecutionSinceLastSucceed++; String msg = @@ -149,52 +148,91 @@ public class GpuDiscoverer { YarnConfiguration.NM_GPU_ALLOWED_DEVICES, YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES); - List gpuDevices = new ArrayList<>(); - if (allowedDevicesStr.equals( YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES)) { - // Get gpu device information from system. - if (null == lastDiscoveredGpuInformation) { - String msg = YarnConfiguration.NM_GPU_ALLOWED_DEVICES + " is set to " - + YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES - + ", however automatically discovering " - + "GPU information failed, please check NodeManager log for more" - + " details, as an alternative, admin can specify " - + YarnConfiguration.NM_GPU_ALLOWED_DEVICES - + " manually to enable GPU isolation."; - LOG.error(msg); - throw new YarnException(msg); - } - - if (lastDiscoveredGpuInformation.getGpus() != null) { - for (int i = 0; i < lastDiscoveredGpuInformation.getGpus().size(); - i++) { - List gpuInfos = - lastDiscoveredGpuInformation.getGpus(); - gpuDevices.add(new GpuDevice(i, gpuInfos.get(i).getMinorNumber())); - } - } - } else{ - for (String s : allowedDevicesStr.split(",")) { - if (s.trim().length() > 0) { - String[] kv = s.trim().split(":"); - if (kv.length != 2) { - throw new YarnException( - "Illegal format, it should be index:minor_number format, now it=" - + s); - } - - gpuDevices.add( - new GpuDevice(Integer.parseInt(kv[0]), Integer.parseInt(kv[1]))); - } - } - LOG.info("Allowed GPU devices:" + gpuDevices); + return parseGpuDevicesFromAutoDiscoveredGpuInfo(); + } else { + return parseGpuDevicesFromUserDefinedValues(allowedDevicesStr); } + } + + private List parseGpuDevicesFromAutoDiscoveredGpuInfo() + throws YarnException { + if (lastDiscoveredGpuInformation == null) { + String msg = YarnConfiguration.NM_GPU_ALLOWED_DEVICES + " is set to " + + YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES + + ", however automatically discovering " + + "GPU information failed, please check NodeManager log for more" + + " details, as an alternative, admin can specify " + + YarnConfiguration.NM_GPU_ALLOWED_DEVICES + + " manually to enable GPU isolation."; + LOG.error(msg); + throw new YarnException(msg); + } + + List gpuDevices = new ArrayList<>(); + if (lastDiscoveredGpuInformation.getGpus() != null) { + int numberOfGpus = lastDiscoveredGpuInformation.getGpus().size(); + LOG.debug("Found {} GPU devices", numberOfGpus); + for (int i = 0; i < numberOfGpus; i++) { + List gpuInfos = + lastDiscoveredGpuInformation.getGpus(); + gpuDevices.add(new GpuDevice(i, gpuInfos.get(i).getMinorNumber())); + } + } + return gpuDevices; + } + + /** + * @param devices allowed devices coming from the config. + * Individual devices should be separated by commas. + *
The format of individual devices should be: + * <index:><minorNumber> + * @return List of GpuDevices + * @throws YarnException when a GPU device is defined as a duplicate. + * The first duplicate GPU device will be added to the exception message. + */ + private List parseGpuDevicesFromUserDefinedValues(String devices) + throws YarnException { + if (devices.trim().isEmpty()) { + throw GpuDeviceSpecificationException.createWithEmptyValueSpecified(); + } + List gpuDevices = Lists.newArrayList(); + for (String device : devices.split(",")) { + if (device.trim().length() > 0) { + String[] splitByColon = device.trim().split(":"); + if (splitByColon.length != 2) { + throw GpuDeviceSpecificationException. + createWithWrongValueSpecified(device, devices); + } + + GpuDevice gpuDevice = parseGpuDevice(device, splitByColon, devices); + if (!gpuDevices.contains(gpuDevice)) { + gpuDevices.add(gpuDevice); + } else { + throw GpuDeviceSpecificationException + .createWithDuplicateValueSpecified(device, devices); + } + } + } + LOG.info("Allowed GPU devices:" + gpuDevices); return gpuDevices; } - public synchronized void initialize(Configuration conf) throws YarnException { + private GpuDevice parseGpuDevice(String device, String[] splitByColon, + String allowedDevicesStr) throws YarnException { + try { + int index = Integer.parseInt(splitByColon[0]); + int minorNumber = Integer.parseInt(splitByColon[1]); + return new GpuDevice(index, minorNumber); + } catch (NumberFormatException e) { + throw GpuDeviceSpecificationException. + createWithWrongValueSpecified(device, allowedDevicesStr, e); + } + } + + public synchronized void initialize(Configuration conf) { this.conf = conf; numOfErrorExecutionSinceLastSucceed = 0; String pathToExecutable = conf.get(YarnConfiguration.NM_GPU_PATH_TO_EXEC, @@ -203,9 +241,7 @@ public class GpuDiscoverer { pathToExecutable = DEFAULT_BINARY_NAME; } - // Validate file existence File binaryPath = new File(pathToExecutable); - if (!binaryPath.exists()) { // When binary not exist, use default setting. boolean found = false; @@ -249,12 +285,12 @@ public class GpuDiscoverer { } @VisibleForTesting - protected Map getEnvironmentToRunCommand() { + Map getEnvironmentToRunCommand() { return environment; } @VisibleForTesting - protected String getPathOfGpuBinary() { + String getPathOfGpuBinary() { return pathOfGpuBinary; } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java index 796eb25b431..8b19048c8be 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java @@ -40,11 +40,11 @@ public class GpuNodeResourceUpdateHandler extends NodeResourceUpdaterPlugin { public void updateConfiguredResource(Resource res) throws YarnException { LOG.info("Initializing configured GPU resources for the NodeManager."); - List usableGpus = - GpuDiscoverer.getInstance().getGpusUsableByYarn(); - if (null == usableGpus || usableGpus.isEmpty()) { - String message = "GPU is enabled, but couldn't find any usable GPUs on the " - + "NodeManager."; + List usableGpus = GpuDiscoverer.getInstance() + .getGpusUsableByYarn(); + if (usableGpus == null || usableGpus.isEmpty()) { + String message = "GPU is enabled, " + + "but couldn't find any usable GPUs on the NodeManager!"; LOG.error(message); // No gpu can be used by YARN. throw new YarnException(message); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/package-info.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/package-info.java new file mode 100644 index 00000000000..81f155f7b6e --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/package-info.java @@ -0,0 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Package for GPU support classes. + */ +package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java index 4abb633a69a..cbbfded374d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java @@ -23,17 +23,26 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation; -import org.junit.Assert; import org.junit.Assume; import org.junit.Before; +import org.junit.Rule; import org.junit.Test; +import org.junit.rules.ExpectedException; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.util.List; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; + public class TestGpuDiscoverer { + @Rule + public ExpectedException exception = ExpectedException.none(); + private String getTestParentFolder() { File f = new File("target/temp/" + TestGpuDiscoverer.class.getName()); return f.getAbsolutePath(); @@ -51,6 +60,12 @@ public class TestGpuDiscoverer { f.mkdirs(); } + private Configuration createConfigWithAllowedDevices(String s) { + Configuration conf = new Configuration(false); + conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, s); + return conf; + } + @Test public void testLinuxGpuResourceDiscoverPluginConfig() throws Exception { // Only run this on demand. @@ -61,10 +76,10 @@ public class TestGpuDiscoverer { Configuration conf = new Configuration(false); GpuDiscoverer plugin = new GpuDiscoverer(); plugin.initialize(conf); - Assert.assertEquals(GpuDiscoverer.DEFAULT_BINARY_NAME, + assertEquals(GpuDiscoverer.DEFAULT_BINARY_NAME, plugin.getPathOfGpuBinary()); - Assert.assertNotNull(plugin.getEnvironmentToRunCommand().get("PATH")); - Assert.assertTrue( + assertNotNull(plugin.getEnvironmentToRunCommand().get("PATH")); + assertTrue( plugin.getEnvironmentToRunCommand().get("PATH").contains("nvidia")); // test case 2, check mandatory set path. @@ -74,18 +89,18 @@ public class TestGpuDiscoverer { conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, getTestParentFolder()); plugin = new GpuDiscoverer(); plugin.initialize(conf); - Assert.assertEquals(fakeBinary.getAbsolutePath(), + assertEquals(fakeBinary.getAbsolutePath(), plugin.getPathOfGpuBinary()); - Assert.assertNull(plugin.getEnvironmentToRunCommand().get("PATH")); + assertNull(plugin.getEnvironmentToRunCommand().get("PATH")); // test case 3, check mandatory set path, but binary doesn't exist so default // path will be used. fakeBinary.delete(); plugin = new GpuDiscoverer(); plugin.initialize(conf); - Assert.assertEquals(GpuDiscoverer.DEFAULT_BINARY_NAME, + assertEquals(GpuDiscoverer.DEFAULT_BINARY_NAME, plugin.getPathOfGpuBinary()); - Assert.assertTrue( + assertTrue( plugin.getEnvironmentToRunCommand().get("PATH").contains("nvidia")); } @@ -100,42 +115,165 @@ public class TestGpuDiscoverer { plugin.initialize(conf); GpuDeviceInformation info = plugin.getGpuDeviceInformation(); - Assert.assertTrue(info.getGpus().size() > 0); - Assert.assertEquals(plugin.getGpusUsableByYarn().size(), + assertTrue(info.getGpus().size() > 0); + assertEquals(plugin.getGpusUsableByYarn().size(), info.getGpus().size()); } @Test - public void getNumberOfUsableGpusFromConfig() throws YarnException { - Configuration conf = new Configuration(false); + public void testGetNumberOfUsableGpusFromConfigSingleDevice() + throws YarnException { + Configuration conf = createConfigWithAllowedDevices("1:2"); - // Illegal format - conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:2,3"); GpuDiscoverer plugin = new GpuDiscoverer(); - try { - plugin.initialize(conf); - plugin.getGpusUsableByYarn(); - Assert.fail("Illegal format, should fail."); - } catch (YarnException e) { - // Expected - } + plugin.initialize(conf); + List usableGpuDevices = plugin.getGpusUsableByYarn(); + assertEquals(1, usableGpuDevices.size()); - // Valid format - conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:2,3:4"); - plugin = new GpuDiscoverer(); + assertEquals(1, usableGpuDevices.get(0).getIndex()); + assertEquals(2, usableGpuDevices.get(0).getMinorNumber()); + } + + @Test + public void testGetNumberOfUsableGpusFromConfigIllegalFormat() + throws YarnException { + Configuration conf = createConfigWithAllowedDevices("0:0,1:1,2:2,3"); + + exception.expect(GpuDeviceSpecificationException.class); + GpuDiscoverer plugin = new GpuDiscoverer(); + plugin.initialize(conf); + plugin.getGpusUsableByYarn(); + } + + @Test + public void testGetNumberOfUsableGpusFromConfig() throws YarnException { + Configuration conf = createConfigWithAllowedDevices("0:0,1:1,2:2,3:4"); + GpuDiscoverer plugin = new GpuDiscoverer(); plugin.initialize(conf); List usableGpuDevices = plugin.getGpusUsableByYarn(); - Assert.assertEquals(4, usableGpuDevices.size()); + assertEquals(4, usableGpuDevices.size()); - Assert.assertTrue(0 == usableGpuDevices.get(0).getIndex()); - Assert.assertTrue(1 == usableGpuDevices.get(1).getIndex()); - Assert.assertTrue(2 == usableGpuDevices.get(2).getIndex()); - Assert.assertTrue(3 == usableGpuDevices.get(3).getIndex()); + assertEquals(0, usableGpuDevices.get(0).getIndex()); + assertEquals(0, usableGpuDevices.get(0).getMinorNumber()); - Assert.assertTrue(0 == usableGpuDevices.get(0).getMinorNumber()); - Assert.assertTrue(1 == usableGpuDevices.get(1).getMinorNumber()); - Assert.assertTrue(2 == usableGpuDevices.get(2).getMinorNumber()); - Assert.assertTrue(4 == usableGpuDevices.get(3).getMinorNumber()); + assertEquals(1, usableGpuDevices.get(1).getIndex()); + assertEquals(1, usableGpuDevices.get(1).getMinorNumber()); + + assertEquals(2, usableGpuDevices.get(2).getIndex()); + assertEquals(2, usableGpuDevices.get(2).getMinorNumber()); + + assertEquals(3, usableGpuDevices.get(3).getIndex()); + assertEquals(4, usableGpuDevices.get(3).getMinorNumber()); + } + + @Test + public void testGetNumberOfUsableGpusFromConfigDuplicateValues() + throws YarnException { + Configuration conf = createConfigWithAllowedDevices("0:0,1:1,2:2,1:1"); + + exception.expect(GpuDeviceSpecificationException.class); + GpuDiscoverer plugin = new GpuDiscoverer(); + plugin.initialize(conf); + plugin.getGpusUsableByYarn(); + } + + @Test + public void testGetNumberOfUsableGpusFromConfigDuplicateValues2() + throws YarnException { + Configuration conf = createConfigWithAllowedDevices("0:0,1:1,2:2,1:1,2:2"); + + exception.expect(GpuDeviceSpecificationException.class); + GpuDiscoverer plugin = new GpuDiscoverer(); + plugin.initialize(conf); + plugin.getGpusUsableByYarn(); + } + + @Test + public void testGetNumberOfUsableGpusFromConfigIncludingSpaces() + throws YarnException { + Configuration conf = createConfigWithAllowedDevices("0 : 0,1 : 1"); + + exception.expect(GpuDeviceSpecificationException.class); + GpuDiscoverer plugin = new GpuDiscoverer(); + plugin.initialize(conf); + plugin.getGpusUsableByYarn(); + } + + @Test + public void testGetNumberOfUsableGpusFromConfigIncludingGibberish() + throws YarnException { + Configuration conf = createConfigWithAllowedDevices("0:@$1,1:1"); + + exception.expect(GpuDeviceSpecificationException.class); + GpuDiscoverer plugin = new GpuDiscoverer(); + plugin.initialize(conf); + plugin.getGpusUsableByYarn(); + } + + @Test + public void testGetNumberOfUsableGpusFromConfigIncludingLetters() + throws YarnException { + Configuration conf = createConfigWithAllowedDevices("x:0, 1:y"); + + exception.expect(GpuDeviceSpecificationException.class); + GpuDiscoverer plugin = new GpuDiscoverer(); + plugin.initialize(conf); + plugin.getGpusUsableByYarn(); + } + + @Test + public void testGetNumberOfUsableGpusFromConfigWithoutIndexNumber() + throws YarnException { + Configuration conf = createConfigWithAllowedDevices(":0, :1"); + + exception.expect(GpuDeviceSpecificationException.class); + GpuDiscoverer plugin = new GpuDiscoverer(); + plugin.initialize(conf); + plugin.getGpusUsableByYarn(); + } + + @Test + public void testGetNumberOfUsableGpusFromConfigEmptyString() + throws YarnException { + Configuration conf = createConfigWithAllowedDevices(""); + + exception.expect(GpuDeviceSpecificationException.class); + GpuDiscoverer plugin = new GpuDiscoverer(); + plugin.initialize(conf); + plugin.getGpusUsableByYarn(); + } + + @Test + public void testGetNumberOfUsableGpusFromConfigValueWithoutComma() + throws YarnException { + Configuration conf = createConfigWithAllowedDevices("0:0 0:1"); + + exception.expect(GpuDeviceSpecificationException.class); + GpuDiscoverer plugin = new GpuDiscoverer(); + plugin.initialize(conf); + plugin.getGpusUsableByYarn(); + } + + @Test + public void testGetNumberOfUsableGpusFromConfigValueWithoutComma2() + throws YarnException { + Configuration conf = createConfigWithAllowedDevices("0.1 0.2"); + + exception.expect(GpuDeviceSpecificationException.class); + GpuDiscoverer plugin = new GpuDiscoverer(); + plugin.initialize(conf); + plugin.getGpusUsableByYarn(); + } + + @Test + public void testGetNumberOfUsableGpusFromConfigValueWithoutColonSeparator() + throws YarnException { + Configuration conf = createConfigWithAllowedDevices("0.1,0.2"); + + exception.expect(GpuDeviceSpecificationException.class); + GpuDiscoverer plugin = new GpuDiscoverer(); + plugin.initialize(conf); + plugin.getGpusUsableByYarn(); } }