diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceAllocator.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceAllocator.java index 2496ac851ce..274e0f175dc 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceAllocator.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceAllocator.java @@ -19,6 +19,8 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu; import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Sets; import org.apache.commons.logging.Log; @@ -38,18 +40,17 @@ import java.io.IOException; import java.io.Serializable; import java.util.ArrayList; import java.util.Collections; -import java.util.HashMap; -import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; +import java.util.stream.Collectors; import static org.apache.hadoop.yarn.api.records.ResourceInformation.GPU_URI; /** - * Allocate GPU resources according to requirements + * Allocate GPU resources according to requirements. */ public class GpuResourceAllocator { final static Log LOG = LogFactory.getLog(GpuResourceAllocator.class); @@ -58,13 +59,23 @@ public class GpuResourceAllocator { private Set allowedGpuDevices = new TreeSet<>(); private Map usedDevices = new TreeMap<>(); private Context nmContext; + private final int waitPeriodForResource; public GpuResourceAllocator(Context ctx) { this.nmContext = ctx; + // Wait for a maximum of 120 seconds if no available GPU are there which + // are yet to be released. + this.waitPeriodForResource = 120 * WAIT_MS_PER_LOOP; + } + + @VisibleForTesting + GpuResourceAllocator(Context ctx, int waitPeriodForResource) { + this.nmContext = ctx; + this.waitPeriodForResource = waitPeriodForResource; } /** - * Contains allowed and denied devices + * Contains allowed and denied devices. * Denied devices will be useful for cgroups devices module to do blacklisting */ static class GpuAllocation { @@ -90,20 +101,13 @@ public class GpuResourceAllocator { } /** - * Add GPU to allowed list + * Add GPU to the allowed list of GPUs. * @param gpuDevice gpu device */ public synchronized void addGpu(GpuDevice gpuDevice) { allowedGpuDevices.add(gpuDevice); } - private String getResourceHandlerExceptionMessage(int numRequestedGpuDevices, - ContainerId containerId) { - return "Failed to find enough GPUs, requestor=" + containerId - + ", #RequestedGPUs=" + numRequestedGpuDevices + ", #availableGpus=" - + getAvailableGpus(); - } - @VisibleForTesting public synchronized int getAvailableGpus() { return allowedGpuDevices.size() - usedDevices.size(); @@ -112,10 +116,10 @@ public class GpuResourceAllocator { public synchronized void recoverAssignedGpus(ContainerId containerId) throws ResourceHandlerException { Container c = nmContext.getContainers().get(containerId); - if (null == c) { + if (c == null) { throw new ResourceHandlerException( - "This shouldn't happen, cannot find container with id=" - + containerId); + "Cannot find container with id=" + containerId + + ", this should not occur under normal circumstances!"); } for (Serializable gpuDeviceSerializable : c.getResourceMappings() @@ -123,7 +127,8 @@ public class GpuResourceAllocator { if (!(gpuDeviceSerializable instanceof GpuDevice)) { throw new ResourceHandlerException( "Trying to recover device id, however it" - + " is not GpuDevice, this shouldn't happen"); + + " is not an instance of " + GpuDevice.class.getName() + + ", this should not occur under normal circumstances!"); } GpuDevice gpuDevice = (GpuDevice) gpuDeviceSerializable; @@ -132,8 +137,8 @@ public class GpuResourceAllocator { if (!allowedGpuDevices.contains(gpuDevice)) { throw new ResourceHandlerException( "Try to recover device = " + gpuDevice - + " however it is not in allowed device list:" + StringUtils - .join(",", allowedGpuDevices)); + + " however it is not in the allowed device list:" + + StringUtils.join(",", allowedGpuDevices)); } // Make sure it is not occupied by anybody else @@ -163,7 +168,7 @@ public class GpuResourceAllocator { } /** - * Assign GPU to requestor + * Assign GPU to the specified container. * @param container container to allocate * @return allocation results. * @throws ResourceHandlerException When failed to assign GPUs. @@ -172,12 +177,11 @@ public class GpuResourceAllocator { throws ResourceHandlerException { GpuAllocation allocation = internalAssignGpus(container); - // Wait for a maximum of 120 seconds if no available GPU are there which - // are yet to be released. - final int timeoutMsecs = 120 * WAIT_MS_PER_LOOP; + // Wait for a maximum of waitPeriodForResource seconds if no + // available GPU are there which are yet to be released. int timeWaiting = 0; while (allocation == null) { - if (timeWaiting >= timeoutMsecs) { + if (timeWaiting >= waitPeriodForResource) { break; } @@ -191,6 +195,8 @@ public class GpuResourceAllocator { allocation = internalAssignGpus(container); } catch (InterruptedException e) { // On any interrupt, break the loop and continue execution. + Thread.currentThread().interrupt(); + LOG.warn("Interrupted while waiting for available GPU"); break; } } @@ -210,8 +216,15 @@ public class GpuResourceAllocator { Resource requestedResource = container.getResource(); ContainerId containerId = container.getContainerId(); int numRequestedGpuDevices = getRequestedGpus(requestedResource); - // Assign Gpus to container if requested some. + + // Assign GPUs to container if requested some. if (numRequestedGpuDevices > 0) { + if (LOG.isDebugEnabled()) { + LOG.debug(String.format("Trying to assign %d GPUs to container: %s" + + ", #AvailableGPUs=%d, #ReleasingGPUs=%d", + numRequestedGpuDevices, containerId, + getAvailableGpus(), getReleasingGpus())); + } if (numRequestedGpuDevices > getAvailableGpus()) { // If there are some devices which are getting released, wait for few // seconds to get it. @@ -222,8 +235,9 @@ public class GpuResourceAllocator { if (numRequestedGpuDevices > getAvailableGpus()) { throw new ResourceHandlerException( - getResourceHandlerExceptionMessage(numRequestedGpuDevices, - containerId)); + "Failed to find enough GPUs, requestor=" + containerId + + ", #RequestedGPUs=" + numRequestedGpuDevices + + ", #AvailableGPUs=" + getAvailableGpus()); } Set assignedGpus = new TreeSet<>(); @@ -245,7 +259,7 @@ public class GpuResourceAllocator { nmContext.getNMStateStore().storeAssignedResources(container, GPU_URI, new ArrayList<>(assignedGpus)); } catch (IOException e) { - cleanupAssignGpus(containerId); + unassignGpus(containerId); throw new ResourceHandlerException(e); } } @@ -271,35 +285,34 @@ public class GpuResourceAllocator { } /** - * Clean up all Gpus assigned to containerId + * Clean up all GPUs assigned to containerId. * @param containerId containerId */ - public synchronized void cleanupAssignGpus(ContainerId containerId) { - Iterator> iter = - usedDevices.entrySet().iterator(); - while (iter.hasNext()) { - if (iter.next().getValue().equals(containerId)) { - iter.remove(); - } + public synchronized void unassignGpus(ContainerId containerId) { + if (LOG.isDebugEnabled()) { + LOG.debug("Trying to unassign GPU device from container " + containerId); } + usedDevices.entrySet().removeIf(entry -> + entry.getValue().equals(containerId)); } @VisibleForTesting - public synchronized Map getDeviceAllocationMappingCopy() { - return new HashMap<>(usedDevices); + public synchronized Map getDeviceAllocationMapping() { + return ImmutableMap.copyOf(usedDevices); } - public synchronized List getAllowedGpusCopy() { - return new ArrayList<>(allowedGpuDevices); + public synchronized List getAllowedGpus() { + return ImmutableList.copyOf(allowedGpuDevices); } - public synchronized List getAssignedGpusCopy() { - List assigns = new ArrayList<>(); - for (Map.Entry entry : usedDevices.entrySet()) { - assigns.add(new AssignedGpuDevice(entry.getKey().getIndex(), - entry.getKey().getMinorNumber(), entry.getValue())); - } - return assigns; + public synchronized List getAssignedGpus() { + return usedDevices.entrySet().stream() + .map(e -> { + final GpuDevice gpu = e.getKey(); + ContainerId containerId = e.getValue(); + return new AssignedGpuDevice(gpu.getIndex(), gpu.getMinorNumber(), + containerId); + }).collect(Collectors.toList()); } @Override diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java index 2c9baf23051..bcade9ead4e 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java @@ -177,7 +177,7 @@ public class GpuResourceHandlerImpl implements ResourceHandler { @Override public synchronized List postComplete( ContainerId containerId) throws ResourceHandlerException { - gpuAllocator.cleanupAssignGpus(containerId); + gpuAllocator.unassignGpus(containerId); cGroupsHandler.deleteCGroup(CGroupsHandler.CGroupController.DEVICES, containerId.toString()); return null; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java index 7719afb682c..2b06f31f37b 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java @@ -97,9 +97,9 @@ public class GpuResourcePlugin implements ResourcePlugin { GpuResourceAllocator gpuResourceAllocator = gpuResourceHandler.getGpuAllocator(); - List totalGpus = gpuResourceAllocator.getAllowedGpusCopy(); + List totalGpus = gpuResourceAllocator.getAllowedGpus(); List assignedGpuDevices = - gpuResourceAllocator.getAssignedGpusCopy(); + gpuResourceAllocator.getAssignedGpus(); return new NMGpuResourceInfo(gpuDeviceInformation, totalGpus, assignedGpuDevices); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceAllocator.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceAllocator.java new file mode 100644 index 00000000000..c7c65ec8769 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceAllocator.java @@ -0,0 +1,449 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu; + +import static org.apache.hadoop.yarn.api.records.ResourceInformation.GPU_URI; +import static org.junit.Assert.assertEquals; +import static org.mockito.Mockito.any; +import static org.mockito.Mockito.anyList; +import static org.mockito.Mockito.anyString; +import static org.mockito.Mockito.argThat; +import static org.mockito.Mockito.eq; +import static org.mockito.Mockito.doThrow; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.verifyZeroInteractions; +import static org.mockito.Mockito.when; + +import java.io.IOException; +import java.io.Serializable; +import java.util.HashSet; +import java.util.List; +import java.util.Random; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; + +import org.apache.hadoop.yarn.api.records.ContainerId; +import org.apache.hadoop.yarn.api.records.Resource; +import org.apache.hadoop.yarn.api.records.ResourceInformation; +import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu.GpuResourceAllocator.GpuAllocation; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDevice; +import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService; +import org.apache.hadoop.yarn.util.resource.TestResourceUtils; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.ExpectedException; +import org.mockito.ArgumentCaptor; +import org.mockito.ArgumentMatcher; +import org.mockito.Captor; +import org.mockito.Mock; +import org.mockito.MockitoAnnotations; + +import com.google.common.collect.Lists; + +/** + * Unit tests for GpuResourceAllocator. + */ +public class TestGpuResourceAllocator { + private static final int WAIT_PERIOD_FOR_RESOURCE = 100; + + private static class ContainerMatcher extends ArgumentMatcher { + + private Container container; + + ContainerMatcher(Container container) { + this.container = container; + } + + @Override + public boolean matches(Object o) { + if (!(o instanceof Container)) { + return false; + } + + Container other = (Container) o; + + long expectedId = container.getContainerId().getContainerId(); + long otherId = other.getContainerId().getContainerId(); + return expectedId == otherId; + } + } + + + @Captor + private ArgumentCaptor> gpuCaptor; + + @Mock + private NMContext nmContext; + + @Mock + private NMStateStoreService nmStateStore; + + @Rule + public ExpectedException exception = ExpectedException.none(); + + private GpuResourceAllocator testSubject; + + @Before + public void setup() { + TestResourceUtils.addNewTypesToResources(ResourceInformation.GPU_URI); + MockitoAnnotations.initMocks(this); + testSubject = createTestSubject(WAIT_PERIOD_FOR_RESOURCE); + } + + private GpuResourceAllocator createTestSubject(int waitPeriodForResource) { + when(nmContext.getNMStateStore()).thenReturn(nmStateStore); + when(nmContext.getContainers()).thenReturn(new ConcurrentHashMap<>()); + return new GpuResourceAllocator(nmContext, waitPeriodForResource); + } + + private Resource createGpuResourceRequest(int gpus) { + Resource res = Resource.newInstance(1024, 1); + + if (gpus > 0) { + res.setResourceValue(ResourceInformation.GPU_URI, gpus); + } + return res; + } + + private List createMockContainers(int gpus, + int numberOfContainers) { + final long id = 111L; + + List containers = Lists.newArrayList(); + for (int i = 0; i < numberOfContainers; i++) { + containers.add(createMockContainer(gpus, id + i)); + } + return containers; + } + + private Container createMockContainer(int gpus, long id) { + Resource res = createGpuResourceRequest(gpus); + ContainerId containerId = mock(ContainerId.class); + when(containerId.getContainerId()).thenReturn(id); + + Container container = mock(Container.class); + when(container.getResource()).thenReturn(res); + when(container.getContainerId()).thenReturn(containerId); + when(container.getContainerState()).thenReturn(ContainerState.RUNNING); + nmContext.getContainers().put(containerId, container); + + return container; + } + + private void createAndAddGpus(int numberOfGpus) { + for (int i = 0; i < numberOfGpus; i++) { + testSubject.addGpu(new GpuDevice(1, i)); + } + + assertEquals(0, testSubject.getDeviceAllocationMapping().size()); + assertEquals(0, testSubject.getAssignedGpus().size()); + assertEquals(numberOfGpus, testSubject.getAllowedGpus().size()); + assertEquals(numberOfGpus, testSubject.getAvailableGpus()); + } + + private void addGpus(GpuDevice... gpus) { + for (GpuDevice gpu : gpus) { + testSubject.addGpu(gpu); + } + assertEquals(0, testSubject.getDeviceAllocationMapping().size()); + assertEquals(0, testSubject.getAssignedGpus().size()); + assertEquals(gpus.length, testSubject.getAllowedGpus().size()); + assertEquals(gpus.length, testSubject.getAvailableGpus()); + } + + private void addGpusAndDontVerify(GpuDevice... gpus) { + for (GpuDevice gpu : gpus) { + testSubject.addGpu(gpu); + } + } + + private void setupContainerAsReleasingGpus(Container... releasingContainers) { + ContainerState[] finalStates = new ContainerState[] { + ContainerState.KILLING, ContainerState.DONE, + ContainerState.LOCALIZATION_FAILED, + ContainerState.CONTAINER_RESOURCES_CLEANINGUP, + ContainerState.CONTAINER_CLEANEDUP_AFTER_KILL, + ContainerState.EXITED_WITH_FAILURE, + ContainerState.EXITED_WITH_SUCCESS + }; + + final Random random = new Random(); + for (Container container : releasingContainers) { + ContainerState state = finalStates[random.nextInt(finalStates.length)]; + when(container.getContainerState()).thenReturn(state); + when(container.isContainerInFinalStates()).thenReturn(true); + } + } + + private void assertAllocatedGpu(GpuDevice expectedGpu, Container container, + GpuAllocation allocation) throws IOException { + assertEquals(1, allocation.getAllowedGPUs().size()); + assertEquals(0, allocation.getDeniedGPUs().size()); + + Set allowedGPUs = allocation.getAllowedGPUs(); + + GpuDevice allocatedGpu = allowedGPUs.iterator().next(); + assertEquals(expectedGpu, allocatedGpu); + assertAssignmentInStateStore(expectedGpu, container); + } + + private void assertAllocatedGpus(int gpus, int deniedGpus, + Container container, + GpuAllocation allocation) throws IOException { + assertEquals(gpus, allocation.getAllowedGPUs().size()); + assertEquals(deniedGpus, allocation.getDeniedGPUs().size()); + assertAssignmentInStateStore(gpus, container); + } + + private void assertNoAllocation(GpuAllocation allocation) { + assertEquals(1, allocation.getDeniedGPUs().size()); + assertEquals(0, allocation.getAllowedGPUs().size()); + verifyZeroInteractions(nmStateStore); + } + + private void assertAssignmentInStateStore(GpuDevice expectedGpu, + Container container) throws IOException { + verify(nmStateStore).storeAssignedResources( + argThat(new ContainerMatcher(container)), eq(GPU_URI), + gpuCaptor.capture()); + + List gpuList = gpuCaptor.getValue(); + assertEquals(1, gpuList.size()); + assertEquals(expectedGpu, gpuList.get(0)); + } + + private void assertAssignmentInStateStore(int gpus, + Container container) throws IOException { + verify(nmStateStore).storeAssignedResources( + argThat(new ContainerMatcher(container)), eq(GPU_URI), + gpuCaptor.capture()); + + List gpuList = gpuCaptor.getValue(); + assertEquals(gpus, gpuList.size()); + } + + private static Set findDuplicates( + List allocations) { + final Set result = new HashSet<>(); + final Set tmpSet = new HashSet<>(); + + for (GpuAllocation allocation : allocations) { + if (!tmpSet.add(allocation)) { + result.add(allocation); + } + } + return result; + } + + @Test + public void testNewGpuAllocatorHasEmptyCollectionOfDevices() { + assertEquals(0, testSubject.getDeviceAllocationMapping().size()); + assertEquals(0, testSubject.getAssignedGpus().size()); + assertEquals(0, testSubject.getAllowedGpus().size()); + assertEquals(0, testSubject.getAvailableGpus()); + } + + @Test + public void testAddOneDevice() { + addGpus(new GpuDevice(1, 1)); + assertEquals(0, testSubject.getDeviceAllocationMapping().size()); + assertEquals(0, testSubject.getAssignedGpus().size()); + } + + @Test + public void testAddMoreDevices() { + addGpus(new GpuDevice(1, 1), new GpuDevice(1, 2), new GpuDevice(1, 3)); + assertEquals(0, testSubject.getDeviceAllocationMapping().size()); + assertEquals(0, testSubject.getAssignedGpus().size()); + } + + @Test + public void testAddMoreDevicesWithSameData() { + addGpusAndDontVerify(new GpuDevice(1, 1), new GpuDevice(1, 1)); + assertEquals(0, testSubject.getDeviceAllocationMapping().size()); + assertEquals(0, testSubject.getAssignedGpus().size()); + assertEquals(1, testSubject.getAllowedGpus().size()); + assertEquals(1, testSubject.getAvailableGpus()); + } + + @Test + public void testRequestZeroGpu() throws ResourceHandlerException { + addGpus(new GpuDevice(1, 1)); + + Container container = createMockContainer(0, 5L); + GpuAllocation allocation = + testSubject.assignGpus(container); + + assertNoAllocation(allocation); + } + + @Test + public void testRequestOneGpu() throws ResourceHandlerException, IOException { + GpuDevice gpu = new GpuDevice(1, 1); + addGpus(gpu); + + Container container = createMockContainer(1, 5L); + GpuAllocation allocation = + testSubject.assignGpus(container); + + assertEquals(1, testSubject.getDeviceAllocationMapping().size()); + assertEquals(1, testSubject.getAssignedGpus().size()); + assertEquals(1, testSubject.getAllowedGpus().size()); + assertEquals(0, testSubject.getAvailableGpus()); + + assertAllocatedGpu(gpu, container, allocation); + } + + @Test + public void testRequestMoreThanAvailableGpu() + throws ResourceHandlerException { + addGpus(new GpuDevice(1, 1)); + Container container = createMockContainer(2, 5L); + + exception.expect(ResourceHandlerException.class); + exception.expectMessage("Failed to find enough GPUs"); + testSubject.assignGpus(container); + } + + @Test + public void testRequestMoreThanAvailableGpuAndOneContainerIsReleasingGpus() + throws ResourceHandlerException, IOException { + addGpus(new GpuDevice(1, 1), new GpuDevice(1, 2), new GpuDevice(1, 3)); + Container container = createMockContainer(2, 5L); + GpuAllocation allocation = testSubject.assignGpus(container); + assertAllocatedGpus(2, 1, container, allocation); + + assertEquals(2, testSubject.getDeviceAllocationMapping().size()); + assertEquals(2, testSubject.getAssignedGpus().size()); + assertEquals(3, testSubject.getAllowedGpus().size()); + assertEquals(1, testSubject.getAvailableGpus()); + + setupContainerAsReleasingGpus(container); + Container container2 = createMockContainer(2, 6L); + + exception.expect(ResourceHandlerException.class); + exception.expectMessage("as some other containers might not " + + "releasing GPUs"); + GpuAllocation allocation2 = testSubject.assignGpus(container2); + assertAllocatedGpus(2, 1, container, allocation2); + } + + @Test + public void testThreeContainersJustTwoOfThemSatisfied() + throws ResourceHandlerException, IOException { + addGpus(new GpuDevice(1, 1), new GpuDevice(1, 2), + new GpuDevice(1, 3), new GpuDevice(1, 4), + new GpuDevice(1, 5), new GpuDevice(1, 6)); + Container container = createMockContainer(3, 5L); + Container container2 = createMockContainer(2, 6L); + Container container3 = createMockContainer(2, 6L); + + GpuAllocation allocation = testSubject.assignGpus(container); + assertAllocatedGpus(3, 3, container, allocation); + assertEquals(3, testSubject.getDeviceAllocationMapping().size()); + assertEquals(3, testSubject.getAssignedGpus().size()); + assertEquals(6, testSubject.getAllowedGpus().size()); + assertEquals(3, testSubject.getAvailableGpus()); + + GpuAllocation allocation2 = testSubject.assignGpus(container2); + assertAllocatedGpus(2, 4, container2, allocation2); + assertEquals(5, testSubject.getDeviceAllocationMapping().size()); + assertEquals(5, testSubject.getAssignedGpus().size()); + assertEquals(6, testSubject.getAllowedGpus().size()); + assertEquals(1, testSubject.getAvailableGpus()); + + exception.expect(ResourceHandlerException.class); + exception.expectMessage("Failed to find enough GPUs"); + testSubject.assignGpus(container3); + } + + @Test + public void testReleaseAndAssignGpus() + throws ResourceHandlerException, IOException { + addGpus(new GpuDevice(1, 1), new GpuDevice(1, 2), new GpuDevice(1, 3)); + Container container = createMockContainer(2, 5L); + GpuAllocation allocation = testSubject.assignGpus(container); + assertAllocatedGpus(2, 1, container, allocation); + + assertEquals(2, testSubject.getDeviceAllocationMapping().size()); + assertEquals(2, testSubject.getAssignedGpus().size()); + assertEquals(3, testSubject.getAllowedGpus().size()); + assertEquals(1, testSubject.getAvailableGpus()); + + setupContainerAsReleasingGpus(container); + Container container2 = createMockContainer(2, 6L); + try { + testSubject.assignGpus(container2); + } catch (ResourceHandlerException e) { + //intended as we have not enough GPUs available + } + + assertEquals(2, testSubject.getDeviceAllocationMapping().size()); + assertEquals(2, testSubject.getAssignedGpus().size()); + assertEquals(3, testSubject.getAllowedGpus().size()); + assertEquals(1, testSubject.getAvailableGpus()); + + testSubject.unassignGpus(container.getContainerId()); + GpuAllocation allocation2 = testSubject.assignGpus(container2); + assertAllocatedGpus(2, 1, container, allocation2); + } + + @Test + public void testCreateLotsOfContainersVerifyGpuAssignmentsAreCorrect() + throws ResourceHandlerException, IOException { + createAndAddGpus(100); + + List containers = createMockContainers(3, 33); + List allocations = Lists.newArrayList(); + for (Container container : containers) { + GpuAllocation allocation = testSubject.assignGpus(container); + allocations.add(allocation); + assertAllocatedGpus(3, 97, container, allocation); + } + + assertEquals(99, testSubject.getDeviceAllocationMapping().size()); + assertEquals(99, testSubject.getAssignedGpus().size()); + assertEquals(100, testSubject.getAllowedGpus().size()); + assertEquals(1, testSubject.getAvailableGpus()); + + Set duplicateAllocations = findDuplicates(allocations); + assertEquals(0, duplicateAllocations.size()); + } + + @Test + public void testGpuGetsUnassignedWhenStateStoreThrowsException() + throws ResourceHandlerException, IOException { + doThrow(new IOException("Failed to save container mappings " + + "to NM state store!")) + .when(nmStateStore).storeAssignedResources(any(Container.class), + anyString(), anyList()); + + createAndAddGpus(1); + + exception.expect(ResourceHandlerException.class); + exception.expectMessage("Failed to save container mappings " + + "to NM state store"); + Container container = createMockContainer(1, 5L); + testSubject.assignGpus(container); + } +} \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java index 93af10aba5e..efd28eed357 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java @@ -157,7 +157,7 @@ public class TestGpuResourceHandler { gpuResourceHandler.bootstrap(conf); List allowedGpus = - gpuResourceHandler.getGpuAllocator().getAllowedGpusCopy(); + gpuResourceHandler.getGpuAllocator().getAllowedGpus(); assertEquals("Unexpected number of allowed GPU devices!", 1, allowedGpus.size()); assertEquals("Expected GPU device does not equal to found device!", @@ -496,7 +496,7 @@ public class TestGpuResourceHandler { gpuResourceHandler.reacquireContainer(getContainerId(1)); Map deviceAllocationMapping = - gpuResourceHandler.getGpuAllocator().getDeviceAllocationMappingCopy(); + gpuResourceHandler.getGpuAllocator().getDeviceAllocationMapping(); assertEquals("Unexpected number of allocated GPU devices!", 2, deviceAllocationMapping.size()); assertTrue("Expected GPU device is not found in allocations!", @@ -532,7 +532,7 @@ public class TestGpuResourceHandler { // Make sure internal state not changed. deviceAllocationMapping = - gpuResourceHandler.getGpuAllocator().getDeviceAllocationMappingCopy(); + gpuResourceHandler.getGpuAllocator().getDeviceAllocationMapping(); assertEquals("Unexpected number of allocated GPU devices!", 2, deviceAllocationMapping.size()); assertTrue("Expected GPU devices are not found in allocations!", @@ -567,7 +567,7 @@ public class TestGpuResourceHandler { // Make sure internal state not changed. deviceAllocationMapping = - gpuResourceHandler.getGpuAllocator().getDeviceAllocationMappingCopy(); + gpuResourceHandler.getGpuAllocator().getDeviceAllocationMapping(); assertEquals("Unexpected number of allocated GPU devices!", 2, deviceAllocationMapping.size()); assertTrue("Expected GPU devices are not found in allocations!",