From ada8f63d0b3739d245300461387b0516dc92ccf9 Mon Sep 17 00:00:00 2001
From: Wangda Tan
Date: Tue, 26 Jun 2018 19:25:57 -0700
Subject: [PATCH] YARN-8423. GPU does not get released even though the application gets killed. (Sunil G via wangda)

Change-Id: I570db7d60f8c6c21762dd618a9207d1107c486a0
---
 .../containermanager/container/Container.java |  6 ++
 .../container/ContainerImpl.java              | 11 +++
 .../resources/gpu/GpuResourceAllocator.java   | 78 ++++++++++++++++++-
 .../resources/gpu/GpuResourceHandlerImpl.java |  1 -
 .../nodemanager/webapp/MockContainer.java     |  3 +
 5 files changed, 95 insertions(+), 4 deletions(-)

diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/Container.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/Container.java
index 5d48d8486b6..4912d02758d 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/Container.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/Container.java
@@ -113,4 +113,10 @@ public interface Container extends EventHandler<ContainerEvent> {
   ResourceMappings getResourceMappings();
 
   void sendPauseEvent(String description);
+
+  /**
+   * Verify container is in final states.
+   * @return true/false based on container's state
+   */
+  boolean isContainerInFinalStates();
 }
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java
index 0541544ab50..f76e682339d 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java
@@ -2223,4 +2223,15 @@ public class ContainerImpl implements Container {
   SlidingWindowRetryPolicy getRetryPolicy() {
     return retryPolicy;
   }
+
+  @Override
+  public boolean isContainerInFinalStates() {
+    ContainerState state = getContainerState();
+    return state == ContainerState.KILLING || state == ContainerState.DONE
+        || state == ContainerState.LOCALIZATION_FAILED
+        || state == ContainerState.CONTAINER_RESOURCES_CLEANINGUP
+        || state == ContainerState.CONTAINER_CLEANEDUP_AFTER_KILL
+        || state == ContainerState.EXITED_WITH_FAILURE
+        || state == ContainerState.EXITED_WITH_SUCCESS;
+  }
 }
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceAllocator.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceAllocator.java
index 5bdffc369b2..81a965522ce 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceAllocator.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceAllocator.java
@@ -36,10 +36,8 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin
 import java.io.IOException;
 import java.io.Serializable;
 import java.util.ArrayList;
-import java.util.Collection;
 import java.util.Collections;
 import java.util.HashMap;
-import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
@@ -54,6 +52,7 @@ import static org.apache.hadoop.yarn.api.records.ResourceInformation.GPU_URI;
  */
 public class GpuResourceAllocator {
   final static Log LOG = LogFactory.getLog(GpuResourceAllocator.class);
+  private static final int WAIT_MS_PER_LOOP = 1000;
 
   private Set<GpuDevice> allowedGpuDevices = new TreeSet<>();
   private Map<GpuDevice, ContainerId> usedDevices = new TreeMap<>();
@@ -168,13 +167,67 @@ public class GpuResourceAllocator {
    * @return allocation results.
    * @throws ResourceHandlerException When failed to assign GPUs.
    */
-  public synchronized GpuAllocation assignGpus(Container container)
+  public GpuAllocation assignGpus(Container container)
+      throws ResourceHandlerException {
+    GpuAllocation allocation = internalAssignGpus(container);
+
+    // Wait for a maximum of 120 seconds if no available GPU are there which
+    // are yet to be released.
+    final int timeoutMsecs = 120 * WAIT_MS_PER_LOOP;
+    int timeWaiting = 0;
+    while (allocation == null) {
+      if (timeWaiting >= timeoutMsecs) {
+        break;
+      }
+
+      // Sleep for 1 sec to ensure there are some free GPU devices which are
+      // getting released.
+      try {
+        LOG.info("Container : " + container.getContainerId()
+            + " is waiting for free GPU devices.");
+        Thread.sleep(WAIT_MS_PER_LOOP);
+        timeWaiting += WAIT_MS_PER_LOOP;
+        allocation = internalAssignGpus(container);
+      } catch (InterruptedException e) {
+        // Stop waiting, but restore the interrupt status so that callers can
+        // still observe that this thread was interrupted.
+        Thread.currentThread().interrupt();
+        break;
+      }
+    }
+
+    if (allocation == null) {
+      String message = "Could not get valid GPU device for container '" +
+          container.getContainerId()
+          + "' as some other containers might not releasing GPUs.";
+      LOG.warn(message);
+      throw new ResourceHandlerException(message);
+    }
+    return allocation;
+  }
+
+  /**
+   * Try to assign GPUs to the container once, without waiting.
+   * @param container container that requests GPUs
+   * @return the allocation, or null when the request cannot be satisfied now
+   *         but could be once GPUs held by finishing containers are released
+   * @throws ResourceHandlerException When failed to assign GPUs.
+   */
+  private synchronized GpuAllocation internalAssignGpus(Container container)
       throws ResourceHandlerException {
     Resource requestedResource = container.getResource();
     ContainerId containerId = container.getContainerId();
     int numRequestedGpuDevices = getRequestedGpus(requestedResource);
     // Assign Gpus to container if requested some.
     if (numRequestedGpuDevices > 0) {
+      if (numRequestedGpuDevices > getAvailableGpus()) {
+        // If there are some devices which are getting released, wait for few
+        // seconds to get it.
+        if (numRequestedGpuDevices <= getReleasingGpus() + getAvailableGpus()) {
+          return null;
+        }
+      }
+
       if (numRequestedGpuDevices > getAvailableGpus()) {
         throw new ResourceHandlerException(
             getResourceHandlerExceptionMessage(numRequestedGpuDevices,
@@ -211,6 +264,25 @@ public class GpuResourceAllocator {
     return new GpuAllocation(null, allowedGpuDevices);
   }
 
+  /**
+   * Count the GPU devices still held by containers that are in a final state
+   * and are therefore expected to be released soon.
+   * @return number of devices that are about to be released
+   */
+  private synchronized long getReleasingGpus() {
+    long releasingGpus = 0;
+    // usedDevices has one entry per assigned device, so count every device
+    // once instead of adding the owning container's whole GPU request again
+    // for each device it holds.
+    for (ContainerId containerId : usedDevices.values()) {
+      Container container = nmContext.getContainers().get(containerId);
+      if (container != null && container.isContainerInFinalStates()) {
+        releasingGpus++;
+      }
+    }
+    return releasingGpus;
+  }
+
   /**
    * Clean up all Gpus assigned to containerId
    * @param containerId containerId
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java
index 587fcb4983a..118438296b1 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java
@@ -18,7 +18,6 @@
 
 package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu;
 
-import com.google.common.annotations.VisibleForTesting;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java
index 325709b07ae..67dfef259df 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java
@@ -255,4 +255,7 @@ public class MockContainer implements Container {
   public void sendPauseEvent(String description) {
 
   }
+  @Override public boolean isContainerInFinalStates() {
+    return false;
+  }
 }