From 772ead791c17b0b7415cce0934366113cdbe9379 Mon Sep 17 00:00:00 2001 From: Vinod Kumar Vavilapalli Date: Fri, 21 Feb 2014 02:20:20 +0000 Subject: [PATCH] YARN-1398. Fixed a deadlock in ResourceManager between users requesting queue-acls and completing containers. Contributed by Vinod Kumar Vavilapalli. git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1570415 13f79535-47bb-0310-9956-ffa450edef68 --- hadoop-yarn-project/CHANGES.txt | 3 +++ .../scheduler/capacity/LeafQueue.java | 14 ++++++++------ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index d60ceb467b7..dc619714287 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -321,6 +321,9 @@ Release 2.4.0 - UNRELEASED YARN-713. Fixed ResourceManager to not crash while building tokens when DNS issues happen transmittently. (Jian He via vinodkv) + YARN-1398. Fixed a deadlock in ResourceManager between users requesting + queue-acls and completing containers. (vinodkv) + Release 2.3.1 - UNRELEASED INCOMPATIBLE CHANGES diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java index 968d373e257..5958eec6473 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java @@ -50,7 +50,6 @@ import org.apache.hadoop.yarn.api.records.QueueUserACLInfo; import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.api.records.ResourceRequest; -import org.apache.hadoop.yarn.api.records.Token; import org.apache.hadoop.yarn.factories.RecordFactory; import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer; @@ -1410,12 +1409,14 @@ public void completedContainer(Resource clusterResource, FiCaSchedulerApp application, FiCaSchedulerNode node, RMContainer rmContainer, ContainerStatus containerStatus, RMContainerEventType event, CSQueue childQueue) { if (application != null) { + + boolean removed = false; + // Careful! Locking order is important! synchronized (this) { Container container = rmContainer.getContainer(); - boolean removed = false; // Inform the application & the node // Note: It's safe to assume that all state changes to RMContainer // happen under scheduler's lock... @@ -1441,13 +1442,14 @@ public void completedContainer(Resource clusterResource, " absoluteUsedCapacity=" + getAbsoluteUsedCapacity() + " used=" + usedResources + " cluster=" + clusterResource); - // Inform the parent queue - getParent().completedContainer(clusterResource, application, - node, rmContainer, null, event, this); } } - + if (removed) { + // Inform the parent queue _outside_ of the leaf-queue lock + getParent().completedContainer(clusterResource, application, node, + rmContainer, null, event, this); + } } }