From 881084fe5c3118c1f62585aa1b72262d46d74ac6 Mon Sep 17 00:00:00 2001 From: Wangda Tan Date: Thu, 26 Feb 2015 17:05:25 -0800 Subject: [PATCH] YARN-3251. Fixed a deadlock in CapacityScheduler when computing absoluteMaxAvailableCapacity in LeafQueue (Craig Welch via wangda) --- hadoop-yarn-project/CHANGES.txt | 3 +++ .../scheduler/capacity/LeafQueue.java | 24 +++++++++++++------ 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index 3eb6fbc84e5..c603c50f4a1 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -619,6 +619,9 @@ Release 2.6.0 - 2014-11-18 identifiers to be tampered and thus causing app submission failures in secure mode. (Jian He via vinodkv) + YARN-3251. Fixed a deadlock in CapacityScheduler when computing + absoluteMaxAvailableCapacity in LeafQueue (Craig Welch via wangda) + BREAKDOWN OF YARN-1051 SUBTASKS AND RELATED JIRAS YARN-1707. Introduce APIs to add/remove/resize queues in the diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java index ffeec630858..eddf30fff1c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java @@ -115,6 +115,8 @@ public class LeafQueue extends AbstractCSQueue { private final QueueHeadroomInfo queueHeadroomInfo = new QueueHeadroomInfo(); + private volatile float absoluteMaxAvailCapacity; + public LeafQueue(CapacitySchedulerContext cs, String queueName, CSQueue parent, CSQueue old) throws IOException { super(cs, queueName, parent, old); @@ -133,6 +135,10 @@ public class LeafQueue extends AbstractCSQueue { (float)cs.getConfiguration().getMaximumCapacity(getQueuePath()) / 100; float absoluteMaxCapacity = CSQueueUtils.computeAbsoluteMaximumCapacity(maximumCapacity, parent); + + // Initially set to absoluteMax, will be updated to more accurate + // max avail value during assignContainers + absoluteMaxAvailCapacity = absoluteMaxCapacity; int userLimit = cs.getConfiguration().getUserLimit(getQueuePath()); float userLimitFactor = @@ -720,8 +726,18 @@ public class LeafQueue extends AbstractCSQueue { } @Override - public synchronized CSAssignment assignContainers(Resource clusterResource, + public CSAssignment assignContainers(Resource clusterResource, FiCaSchedulerNode node, boolean needToUnreserve) { + //We should not hold a lock on a queue and its parent concurrently - it + //can lead to deadlocks when calls which walk down the tree occur + //concurrently (getQueueInfo...) + absoluteMaxAvailCapacity = CSQueueUtils.getAbsoluteMaxAvailCapacity( + resourceCalculator, clusterResource, this); + return assignContainersInternal(clusterResource, node, needToUnreserve); + } + + private synchronized CSAssignment assignContainersInternal( + Resource clusterResource, FiCaSchedulerNode node, boolean needToUnreserve) { if(LOG.isDebugEnabled()) { LOG.debug("assignContainers: node=" + node.getNodeName() @@ -1012,12 +1028,6 @@ public class LeafQueue extends AbstractCSQueue { computeUserLimit(application, clusterResource, required, queueUser, requestedLabels); - //Max avail capacity needs to take into account usage by ancestor-siblings - //which are greater than their base capacity, so we are interested in "max avail" - //capacity - float absoluteMaxAvailCapacity = CSQueueUtils.getAbsoluteMaxAvailCapacity( - resourceCalculator, clusterResource, this); - Resource queueMaxCap = // Queue Max-Capacity Resources.multiplyAndNormalizeDown( resourceCalculator,