YARN-4519. Potential deadlock of CapacityScheduler between decrease container and assign containers. Contributed by Meng Ding

This commit is contained in:
Jian He 2016-01-27 15:38:32 -08:00
parent bd01fdb64d
commit 36f0e73882
10 changed files with 320 additions and 178 deletions

View File

@ -112,6 +112,9 @@ Release 2.9.0 - UNRELEASED
YARN-4633. Fix random test failure in TestRMRestart#testRMRestartAfterPreemption YARN-4633. Fix random test failure in TestRMRestart#testRMRestartAfterPreemption
(Bibin A Chundatt via rohithsharmaks) (Bibin A Chundatt via rohithsharmaks)
YARN-4519. Potential deadlock of CapacityScheduler between decrease container
and assign containers. (Meng Ding via jianhe)
Release 2.8.0 - UNRELEASED Release 2.8.0 - UNRELEASED
INCOMPATIBLE CHANGES INCOMPATIBLE CHANGES

View File

@ -53,9 +53,10 @@ import org.apache.hadoop.yarn.security.YarnAuthorizationProvider;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppState; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppState;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState;
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerState;
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode; import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler
.SchedContainerChangeRequest;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerUtils; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerUtils;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.YarnScheduler; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.YarnScheduler;
import org.apache.hadoop.yarn.server.utils.BuilderUtils; import org.apache.hadoop.yarn.server.utils.BuilderUtils;
@ -116,41 +117,23 @@ public class RMServerUtils {
} }
/** /**
* Normalize container increase/decrease request, it will normalize and update * Validate increase/decrease request. This function must be called under
* ContainerResourceChangeRequest.targetResource * the queue lock to make sure that the access to container resource is
* atomic. Refer to LeafQueue.decreaseContainer() and
* CapacityScheduelr.updateIncreaseRequests()
*
* *
* <pre> * <pre>
* - Throw exception when any other error happens * - Throw exception when any other error happens
* </pre> * </pre>
*/ */
public static void checkAndNormalizeContainerChangeRequest( public static void checkSchedContainerChangeRequest(
RMContext rmContext, ContainerResourceChangeRequest request, SchedContainerChangeRequest request, boolean increase)
boolean increase) throws InvalidResourceRequestException { throws InvalidResourceRequestException {
RMContext rmContext = request.getRmContext();
ContainerId containerId = request.getContainerId(); ContainerId containerId = request.getContainerId();
ResourceScheduler scheduler = rmContext.getScheduler(); RMContainer rmContainer = request.getRMContainer();
RMContainer rmContainer = scheduler.getRMContainer(containerId); Resource targetResource = request.getTargetCapacity();
ResourceCalculator rc = scheduler.getResourceCalculator();
if (null == rmContainer) {
String msg =
"Failed to get rmContainer for "
+ (increase ? "increase" : "decrease")
+ " request, with container-id=" + containerId;
throw new InvalidResourceRequestException(msg);
}
if (rmContainer.getState() != RMContainerState.RUNNING) {
String msg =
"rmContainer's state is not RUNNING, for "
+ (increase ? "increase" : "decrease")
+ " request, with container-id=" + containerId;
throw new InvalidResourceRequestException(msg);
}
Resource targetResource = Resources.normalize(rc, request.getCapability(),
scheduler.getMinimumResourceCapability(),
scheduler.getMaximumResourceCapability(),
scheduler.getMinimumResourceCapability());
// Compare targetResource and original resource // Compare targetResource and original resource
Resource originalResource = rmContainer.getAllocatedResource(); Resource originalResource = rmContainer.getAllocatedResource();
@ -182,9 +165,9 @@ public class RMServerUtils {
} }
} }
RMNode rmNode = rmContext.getRMNodes().get(rmContainer.getAllocatedNode());
// Target resource of the increase request is more than NM can offer // Target resource of the increase request is more than NM can offer
ResourceScheduler scheduler = rmContext.getScheduler();
RMNode rmNode = request.getSchedulerNode().getRMNode();
if (!Resources.fitsIn(scheduler.getResourceCalculator(), if (!Resources.fitsIn(scheduler.getResourceCalculator(),
scheduler.getClusterResource(), targetResource, scheduler.getClusterResource(), targetResource,
rmNode.getTotalCapability())) { rmNode.getTotalCapability())) {
@ -193,9 +176,6 @@ public class RMServerUtils {
+ rmNode.getTotalCapability(); + rmNode.getTotalCapability();
throw new InvalidResourceRequestException(msg); throw new InvalidResourceRequestException(msg);
} }
// Update normalized target resource
request.setCapability(targetResource);
} }
/* /*
@ -254,6 +234,7 @@ public class RMServerUtils {
} }
} }
// Sanity check and normalize target resource
private static void validateIncreaseDecreaseRequest(RMContext rmContext, private static void validateIncreaseDecreaseRequest(RMContext rmContext,
List<ContainerResourceChangeRequest> requests, Resource maximumAllocation, List<ContainerResourceChangeRequest> requests, Resource maximumAllocation,
boolean increase) boolean increase)
@ -283,8 +264,23 @@ public class RMServerUtils {
+ request.getCapability().getVirtualCores() + ", maxVirtualCores=" + request.getCapability().getVirtualCores() + ", maxVirtualCores="
+ maximumAllocation.getVirtualCores()); + maximumAllocation.getVirtualCores());
} }
ContainerId containerId = request.getContainerId();
checkAndNormalizeContainerChangeRequest(rmContext, request, increase); ResourceScheduler scheduler = rmContext.getScheduler();
RMContainer rmContainer = scheduler.getRMContainer(containerId);
if (null == rmContainer) {
String msg =
"Failed to get rmContainer for "
+ (increase ? "increase" : "decrease")
+ " request, with container-id=" + containerId;
throw new InvalidResourceRequestException(msg);
}
ResourceCalculator rc = scheduler.getResourceCalculator();
Resource targetResource = Resources.normalize(rc, request.getCapability(),
scheduler.getMinimumResourceCapability(),
scheduler.getMaximumResourceCapability(),
scheduler.getMinimumResourceCapability());
// Update normalized target resource
request.setCapability(targetResource);
} }
} }

View File

@ -22,7 +22,6 @@ import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collection; import java.util.Collection;
import java.util.Collections;
import java.util.EnumSet; import java.util.EnumSet;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
@ -55,13 +54,13 @@ import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.api.records.ResourceOption; import org.apache.hadoop.yarn.api.records.ResourceOption;
import org.apache.hadoop.yarn.api.records.ResourceRequest; import org.apache.hadoop.yarn.api.records.ResourceRequest;
import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.InvalidResourceRequestException;
import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.proto.YarnServiceProtos.SchedulerResourceTypes; import org.apache.hadoop.yarn.proto.YarnServiceProtos.SchedulerResourceTypes;
import org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus; import org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus;
import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger; import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger;
import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger.AuditConstants; import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger.AuditConstants;
import org.apache.hadoop.yarn.server.resourcemanager.RMContext; import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
import org.apache.hadoop.yarn.server.resourcemanager.RMServerUtils;
import org.apache.hadoop.yarn.server.resourcemanager.ResourceManager; import org.apache.hadoop.yarn.server.resourcemanager.ResourceManager;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppEvent;
@ -74,6 +73,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEven
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerFinishedEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerFinishedEvent;
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerImpl; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerImpl;
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerRecoverEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerRecoverEvent;
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode; import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeCleanContainerEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeCleanContainerEvent;
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeDecreaseContainerEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeDecreaseContainerEvent;
@ -620,26 +620,18 @@ public abstract class AbstractYarnScheduler
} }
protected void decreaseContainers( protected void decreaseContainers(
List<SchedContainerChangeRequest> decreaseRequests, List<ContainerResourceChangeRequest> decreaseRequests,
SchedulerApplicationAttempt attempt) { SchedulerApplicationAttempt attempt) {
for (SchedContainerChangeRequest request : decreaseRequests) { if (null == decreaseRequests || decreaseRequests.isEmpty()) {
return;
}
// Pre-process decrease requests
List<SchedContainerChangeRequest> schedDecreaseRequests =
createSchedContainerChangeRequests(decreaseRequests, false);
for (SchedContainerChangeRequest request : schedDecreaseRequests) {
if (LOG.isDebugEnabled()) { if (LOG.isDebugEnabled()) {
LOG.debug("Processing decrease request:" + request); LOG.debug("Processing decrease request:" + request);
} }
boolean hasIncreaseRequest =
attempt.removeIncreaseRequest(request.getNodeId(),
request.getPriority(), request.getContainerId());
if (hasIncreaseRequest) {
if (LOG.isDebugEnabled()) {
LOG.debug("While processing decrease request, found a increase request "
+ "for the same container "
+ request.getContainerId()
+ ", removed the increase request");
}
}
// handle decrease request // handle decrease request
decreaseContainer(request, attempt); decreaseContainer(request, attempt);
} }
@ -877,7 +869,7 @@ public abstract class AbstractYarnScheduler
} }
/** /**
* Normalize container increase/decrease request, and return * Sanity check increase/decrease request, and return
* SchedulerContainerResourceChangeRequest according to given * SchedulerContainerResourceChangeRequest according to given
* ContainerResourceChangeRequest. * ContainerResourceChangeRequest.
* *
@ -886,37 +878,34 @@ public abstract class AbstractYarnScheduler
* - Throw exception when any other error happens * - Throw exception when any other error happens
* </pre> * </pre>
*/ */
private SchedContainerChangeRequest private SchedContainerChangeRequest createSchedContainerChangeRequest(
checkAndNormalizeContainerChangeRequest( ContainerResourceChangeRequest request, boolean increase)
ContainerResourceChangeRequest request, boolean increase) throws YarnException {
throws YarnException {
// We have done a check in ApplicationMasterService, but RMContainer status
// / Node resource could change since AMS won't acquire lock of scheduler.
RMServerUtils.checkAndNormalizeContainerChangeRequest(rmContext, request,
increase);
ContainerId containerId = request.getContainerId(); ContainerId containerId = request.getContainerId();
RMContainer rmContainer = getRMContainer(containerId); RMContainer rmContainer = getRMContainer(containerId);
if (null == rmContainer) {
String msg =
"Failed to get rmContainer for "
+ (increase ? "increase" : "decrease")
+ " request, with container-id=" + containerId;
throw new InvalidResourceRequestException(msg);
}
SchedulerNode schedulerNode = SchedulerNode schedulerNode =
getSchedulerNode(rmContainer.getAllocatedNode()); getSchedulerNode(rmContainer.getAllocatedNode());
return new SchedContainerChangeRequest(
return new SchedContainerChangeRequest(schedulerNode, rmContainer, this.rmContext, schedulerNode, rmContainer, request.getCapability());
request.getCapability());
} }
protected List<SchedContainerChangeRequest> protected List<SchedContainerChangeRequest>
checkAndNormalizeContainerChangeRequests( createSchedContainerChangeRequests(
List<ContainerResourceChangeRequest> changeRequests, List<ContainerResourceChangeRequest> changeRequests,
boolean increase) { boolean increase) {
if (null == changeRequests || changeRequests.isEmpty()) {
return Collections.EMPTY_LIST;
}
List<SchedContainerChangeRequest> schedulerChangeRequests = List<SchedContainerChangeRequest> schedulerChangeRequests =
new ArrayList<SchedContainerChangeRequest>(); new ArrayList<SchedContainerChangeRequest>();
for (ContainerResourceChangeRequest r : changeRequests) { for (ContainerResourceChangeRequest r : changeRequests) {
SchedContainerChangeRequest sr = null; SchedContainerChangeRequest sr = null;
try { try {
sr = checkAndNormalizeContainerChangeRequest(r, increase); sr = createSchedContainerChangeRequest(r, increase);
} catch (YarnException e) { } catch (YarnException e) {
LOG.warn("Error happens when checking increase request, Ignoring.." LOG.warn("Error happens when checking increase request, Ignoring.."
+ " exception=", e); + " exception=", e);
@ -924,7 +913,6 @@ public abstract class AbstractYarnScheduler
} }
schedulerChangeRequests.add(sr); schedulerChangeRequests.add(sr);
} }
return schedulerChangeRequests; return schedulerChangeRequests;
} }
} }

View File

@ -44,6 +44,8 @@ import org.apache.hadoop.yarn.api.records.NodeId;
import org.apache.hadoop.yarn.api.records.Priority; import org.apache.hadoop.yarn.api.records.Priority;
import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.api.records.ResourceRequest; import org.apache.hadoop.yarn.api.records.ResourceRequest;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.server.resourcemanager.RMServerUtils;
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerState; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerState;
import org.apache.hadoop.yarn.util.resource.Resources; import org.apache.hadoop.yarn.util.resource.Resources;
@ -148,6 +150,18 @@ public class AppSchedulingInfo {
boolean resourceUpdated = false; boolean resourceUpdated = false;
for (SchedContainerChangeRequest r : increaseRequests) { for (SchedContainerChangeRequest r : increaseRequests) {
if (r.getRMContainer().getState() != RMContainerState.RUNNING) {
LOG.warn("rmContainer's state is not RUNNING, for increase request with"
+ " container-id=" + r.getContainerId());
continue;
}
try {
RMServerUtils.checkSchedContainerChangeRequest(r, true);
} catch (YarnException e) {
LOG.warn("Error happens when checking increase request, Ignoring.."
+ " exception=", e);
continue;
}
NodeId nodeId = r.getRMContainer().getAllocatedNode(); NodeId nodeId = r.getRMContainer().getAllocatedNode();
Map<Priority, Map<ContainerId, SchedContainerChangeRequest>> requestsOnNode = Map<Priority, Map<ContainerId, SchedContainerChangeRequest>> requestsOnNode =
@ -221,7 +235,7 @@ public class AppSchedulingInfo {
if (LOG.isDebugEnabled()) { if (LOG.isDebugEnabled()) {
LOG.debug("Added increase request:" + request.getContainerId() LOG.debug("Added increase request:" + request.getContainerId()
+ " delta=" + request.getDeltaCapacity()); + " delta=" + delta);
} }
// update priorities // update priorities
@ -520,24 +534,20 @@ public class AppSchedulingInfo {
NodeId nodeId = increaseRequest.getNodeId(); NodeId nodeId = increaseRequest.getNodeId();
Priority priority = increaseRequest.getPriority(); Priority priority = increaseRequest.getPriority();
ContainerId containerId = increaseRequest.getContainerId(); ContainerId containerId = increaseRequest.getContainerId();
Resource deltaCapacity = increaseRequest.getDeltaCapacity();
if (LOG.isDebugEnabled()) { if (LOG.isDebugEnabled()) {
LOG.debug("allocated increase request : applicationId=" + applicationId LOG.debug("allocated increase request : applicationId=" + applicationId
+ " container=" + containerId + " host=" + " container=" + containerId + " host="
+ increaseRequest.getNodeId() + " user=" + user + " resource=" + increaseRequest.getNodeId() + " user=" + user + " resource="
+ increaseRequest.getDeltaCapacity()); + deltaCapacity);
} }
// Set queue metrics // Set queue metrics
queue.getMetrics().allocateResources(user, queue.getMetrics().allocateResources(user, deltaCapacity);
increaseRequest.getDeltaCapacity());
// remove the increase request from pending increase request map // remove the increase request from pending increase request map
removeIncreaseRequest(nodeId, priority, containerId); removeIncreaseRequest(nodeId, priority, containerId);
// update usage // update usage
appResourceUsage.incUsed(increaseRequest.getNodePartition(), appResourceUsage.incUsed(increaseRequest.getNodePartition(), deltaCapacity);
increaseRequest.getDeltaCapacity());
} }
public synchronized void decreaseContainer( public synchronized void decreaseContainer(

View File

@ -22,6 +22,7 @@ import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.NodeId; import org.apache.hadoop.yarn.api.records.NodeId;
import org.apache.hadoop.yarn.api.records.Priority; import org.apache.hadoop.yarn.api.records.Priority;
import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
import org.apache.hadoop.yarn.util.resource.Resources; import org.apache.hadoop.yarn.util.resource.Resources;
@ -32,18 +33,19 @@ import org.apache.hadoop.yarn.util.resource.Resources;
*/ */
public class SchedContainerChangeRequest implements public class SchedContainerChangeRequest implements
Comparable<SchedContainerChangeRequest> { Comparable<SchedContainerChangeRequest> {
RMContainer rmContainer; private RMContext rmContext;
Resource targetCapacity; private RMContainer rmContainer;
SchedulerNode schedulerNode; private Resource targetCapacity;
Resource deltaCapacity; private SchedulerNode schedulerNode;
private Resource deltaCapacity;
public SchedContainerChangeRequest(SchedulerNode schedulerNode, public SchedContainerChangeRequest(
RMContext rmContext, SchedulerNode schedulerNode,
RMContainer rmContainer, Resource targetCapacity) { RMContainer rmContainer, Resource targetCapacity) {
this.rmContext = rmContext;
this.rmContainer = rmContainer; this.rmContainer = rmContainer;
this.targetCapacity = targetCapacity; this.targetCapacity = targetCapacity;
this.schedulerNode = schedulerNode; this.schedulerNode = schedulerNode;
deltaCapacity = Resources.subtract(targetCapacity,
rmContainer.getAllocatedResource());
} }
public NodeId getNodeId() { public NodeId getNodeId() {
@ -58,11 +60,19 @@ public class SchedContainerChangeRequest implements
return this.targetCapacity; return this.targetCapacity;
} }
public RMContext getRmContext() {
return this.rmContext;
}
/** /**
* Delta capacity = before - target, so if it is a decrease request, delta * Delta capacity = target - before, so if it is a decrease request, delta
* capacity will be negative * capacity will be negative
*/ */
public Resource getDeltaCapacity() { public synchronized Resource getDeltaCapacity() {
// Only calculate deltaCapacity once
if (deltaCapacity == null) {
deltaCapacity = Resources.subtract(
targetCapacity, rmContainer.getAllocatedResource());
}
return deltaCapacity; return deltaCapacity;
} }
@ -112,7 +122,6 @@ public class SchedContainerChangeRequest implements
@Override @Override
public String toString() { public String toString() {
return "<container=" + getContainerId() + ", targetCapacity=" return "<container=" + getContainerId() + ", targetCapacity="
+ targetCapacity + ", delta=" + deltaCapacity + ", node=" + targetCapacity + ", node=" + getNodeId().toString() + ">";
+ getNodeId().toString() + ">";
} }
} }

View File

@ -33,6 +33,7 @@ import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.api.records.QueueACL; import org.apache.hadoop.yarn.api.records.QueueACL;
import org.apache.hadoop.yarn.api.records.QueueState; import org.apache.hadoop.yarn.api.records.QueueState;
import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.exceptions.InvalidResourceRequestException;
import org.apache.hadoop.yarn.security.PrivilegedEntity; import org.apache.hadoop.yarn.security.PrivilegedEntity;
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEventType; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEventType;
@ -332,7 +333,7 @@ extends org.apache.hadoop.yarn.server.resourcemanager.scheduler.Queue {
*/ */
public void decreaseContainer(Resource clusterResource, public void decreaseContainer(Resource clusterResource,
SchedContainerChangeRequest decreaseRequest, SchedContainerChangeRequest decreaseRequest,
FiCaSchedulerApp app); FiCaSchedulerApp app) throws InvalidResourceRequestException;
/** /**
* Get valid Node Labels for this queue * Get valid Node Labels for this queue

View File

@ -21,8 +21,8 @@ package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection; import java.util.Collection;
import java.util.Collections;
import java.util.Comparator; import java.util.Comparator;
import java.util.EnumSet; import java.util.EnumSet;
import java.util.HashMap; import java.util.HashMap;
@ -65,6 +65,7 @@ import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.api.records.ResourceOption; import org.apache.hadoop.yarn.api.records.ResourceOption;
import org.apache.hadoop.yarn.api.records.ResourceRequest; import org.apache.hadoop.yarn.api.records.ResourceRequest;
import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.InvalidResourceRequestException;
import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException; import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
import org.apache.hadoop.yarn.proto.YarnServiceProtos.SchedulerResourceTypes; import org.apache.hadoop.yarn.proto.YarnServiceProtos.SchedulerResourceTypes;
@ -895,9 +896,36 @@ public class CapacityScheduler extends
} }
} }
// It is crucial to acquire leaf queue lock first to prevent:
// 1. Race condition when calculating the delta resource in
// SchedContainerChangeRequest
// 2. Deadlock with the scheduling thread.
private LeafQueue updateIncreaseRequests(
List<ContainerResourceChangeRequest> increaseRequests,
FiCaSchedulerApp app) {
if (null == increaseRequests || increaseRequests.isEmpty()) {
return null;
}
// Pre-process increase requests
List<SchedContainerChangeRequest> schedIncreaseRequests =
createSchedContainerChangeRequests(increaseRequests, true);
LeafQueue leafQueue = (LeafQueue) app.getQueue();
synchronized(leafQueue) {
// make sure we aren't stopping/removing the application
// when the allocate comes in
if (app.isStopped()) {
return null;
}
// Process increase resource requests
if (app.updateIncreaseRequests(schedIncreaseRequests)) {
return leafQueue;
}
return null;
}
}
@Override @Override
// Note: when AM asks to decrease container or release container, we will // Note: when AM asks to release container, we will acquire scheduler lock
// acquire scheduler lock
@Lock(Lock.NoLock.class) @Lock(Lock.NoLock.class)
public Allocation allocate(ApplicationAttemptId applicationAttemptId, public Allocation allocate(ApplicationAttemptId applicationAttemptId,
List<ResourceRequest> ask, List<ContainerId> release, List<ResourceRequest> ask, List<ContainerId> release,
@ -910,26 +938,23 @@ public class CapacityScheduler extends
return EMPTY_ALLOCATION; return EMPTY_ALLOCATION;
} }
// Sanity check // Release containers
releaseContainers(release, application);
// update increase requests
LeafQueue updateDemandForQueue =
updateIncreaseRequests(increaseRequests, application);
// Decrease containers
decreaseContainers(decreaseRequests, application);
// Sanity check for new allocation requests
SchedulerUtils.normalizeRequests( SchedulerUtils.normalizeRequests(
ask, getResourceCalculator(), getClusterResource(), ask, getResourceCalculator(), getClusterResource(),
getMinimumResourceCapability(), getMaximumResourceCapability()); getMinimumResourceCapability(), getMaximumResourceCapability());
// Pre-process increase requests
List<SchedContainerChangeRequest> normalizedIncreaseRequests =
checkAndNormalizeContainerChangeRequests(increaseRequests, true);
// Pre-process decrease requests
List<SchedContainerChangeRequest> normalizedDecreaseRequests =
checkAndNormalizeContainerChangeRequests(decreaseRequests, false);
// Release containers
releaseContainers(release, application);
Allocation allocation; Allocation allocation;
LeafQueue updateDemandForQueue = null;
synchronized (application) { synchronized (application) {
// make sure we aren't stopping/removing the application // make sure we aren't stopping/removing the application
@ -947,7 +972,8 @@ public class CapacityScheduler extends
} }
// Update application requests // Update application requests
if (application.updateResourceRequests(ask)) { if (application.updateResourceRequests(ask)
&& (updateDemandForQueue == null)) {
updateDemandForQueue = (LeafQueue) application.getQueue(); updateDemandForQueue = (LeafQueue) application.getQueue();
} }
@ -957,12 +983,6 @@ public class CapacityScheduler extends
} }
} }
// Process increase resource requests
if (application.updateIncreaseRequests(normalizedIncreaseRequests)
&& (updateDemandForQueue == null)) {
updateDemandForQueue = (LeafQueue) application.getQueue();
}
if (application.isWaitingForAMContainer()) { if (application.isWaitingForAMContainer()) {
// Allocate is for AM and update AM blacklist for this // Allocate is for AM and update AM blacklist for this
application.updateAMBlacklist( application.updateAMBlacklist(
@ -971,8 +991,6 @@ public class CapacityScheduler extends
application.updateBlacklist(blacklistAdditions, blacklistRemovals); application.updateBlacklist(blacklistAdditions, blacklistRemovals);
} }
// Decrease containers
decreaseContainers(normalizedDecreaseRequests, application);
allocation = application.getAllocation(getResourceCalculator(), allocation = application.getAllocation(getResourceCalculator(),
clusterResource, getMinimumResourceCapability()); clusterResource, getMinimumResourceCapability());
@ -1167,7 +1185,8 @@ public class CapacityScheduler extends
.getAssignmentInformation().getReserved()); .getAssignmentInformation().getReserved());
} }
private synchronized void allocateContainersToNode(FiCaSchedulerNode node) { @VisibleForTesting
protected synchronized void allocateContainersToNode(FiCaSchedulerNode node) {
if (rmContext.isWorkPreservingRecoveryEnabled() if (rmContext.isWorkPreservingRecoveryEnabled()
&& !rmContext.isSchedulerReadyForAllocatingContainers()) { && !rmContext.isSchedulerReadyForAllocatingContainers()) {
return; return;
@ -1517,48 +1536,30 @@ public class CapacityScheduler extends
} }
} }
@Lock(CapacityScheduler.class)
@Override @Override
protected synchronized void decreaseContainer( protected void decreaseContainer(SchedContainerChangeRequest decreaseRequest,
SchedContainerChangeRequest decreaseRequest,
SchedulerApplicationAttempt attempt) { SchedulerApplicationAttempt attempt) {
RMContainer rmContainer = decreaseRequest.getRMContainer(); RMContainer rmContainer = decreaseRequest.getRMContainer();
// Check container status before doing decrease // Check container status before doing decrease
if (rmContainer.getState() != RMContainerState.RUNNING) { if (rmContainer.getState() != RMContainerState.RUNNING) {
LOG.info("Trying to decrease a container not in RUNNING state, container=" LOG.info("Trying to decrease a container not in RUNNING state, container="
+ rmContainer + " state=" + rmContainer.getState().name()); + rmContainer + " state=" + rmContainer.getState().name());
return; return;
} }
// Delta capacity of this decrease request is 0, this decrease request may
// just to cancel increase request
if (Resources.equals(decreaseRequest.getDeltaCapacity(), Resources.none())) {
if (LOG.isDebugEnabled()) {
LOG.debug("Decrease target resource equals to existing resource for container:"
+ decreaseRequest.getContainerId()
+ " ignore this decrease request.");
}
return;
}
// Save resource before decrease
Resource resourceBeforeDecrease =
Resources.clone(rmContainer.getContainer().getResource());
FiCaSchedulerApp app = (FiCaSchedulerApp)attempt; FiCaSchedulerApp app = (FiCaSchedulerApp)attempt;
LeafQueue queue = (LeafQueue) attempt.getQueue(); LeafQueue queue = (LeafQueue) attempt.getQueue();
queue.decreaseContainer(clusterResource, decreaseRequest, app); try {
queue.decreaseContainer(clusterResource, decreaseRequest, app);
// Notify RMNode the container will be decreased // Notify RMNode that the container can be pulled by NodeManager in the
this.rmContext.getDispatcher().getEventHandler() // next heartbeat
.handle(new RMNodeDecreaseContainerEvent(decreaseRequest.getNodeId(), this.rmContext.getDispatcher().getEventHandler()
Arrays.asList(rmContainer.getContainer()))); .handle(new RMNodeDecreaseContainerEvent(
decreaseRequest.getNodeId(),
LOG.info("Application attempt " + app.getApplicationAttemptId() Collections.singletonList(rmContainer.getContainer())));
+ " decreased container:" + decreaseRequest.getContainerId() + " from " } catch (InvalidResourceRequestException e) {
+ resourceBeforeDecrease + " to " LOG.warn("Error happens when checking decrease request, Ignoring.."
+ decreaseRequest.getTargetCapacity()); + " exception=", e);
}
} }
@Lock(Lock.NoLock.class) @Lock(Lock.NoLock.class)

View File

@ -47,10 +47,12 @@ import org.apache.hadoop.yarn.api.records.QueueInfo;
import org.apache.hadoop.yarn.api.records.QueueState; import org.apache.hadoop.yarn.api.records.QueueState;
import org.apache.hadoop.yarn.api.records.QueueUserACLInfo; import org.apache.hadoop.yarn.api.records.QueueUserACLInfo;
import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.exceptions.InvalidResourceRequestException;
import org.apache.hadoop.yarn.factories.RecordFactory; import org.apache.hadoop.yarn.factories.RecordFactory;
import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider; import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
import org.apache.hadoop.yarn.nodelabels.CommonNodeLabelsManager; import org.apache.hadoop.yarn.nodelabels.CommonNodeLabelsManager;
import org.apache.hadoop.yarn.security.AccessType; import org.apache.hadoop.yarn.security.AccessType;
import org.apache.hadoop.yarn.server.resourcemanager.RMServerUtils;
import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager; import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager;
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEventType; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEventType;
@ -1677,10 +1679,16 @@ public class LeafQueue extends AbstractCSQueue {
return defaultAppPriorityPerQueue; return defaultAppPriorityPerQueue;
} }
/**
*
* @param clusterResource Total cluster resource
* @param decreaseRequest The decrease request
* @param app The application of interest
*/
@Override @Override
public void decreaseContainer(Resource clusterResource, public void decreaseContainer(Resource clusterResource,
SchedContainerChangeRequest decreaseRequest, SchedContainerChangeRequest decreaseRequest,
FiCaSchedulerApp app) { FiCaSchedulerApp app) throws InvalidResourceRequestException {
// If the container being decreased is reserved, we need to unreserve it // If the container being decreased is reserved, we need to unreserve it
// first. // first.
RMContainer rmContainer = decreaseRequest.getRMContainer(); RMContainer rmContainer = decreaseRequest.getRMContainer();
@ -1688,25 +1696,62 @@ public class LeafQueue extends AbstractCSQueue {
unreserveIncreasedContainer(clusterResource, app, unreserveIncreasedContainer(clusterResource, app,
(FiCaSchedulerNode)decreaseRequest.getSchedulerNode(), rmContainer); (FiCaSchedulerNode)decreaseRequest.getSchedulerNode(), rmContainer);
} }
boolean resourceDecreased = false;
// Delta capacity is negative when it's a decrease request Resource resourceBeforeDecrease;
Resource absDelta = Resources.negate(decreaseRequest.getDeltaCapacity()); // Grab queue lock to avoid race condition when getting container resource
synchronized (this) { synchronized (this) {
// Delta is negative when it's a decrease request // Make sure the decrease request is valid in terms of current resource
releaseResource(clusterResource, app, absDelta, // and target resource. This must be done under the leaf queue lock.
decreaseRequest.getNodePartition(), decreaseRequest.getRMContainer(), // Throws exception if the check fails.
true); RMServerUtils.checkSchedContainerChangeRequest(decreaseRequest, false);
// Notify application // Save resource before decrease for debug log
app.decreaseContainer(decreaseRequest); resourceBeforeDecrease =
// Notify node Resources.clone(rmContainer.getAllocatedResource());
decreaseRequest.getSchedulerNode() // Do we have increase request for the same container? If so, remove it
.decreaseContainer(decreaseRequest.getContainerId(), absDelta); boolean hasIncreaseRequest =
app.removeIncreaseRequest(decreaseRequest.getNodeId(),
decreaseRequest.getPriority(), decreaseRequest.getContainerId());
if (hasIncreaseRequest) {
if (LOG.isDebugEnabled()) {
LOG.debug("While processing decrease requests, found an increase"
+ " request for the same container "
+ decreaseRequest.getContainerId()
+ ", removed the increase request");
}
}
// Delta capacity is negative when it's a decrease request
Resource absDelta = Resources.negate(decreaseRequest.getDeltaCapacity());
if (Resources.equals(absDelta, Resources.none())) {
// If delta capacity of this decrease request is 0, this decrease
// request serves the purpose of cancelling an existing increase request
// if any
if (LOG.isDebugEnabled()) {
LOG.debug("Decrease target resource equals to existing resource for"
+ " container:" + decreaseRequest.getContainerId()
+ " ignore this decrease request.");
}
} else {
// Release the delta resource
releaseResource(clusterResource, app, absDelta,
decreaseRequest.getNodePartition(),
decreaseRequest.getRMContainer(),
true);
// Notify application
app.decreaseContainer(decreaseRequest);
// Notify node
decreaseRequest.getSchedulerNode()
.decreaseContainer(decreaseRequest.getContainerId(), absDelta);
resourceDecreased = true;
}
} }
// Notify parent if (resourceDecreased) {
if (getParent() != null) { // Notify parent queue outside of leaf queue lock
getParent().decreaseContainer(clusterResource, decreaseRequest, app); getParent().decreaseContainer(clusterResource, decreaseRequest, app);
LOG.info("Application attempt " + app.getApplicationAttemptId()
+ " decreased container:" + decreaseRequest.getContainerId()
+ " from " + resourceBeforeDecrease + " to "
+ decreaseRequest.getTargetCapacity());
} }
} }

View File

@ -46,6 +46,7 @@ import org.apache.hadoop.yarn.api.records.QueueInfo;
import org.apache.hadoop.yarn.api.records.QueueState; import org.apache.hadoop.yarn.api.records.QueueState;
import org.apache.hadoop.yarn.api.records.QueueUserACLInfo; import org.apache.hadoop.yarn.api.records.QueueUserACLInfo;
import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.exceptions.InvalidResourceRequestException;
import org.apache.hadoop.yarn.factories.RecordFactory; import org.apache.hadoop.yarn.factories.RecordFactory;
import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider; import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
import org.apache.hadoop.yarn.security.AccessType; import org.apache.hadoop.yarn.security.AccessType;
@ -656,7 +657,8 @@ public class ParentQueue extends AbstractCSQueue {
@Override @Override
public void decreaseContainer(Resource clusterResource, public void decreaseContainer(Resource clusterResource,
SchedContainerChangeRequest decreaseRequest, FiCaSchedulerApp app) { SchedContainerChangeRequest decreaseRequest, FiCaSchedulerApp app)
throws InvalidResourceRequestException {
// delta capacity is negative when it's a decrease request // delta capacity is negative when it's a decrease request
Resource absDeltaCapacity = Resource absDeltaCapacity =
Resources.negate(decreaseRequest.getDeltaCapacity()); Resources.negate(decreaseRequest.getDeltaCapacity());

View File

@ -21,8 +21,11 @@ package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collection; import java.util.Collection;
import java.util.Collections;
import java.util.List; import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse; import org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.ApplicationId;
@ -47,8 +50,12 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerStat
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode; import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeImpl; import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeImpl;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler
.SchedulerApplicationAttempt;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica
.FiCaSchedulerNode;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAttemptRemovedSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAttemptRemovedSchedulerEvent;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeUpdateSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeUpdateSchedulerEvent;
import org.apache.hadoop.yarn.util.resource.Resources; import org.apache.hadoop.yarn.util.resource.Resources;
@ -57,12 +64,48 @@ import org.junit.Before;
import org.junit.Test; import org.junit.Test;
public class TestContainerResizing { public class TestContainerResizing {
private static final Log LOG = LogFactory.getLog(TestContainerResizing.class);
private final int GB = 1024; private final int GB = 1024;
private YarnConfiguration conf; private YarnConfiguration conf;
RMNodeLabelsManager mgr; RMNodeLabelsManager mgr;
class MyScheduler extends CapacityScheduler {
/*
* A Mock Scheduler to simulate the potential effect of deadlock between:
* 1. The AbstractYarnScheduler.decreaseContainers() call (from
* ApplicationMasterService thread)
* 2. The CapacityScheduler.allocateContainersToNode() call (from the
* scheduler thread)
*/
MyScheduler() {
super();
}
@Override
protected void decreaseContainers(
List<ContainerResourceChangeRequest> decreaseRequests,
SchedulerApplicationAttempt attempt) {
try {
Thread.sleep(1000);
} catch(InterruptedException e) {
LOG.debug("Thread interrupted.");
}
super.decreaseContainers(decreaseRequests, attempt);
}
@Override
public synchronized void allocateContainersToNode(FiCaSchedulerNode node) {
try {
Thread.sleep(1000);
} catch(InterruptedException e) {
LOG.debug("Thread interrupted.");
}
super.allocateContainersToNode(node);
}
}
@Before @Before
public void setUp() throws Exception { public void setUp() throws Exception {
conf = new YarnConfiguration(); conf = new YarnConfiguration();
@ -958,6 +1001,50 @@ public class TestContainerResizing {
rm1.close(); rm1.close();
} }
@Test (timeout = 60000)
public void testDecreaseContainerWillNotDeadlockContainerAllocation()
throws Exception {
// create and start MockRM with our MyScheduler
MockRM rm = new MockRM() {
@Override
public ResourceScheduler createScheduler() {
CapacityScheduler cs = new MyScheduler();
cs.setConf(conf);
return cs;
}
};
rm.start();
// register a node
MockNM nm = rm.registerNode("h1:1234", 20 * GB);
// submit an application -> app1
RMApp app1 = rm.submitApp(3 * GB, "app", "user", null, "default");
MockAM am1 = MockRM.launchAndRegisterAM(app1, rm, nm);
// making sure resource is allocated
checkUsedResource(rm, "default", 3 * GB, null);
FiCaSchedulerApp app = getFiCaSchedulerApp(rm, app1.getApplicationId());
Assert.assertEquals(3 * GB,
app.getAppAttemptResourceUsage().getUsed().getMemory());
// making sure container is launched
ContainerId containerId1 =
ContainerId.newContainerId(am1.getApplicationAttemptId(), 1);
sentRMContainerLaunched(rm, containerId1);
// submit allocation request for a new container
am1.allocate(Collections.singletonList(ResourceRequest.newInstance(
Priority.newInstance(1), "*", Resources.createResource(2 * GB), 1)),
null);
// nm reports status update and triggers container allocation
nm.nodeHeartbeat(true);
// *In the mean time*, am1 asks to decrease its AM container resource from
// 3GB to 1GB
AllocateResponse response = am1.sendContainerResizingRequest(null,
Collections.singletonList(ContainerResourceChangeRequest
.newInstance(containerId1, Resources.createResource(GB))));
// verify that the containe resource is decreased
verifyContainerDecreased(response, containerId1, GB);
rm.close();
}
private void checkPendingResource(MockRM rm, String queueName, int memory, private void checkPendingResource(MockRM rm, String queueName, int memory,
String label) { String label) {
CapacityScheduler cs = (CapacityScheduler) rm.getResourceScheduler(); CapacityScheduler cs = (CapacityScheduler) rm.getResourceScheduler();