YARN-4519. Potential deadlock of CapacityScheduler between decrease container and assign containers. Contributed Meng Ding
This commit is contained in:
parent
47746f65b0
commit
145add1aec
|
@ -53,9 +53,10 @@ import org.apache.hadoop.yarn.security.YarnAuthorizationProvider;
|
|||
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppState;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerState;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler
|
||||
.SchedContainerChangeRequest;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerUtils;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.YarnScheduler;
|
||||
import org.apache.hadoop.yarn.server.utils.BuilderUtils;
|
||||
|
@ -116,41 +117,23 @@ public class RMServerUtils {
|
|||
}
|
||||
|
||||
/**
|
||||
* Normalize container increase/decrease request, it will normalize and update
|
||||
* ContainerResourceChangeRequest.targetResource
|
||||
* Validate increase/decrease request. This function must be called under
|
||||
* the queue lock to make sure that the access to container resource is
|
||||
* atomic. Refer to LeafQueue.decreaseContainer() and
|
||||
* CapacityScheduelr.updateIncreaseRequests()
|
||||
*
|
||||
*
|
||||
* <pre>
|
||||
* - Throw exception when any other error happens
|
||||
* </pre>
|
||||
*/
|
||||
public static void checkAndNormalizeContainerChangeRequest(
|
||||
RMContext rmContext, ContainerResourceChangeRequest request,
|
||||
boolean increase) throws InvalidResourceRequestException {
|
||||
public static void checkSchedContainerChangeRequest(
|
||||
SchedContainerChangeRequest request, boolean increase)
|
||||
throws InvalidResourceRequestException {
|
||||
RMContext rmContext = request.getRmContext();
|
||||
ContainerId containerId = request.getContainerId();
|
||||
ResourceScheduler scheduler = rmContext.getScheduler();
|
||||
RMContainer rmContainer = scheduler.getRMContainer(containerId);
|
||||
ResourceCalculator rc = scheduler.getResourceCalculator();
|
||||
|
||||
if (null == rmContainer) {
|
||||
String msg =
|
||||
"Failed to get rmContainer for "
|
||||
+ (increase ? "increase" : "decrease")
|
||||
+ " request, with container-id=" + containerId;
|
||||
throw new InvalidResourceRequestException(msg);
|
||||
}
|
||||
|
||||
if (rmContainer.getState() != RMContainerState.RUNNING) {
|
||||
String msg =
|
||||
"rmContainer's state is not RUNNING, for "
|
||||
+ (increase ? "increase" : "decrease")
|
||||
+ " request, with container-id=" + containerId;
|
||||
throw new InvalidResourceRequestException(msg);
|
||||
}
|
||||
|
||||
Resource targetResource = Resources.normalize(rc, request.getCapability(),
|
||||
scheduler.getMinimumResourceCapability(),
|
||||
scheduler.getMaximumResourceCapability(),
|
||||
scheduler.getMinimumResourceCapability());
|
||||
RMContainer rmContainer = request.getRMContainer();
|
||||
Resource targetResource = request.getTargetCapacity();
|
||||
|
||||
// Compare targetResource and original resource
|
||||
Resource originalResource = rmContainer.getAllocatedResource();
|
||||
|
@ -182,9 +165,9 @@ public class RMServerUtils {
|
|||
}
|
||||
}
|
||||
|
||||
RMNode rmNode = rmContext.getRMNodes().get(rmContainer.getAllocatedNode());
|
||||
|
||||
// Target resource of the increase request is more than NM can offer
|
||||
ResourceScheduler scheduler = rmContext.getScheduler();
|
||||
RMNode rmNode = request.getSchedulerNode().getRMNode();
|
||||
if (!Resources.fitsIn(scheduler.getResourceCalculator(),
|
||||
scheduler.getClusterResource(), targetResource,
|
||||
rmNode.getTotalCapability())) {
|
||||
|
@ -193,9 +176,6 @@ public class RMServerUtils {
|
|||
+ rmNode.getTotalCapability();
|
||||
throw new InvalidResourceRequestException(msg);
|
||||
}
|
||||
|
||||
// Update normalized target resource
|
||||
request.setCapability(targetResource);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -254,6 +234,7 @@ public class RMServerUtils {
|
|||
}
|
||||
}
|
||||
|
||||
// Sanity check and normalize target resource
|
||||
private static void validateIncreaseDecreaseRequest(RMContext rmContext,
|
||||
List<ContainerResourceChangeRequest> requests, Resource maximumAllocation,
|
||||
boolean increase)
|
||||
|
@ -283,8 +264,23 @@ public class RMServerUtils {
|
|||
+ request.getCapability().getVirtualCores() + ", maxVirtualCores="
|
||||
+ maximumAllocation.getVirtualCores());
|
||||
}
|
||||
|
||||
checkAndNormalizeContainerChangeRequest(rmContext, request, increase);
|
||||
ContainerId containerId = request.getContainerId();
|
||||
ResourceScheduler scheduler = rmContext.getScheduler();
|
||||
RMContainer rmContainer = scheduler.getRMContainer(containerId);
|
||||
if (null == rmContainer) {
|
||||
String msg =
|
||||
"Failed to get rmContainer for "
|
||||
+ (increase ? "increase" : "decrease")
|
||||
+ " request, with container-id=" + containerId;
|
||||
throw new InvalidResourceRequestException(msg);
|
||||
}
|
||||
ResourceCalculator rc = scheduler.getResourceCalculator();
|
||||
Resource targetResource = Resources.normalize(rc, request.getCapability(),
|
||||
scheduler.getMinimumResourceCapability(),
|
||||
scheduler.getMaximumResourceCapability(),
|
||||
scheduler.getMinimumResourceCapability());
|
||||
// Update normalized target resource
|
||||
request.setCapability(targetResource);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -22,7 +22,6 @@ import java.io.IOException;
|
|||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.EnumSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
@ -55,13 +54,13 @@ import org.apache.hadoop.yarn.api.records.Resource;
|
|||
import org.apache.hadoop.yarn.api.records.ResourceOption;
|
||||
import org.apache.hadoop.yarn.api.records.ResourceRequest;
|
||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||
import org.apache.hadoop.yarn.exceptions.InvalidResourceRequestException;
|
||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||
import org.apache.hadoop.yarn.proto.YarnServiceProtos.SchedulerResourceTypes;
|
||||
import org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger.AuditConstants;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.RMServerUtils;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.ResourceManager;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppEvent;
|
||||
|
@ -74,6 +73,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEven
|
|||
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerFinishedEvent;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerImpl;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerRecoverEvent;
|
||||
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeCleanContainerEvent;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeDecreaseContainerEvent;
|
||||
|
@ -605,26 +605,18 @@ public abstract class AbstractYarnScheduler
|
|||
}
|
||||
|
||||
protected void decreaseContainers(
|
||||
List<SchedContainerChangeRequest> decreaseRequests,
|
||||
List<ContainerResourceChangeRequest> decreaseRequests,
|
||||
SchedulerApplicationAttempt attempt) {
|
||||
for (SchedContainerChangeRequest request : decreaseRequests) {
|
||||
if (null == decreaseRequests || decreaseRequests.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
// Pre-process decrease requests
|
||||
List<SchedContainerChangeRequest> schedDecreaseRequests =
|
||||
createSchedContainerChangeRequests(decreaseRequests, false);
|
||||
for (SchedContainerChangeRequest request : schedDecreaseRequests) {
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.debug("Processing decrease request:" + request);
|
||||
}
|
||||
|
||||
boolean hasIncreaseRequest =
|
||||
attempt.removeIncreaseRequest(request.getNodeId(),
|
||||
request.getPriority(), request.getContainerId());
|
||||
|
||||
if (hasIncreaseRequest) {
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.debug("While processing decrease request, found a increase request "
|
||||
+ "for the same container "
|
||||
+ request.getContainerId()
|
||||
+ ", removed the increase request");
|
||||
}
|
||||
}
|
||||
|
||||
// handle decrease request
|
||||
decreaseContainer(request, attempt);
|
||||
}
|
||||
|
@ -862,7 +854,7 @@ public abstract class AbstractYarnScheduler
|
|||
}
|
||||
|
||||
/**
|
||||
* Normalize container increase/decrease request, and return
|
||||
* Sanity check increase/decrease request, and return
|
||||
* SchedulerContainerResourceChangeRequest according to given
|
||||
* ContainerResourceChangeRequest.
|
||||
*
|
||||
|
@ -871,37 +863,34 @@ public abstract class AbstractYarnScheduler
|
|||
* - Throw exception when any other error happens
|
||||
* </pre>
|
||||
*/
|
||||
private SchedContainerChangeRequest
|
||||
checkAndNormalizeContainerChangeRequest(
|
||||
private SchedContainerChangeRequest createSchedContainerChangeRequest(
|
||||
ContainerResourceChangeRequest request, boolean increase)
|
||||
throws YarnException {
|
||||
// We have done a check in ApplicationMasterService, but RMContainer status
|
||||
// / Node resource could change since AMS won't acquire lock of scheduler.
|
||||
RMServerUtils.checkAndNormalizeContainerChangeRequest(rmContext, request,
|
||||
increase);
|
||||
ContainerId containerId = request.getContainerId();
|
||||
RMContainer rmContainer = getRMContainer(containerId);
|
||||
if (null == rmContainer) {
|
||||
String msg =
|
||||
"Failed to get rmContainer for "
|
||||
+ (increase ? "increase" : "decrease")
|
||||
+ " request, with container-id=" + containerId;
|
||||
throw new InvalidResourceRequestException(msg);
|
||||
}
|
||||
SchedulerNode schedulerNode =
|
||||
getSchedulerNode(rmContainer.getAllocatedNode());
|
||||
|
||||
return new SchedContainerChangeRequest(schedulerNode, rmContainer,
|
||||
request.getCapability());
|
||||
return new SchedContainerChangeRequest(
|
||||
this.rmContext, schedulerNode, rmContainer, request.getCapability());
|
||||
}
|
||||
|
||||
protected List<SchedContainerChangeRequest>
|
||||
checkAndNormalizeContainerChangeRequests(
|
||||
createSchedContainerChangeRequests(
|
||||
List<ContainerResourceChangeRequest> changeRequests,
|
||||
boolean increase) {
|
||||
if (null == changeRequests || changeRequests.isEmpty()) {
|
||||
return Collections.EMPTY_LIST;
|
||||
}
|
||||
|
||||
List<SchedContainerChangeRequest> schedulerChangeRequests =
|
||||
new ArrayList<SchedContainerChangeRequest>();
|
||||
for (ContainerResourceChangeRequest r : changeRequests) {
|
||||
SchedContainerChangeRequest sr = null;
|
||||
try {
|
||||
sr = checkAndNormalizeContainerChangeRequest(r, increase);
|
||||
sr = createSchedContainerChangeRequest(r, increase);
|
||||
} catch (YarnException e) {
|
||||
LOG.warn("Error happens when checking increase request, Ignoring.."
|
||||
+ " exception=", e);
|
||||
|
@ -909,7 +898,6 @@ public abstract class AbstractYarnScheduler
|
|||
}
|
||||
schedulerChangeRequests.add(sr);
|
||||
}
|
||||
|
||||
return schedulerChangeRequests;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -43,6 +43,8 @@ import org.apache.hadoop.yarn.api.records.NodeId;
|
|||
import org.apache.hadoop.yarn.api.records.Priority;
|
||||
import org.apache.hadoop.yarn.api.records.Resource;
|
||||
import org.apache.hadoop.yarn.api.records.ResourceRequest;
|
||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.RMServerUtils;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerState;
|
||||
import org.apache.hadoop.yarn.util.resource.Resources;
|
||||
|
@ -147,6 +149,18 @@ public class AppSchedulingInfo {
|
|||
boolean resourceUpdated = false;
|
||||
|
||||
for (SchedContainerChangeRequest r : increaseRequests) {
|
||||
if (r.getRMContainer().getState() != RMContainerState.RUNNING) {
|
||||
LOG.warn("rmContainer's state is not RUNNING, for increase request with"
|
||||
+ " container-id=" + r.getContainerId());
|
||||
continue;
|
||||
}
|
||||
try {
|
||||
RMServerUtils.checkSchedContainerChangeRequest(r, true);
|
||||
} catch (YarnException e) {
|
||||
LOG.warn("Error happens when checking increase request, Ignoring.."
|
||||
+ " exception=", e);
|
||||
continue;
|
||||
}
|
||||
NodeId nodeId = r.getRMContainer().getAllocatedNode();
|
||||
|
||||
Map<Priority, Map<ContainerId, SchedContainerChangeRequest>> requestsOnNode =
|
||||
|
@ -220,7 +234,7 @@ public class AppSchedulingInfo {
|
|||
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.debug("Added increase request:" + request.getContainerId()
|
||||
+ " delta=" + request.getDeltaCapacity());
|
||||
+ " delta=" + delta);
|
||||
}
|
||||
|
||||
// update priorities
|
||||
|
@ -509,24 +523,20 @@ public class AppSchedulingInfo {
|
|||
NodeId nodeId = increaseRequest.getNodeId();
|
||||
Priority priority = increaseRequest.getPriority();
|
||||
ContainerId containerId = increaseRequest.getContainerId();
|
||||
Resource deltaCapacity = increaseRequest.getDeltaCapacity();
|
||||
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.debug("allocated increase request : applicationId=" + applicationId
|
||||
+ " container=" + containerId + " host="
|
||||
+ increaseRequest.getNodeId() + " user=" + user + " resource="
|
||||
+ increaseRequest.getDeltaCapacity());
|
||||
+ deltaCapacity);
|
||||
}
|
||||
|
||||
// Set queue metrics
|
||||
queue.getMetrics().allocateResources(user,
|
||||
increaseRequest.getDeltaCapacity());
|
||||
|
||||
queue.getMetrics().allocateResources(user, deltaCapacity);
|
||||
// remove the increase request from pending increase request map
|
||||
removeIncreaseRequest(nodeId, priority, containerId);
|
||||
|
||||
// update usage
|
||||
appResourceUsage.incUsed(increaseRequest.getNodePartition(),
|
||||
increaseRequest.getDeltaCapacity());
|
||||
appResourceUsage.incUsed(increaseRequest.getNodePartition(), deltaCapacity);
|
||||
}
|
||||
|
||||
public synchronized void decreaseContainer(
|
||||
|
|
|
@ -22,6 +22,7 @@ import org.apache.hadoop.yarn.api.records.ContainerId;
|
|||
import org.apache.hadoop.yarn.api.records.NodeId;
|
||||
import org.apache.hadoop.yarn.api.records.Priority;
|
||||
import org.apache.hadoop.yarn.api.records.Resource;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
|
||||
import org.apache.hadoop.yarn.util.resource.Resources;
|
||||
|
||||
|
@ -32,18 +33,19 @@ import org.apache.hadoop.yarn.util.resource.Resources;
|
|||
*/
|
||||
public class SchedContainerChangeRequest implements
|
||||
Comparable<SchedContainerChangeRequest> {
|
||||
RMContainer rmContainer;
|
||||
Resource targetCapacity;
|
||||
SchedulerNode schedulerNode;
|
||||
Resource deltaCapacity;
|
||||
private RMContext rmContext;
|
||||
private RMContainer rmContainer;
|
||||
private Resource targetCapacity;
|
||||
private SchedulerNode schedulerNode;
|
||||
private Resource deltaCapacity;
|
||||
|
||||
public SchedContainerChangeRequest(SchedulerNode schedulerNode,
|
||||
public SchedContainerChangeRequest(
|
||||
RMContext rmContext, SchedulerNode schedulerNode,
|
||||
RMContainer rmContainer, Resource targetCapacity) {
|
||||
this.rmContext = rmContext;
|
||||
this.rmContainer = rmContainer;
|
||||
this.targetCapacity = targetCapacity;
|
||||
this.schedulerNode = schedulerNode;
|
||||
deltaCapacity = Resources.subtract(targetCapacity,
|
||||
rmContainer.getAllocatedResource());
|
||||
}
|
||||
|
||||
public NodeId getNodeId() {
|
||||
|
@ -58,11 +60,19 @@ public class SchedContainerChangeRequest implements
|
|||
return this.targetCapacity;
|
||||
}
|
||||
|
||||
public RMContext getRmContext() {
|
||||
return this.rmContext;
|
||||
}
|
||||
/**
|
||||
* Delta capacity = before - target, so if it is a decrease request, delta
|
||||
* Delta capacity = target - before, so if it is a decrease request, delta
|
||||
* capacity will be negative
|
||||
*/
|
||||
public Resource getDeltaCapacity() {
|
||||
public synchronized Resource getDeltaCapacity() {
|
||||
// Only calculate deltaCapacity once
|
||||
if (deltaCapacity == null) {
|
||||
deltaCapacity = Resources.subtract(
|
||||
targetCapacity, rmContainer.getAllocatedResource());
|
||||
}
|
||||
return deltaCapacity;
|
||||
}
|
||||
|
||||
|
@ -112,7 +122,6 @@ public class SchedContainerChangeRequest implements
|
|||
@Override
|
||||
public String toString() {
|
||||
return "<container=" + getContainerId() + ", targetCapacity="
|
||||
+ targetCapacity + ", delta=" + deltaCapacity + ", node="
|
||||
+ getNodeId().toString() + ">";
|
||||
+ targetCapacity + ", node=" + getNodeId().toString() + ">";
|
||||
}
|
||||
}
|
||||
|
|
|
@ -33,6 +33,8 @@ import org.apache.hadoop.yarn.api.records.ContainerStatus;
|
|||
import org.apache.hadoop.yarn.api.records.QueueACL;
|
||||
import org.apache.hadoop.yarn.api.records.QueueState;
|
||||
import org.apache.hadoop.yarn.api.records.Resource;
|
||||
import org.apache.hadoop.yarn.exceptions.InvalidResourceRequestException;
|
||||
import org.apache.hadoop.yarn.security.PrivilegedEntity;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEventType;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ActiveUsersManager;
|
||||
|
@ -329,7 +331,7 @@ extends org.apache.hadoop.yarn.server.resourcemanager.scheduler.Queue {
|
|||
*/
|
||||
public void decreaseContainer(Resource clusterResource,
|
||||
SchedContainerChangeRequest decreaseRequest,
|
||||
FiCaSchedulerApp app);
|
||||
FiCaSchedulerApp app) throws InvalidResourceRequestException;
|
||||
|
||||
/**
|
||||
* Get valid Node Labels for this queue
|
||||
|
|
|
@ -21,8 +21,8 @@ package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity;
|
|||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.EnumSet;
|
||||
import java.util.HashMap;
|
||||
|
@ -65,6 +65,7 @@ import org.apache.hadoop.yarn.api.records.Resource;
|
|||
import org.apache.hadoop.yarn.api.records.ResourceOption;
|
||||
import org.apache.hadoop.yarn.api.records.ResourceRequest;
|
||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||
import org.apache.hadoop.yarn.exceptions.InvalidResourceRequestException;
|
||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
|
||||
import org.apache.hadoop.yarn.proto.YarnServiceProtos.SchedulerResourceTypes;
|
||||
|
@ -892,9 +893,36 @@ public class CapacityScheduler extends
|
|||
}
|
||||
}
|
||||
|
||||
// It is crucial to acquire leaf queue lock first to prevent:
|
||||
// 1. Race condition when calculating the delta resource in
|
||||
// SchedContainerChangeRequest
|
||||
// 2. Deadlock with the scheduling thread.
|
||||
private LeafQueue updateIncreaseRequests(
|
||||
List<ContainerResourceChangeRequest> increaseRequests,
|
||||
FiCaSchedulerApp app) {
|
||||
if (null == increaseRequests || increaseRequests.isEmpty()) {
|
||||
return null;
|
||||
}
|
||||
// Pre-process increase requests
|
||||
List<SchedContainerChangeRequest> schedIncreaseRequests =
|
||||
createSchedContainerChangeRequests(increaseRequests, true);
|
||||
LeafQueue leafQueue = (LeafQueue) app.getQueue();
|
||||
synchronized(leafQueue) {
|
||||
// make sure we aren't stopping/removing the application
|
||||
// when the allocate comes in
|
||||
if (app.isStopped()) {
|
||||
return null;
|
||||
}
|
||||
// Process increase resource requests
|
||||
if (app.updateIncreaseRequests(schedIncreaseRequests)) {
|
||||
return leafQueue;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
// Note: when AM asks to decrease container or release container, we will
|
||||
// acquire scheduler lock
|
||||
// Note: when AM asks to release container, we will acquire scheduler lock
|
||||
@Lock(Lock.NoLock.class)
|
||||
public Allocation allocate(ApplicationAttemptId applicationAttemptId,
|
||||
List<ResourceRequest> ask, List<ContainerId> release,
|
||||
|
@ -907,26 +935,23 @@ public class CapacityScheduler extends
|
|||
return EMPTY_ALLOCATION;
|
||||
}
|
||||
|
||||
// Sanity check
|
||||
// Release containers
|
||||
releaseContainers(release, application);
|
||||
|
||||
// update increase requests
|
||||
LeafQueue updateDemandForQueue =
|
||||
updateIncreaseRequests(increaseRequests, application);
|
||||
|
||||
// Decrease containers
|
||||
decreaseContainers(decreaseRequests, application);
|
||||
|
||||
// Sanity check for new allocation requests
|
||||
SchedulerUtils.normalizeRequests(
|
||||
ask, getResourceCalculator(), getClusterResource(),
|
||||
getMinimumResourceCapability(), getMaximumResourceCapability());
|
||||
|
||||
// Pre-process increase requests
|
||||
List<SchedContainerChangeRequest> normalizedIncreaseRequests =
|
||||
checkAndNormalizeContainerChangeRequests(increaseRequests, true);
|
||||
|
||||
// Pre-process decrease requests
|
||||
List<SchedContainerChangeRequest> normalizedDecreaseRequests =
|
||||
checkAndNormalizeContainerChangeRequests(decreaseRequests, false);
|
||||
|
||||
// Release containers
|
||||
releaseContainers(release, application);
|
||||
|
||||
Allocation allocation;
|
||||
|
||||
LeafQueue updateDemandForQueue = null;
|
||||
|
||||
synchronized (application) {
|
||||
|
||||
// make sure we aren't stopping/removing the application
|
||||
|
@ -944,7 +969,8 @@ public class CapacityScheduler extends
|
|||
}
|
||||
|
||||
// Update application requests
|
||||
if (application.updateResourceRequests(ask)) {
|
||||
if (application.updateResourceRequests(ask)
|
||||
&& (updateDemandForQueue == null)) {
|
||||
updateDemandForQueue = (LeafQueue) application.getQueue();
|
||||
}
|
||||
|
||||
|
@ -954,12 +980,6 @@ public class CapacityScheduler extends
|
|||
}
|
||||
}
|
||||
|
||||
// Process increase resource requests
|
||||
if (application.updateIncreaseRequests(normalizedIncreaseRequests)
|
||||
&& (updateDemandForQueue == null)) {
|
||||
updateDemandForQueue = (LeafQueue) application.getQueue();
|
||||
}
|
||||
|
||||
if (application.isWaitingForAMContainer()) {
|
||||
// Allocate is for AM and update AM blacklist for this
|
||||
application.updateAMBlacklist(
|
||||
|
@ -968,8 +988,6 @@ public class CapacityScheduler extends
|
|||
application.updateBlacklist(blacklistAdditions, blacklistRemovals);
|
||||
}
|
||||
|
||||
// Decrease containers
|
||||
decreaseContainers(normalizedDecreaseRequests, application);
|
||||
|
||||
allocation = application.getAllocation(getResourceCalculator(),
|
||||
clusterResource, getMinimumResourceCapability());
|
||||
|
@ -1164,7 +1182,8 @@ public class CapacityScheduler extends
|
|||
.getAssignmentInformation().getReserved());
|
||||
}
|
||||
|
||||
private synchronized void allocateContainersToNode(FiCaSchedulerNode node) {
|
||||
@VisibleForTesting
|
||||
protected synchronized void allocateContainersToNode(FiCaSchedulerNode node) {
|
||||
if (rmContext.isWorkPreservingRecoveryEnabled()
|
||||
&& !rmContext.isSchedulerReadyForAllocatingContainers()) {
|
||||
return;
|
||||
|
@ -1514,48 +1533,30 @@ public class CapacityScheduler extends
|
|||
}
|
||||
}
|
||||
|
||||
@Lock(CapacityScheduler.class)
|
||||
@Override
|
||||
protected synchronized void decreaseContainer(
|
||||
SchedContainerChangeRequest decreaseRequest,
|
||||
protected void decreaseContainer(SchedContainerChangeRequest decreaseRequest,
|
||||
SchedulerApplicationAttempt attempt) {
|
||||
RMContainer rmContainer = decreaseRequest.getRMContainer();
|
||||
|
||||
// Check container status before doing decrease
|
||||
if (rmContainer.getState() != RMContainerState.RUNNING) {
|
||||
LOG.info("Trying to decrease a container not in RUNNING state, container="
|
||||
+ rmContainer + " state=" + rmContainer.getState().name());
|
||||
return;
|
||||
}
|
||||
|
||||
// Delta capacity of this decrease request is 0, this decrease request may
|
||||
// just to cancel increase request
|
||||
if (Resources.equals(decreaseRequest.getDeltaCapacity(), Resources.none())) {
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.debug("Decrease target resource equals to existing resource for container:"
|
||||
+ decreaseRequest.getContainerId()
|
||||
+ " ignore this decrease request.");
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// Save resource before decrease
|
||||
Resource resourceBeforeDecrease =
|
||||
Resources.clone(rmContainer.getContainer().getResource());
|
||||
|
||||
FiCaSchedulerApp app = (FiCaSchedulerApp)attempt;
|
||||
LeafQueue queue = (LeafQueue) attempt.getQueue();
|
||||
try {
|
||||
queue.decreaseContainer(clusterResource, decreaseRequest, app);
|
||||
|
||||
// Notify RMNode the container will be decreased
|
||||
// Notify RMNode that the container can be pulled by NodeManager in the
|
||||
// next heartbeat
|
||||
this.rmContext.getDispatcher().getEventHandler()
|
||||
.handle(new RMNodeDecreaseContainerEvent(decreaseRequest.getNodeId(),
|
||||
Arrays.asList(rmContainer.getContainer())));
|
||||
|
||||
LOG.info("Application attempt " + app.getApplicationAttemptId()
|
||||
+ " decreased container:" + decreaseRequest.getContainerId() + " from "
|
||||
+ resourceBeforeDecrease + " to "
|
||||
+ decreaseRequest.getTargetCapacity());
|
||||
.handle(new RMNodeDecreaseContainerEvent(
|
||||
decreaseRequest.getNodeId(),
|
||||
Collections.singletonList(rmContainer.getContainer())));
|
||||
} catch (InvalidResourceRequestException e) {
|
||||
LOG.warn("Error happens when checking decrease request, Ignoring.."
|
||||
+ " exception=", e);
|
||||
}
|
||||
}
|
||||
|
||||
@Lock(Lock.NoLock.class)
|
||||
|
|
|
@ -47,10 +47,12 @@ import org.apache.hadoop.yarn.api.records.QueueInfo;
|
|||
import org.apache.hadoop.yarn.api.records.QueueState;
|
||||
import org.apache.hadoop.yarn.api.records.QueueUserACLInfo;
|
||||
import org.apache.hadoop.yarn.api.records.Resource;
|
||||
import org.apache.hadoop.yarn.exceptions.InvalidResourceRequestException;
|
||||
import org.apache.hadoop.yarn.factories.RecordFactory;
|
||||
import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
|
||||
import org.apache.hadoop.yarn.nodelabels.CommonNodeLabelsManager;
|
||||
import org.apache.hadoop.yarn.security.AccessType;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.RMServerUtils;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEventType;
|
||||
|
@ -1659,10 +1661,16 @@ public class LeafQueue extends AbstractCSQueue {
|
|||
return defaultAppPriorityPerQueue;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param clusterResource Total cluster resource
|
||||
* @param decreaseRequest The decrease request
|
||||
* @param app The application of interest
|
||||
*/
|
||||
@Override
|
||||
public void decreaseContainer(Resource clusterResource,
|
||||
SchedContainerChangeRequest decreaseRequest,
|
||||
FiCaSchedulerApp app) {
|
||||
FiCaSchedulerApp app) throws InvalidResourceRequestException {
|
||||
// If the container being decreased is reserved, we need to unreserve it
|
||||
// first.
|
||||
RMContainer rmContainer = decreaseRequest.getRMContainer();
|
||||
|
@ -1670,25 +1678,62 @@ public class LeafQueue extends AbstractCSQueue {
|
|||
unreserveIncreasedContainer(clusterResource, app,
|
||||
(FiCaSchedulerNode)decreaseRequest.getSchedulerNode(), rmContainer);
|
||||
}
|
||||
|
||||
boolean resourceDecreased = false;
|
||||
Resource resourceBeforeDecrease;
|
||||
// Grab queue lock to avoid race condition when getting container resource
|
||||
synchronized (this) {
|
||||
// Make sure the decrease request is valid in terms of current resource
|
||||
// and target resource. This must be done under the leaf queue lock.
|
||||
// Throws exception if the check fails.
|
||||
RMServerUtils.checkSchedContainerChangeRequest(decreaseRequest, false);
|
||||
// Save resource before decrease for debug log
|
||||
resourceBeforeDecrease =
|
||||
Resources.clone(rmContainer.getAllocatedResource());
|
||||
// Do we have increase request for the same container? If so, remove it
|
||||
boolean hasIncreaseRequest =
|
||||
app.removeIncreaseRequest(decreaseRequest.getNodeId(),
|
||||
decreaseRequest.getPriority(), decreaseRequest.getContainerId());
|
||||
if (hasIncreaseRequest) {
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.debug("While processing decrease requests, found an increase"
|
||||
+ " request for the same container "
|
||||
+ decreaseRequest.getContainerId()
|
||||
+ ", removed the increase request");
|
||||
}
|
||||
}
|
||||
// Delta capacity is negative when it's a decrease request
|
||||
Resource absDelta = Resources.negate(decreaseRequest.getDeltaCapacity());
|
||||
|
||||
synchronized (this) {
|
||||
// Delta is negative when it's a decrease request
|
||||
if (Resources.equals(absDelta, Resources.none())) {
|
||||
// If delta capacity of this decrease request is 0, this decrease
|
||||
// request serves the purpose of cancelling an existing increase request
|
||||
// if any
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.debug("Decrease target resource equals to existing resource for"
|
||||
+ " container:" + decreaseRequest.getContainerId()
|
||||
+ " ignore this decrease request.");
|
||||
}
|
||||
} else {
|
||||
// Release the delta resource
|
||||
releaseResource(clusterResource, app, absDelta,
|
||||
decreaseRequest.getNodePartition(), decreaseRequest.getRMContainer(),
|
||||
decreaseRequest.getNodePartition(),
|
||||
decreaseRequest.getRMContainer(),
|
||||
true);
|
||||
// Notify application
|
||||
app.decreaseContainer(decreaseRequest);
|
||||
// Notify node
|
||||
decreaseRequest.getSchedulerNode()
|
||||
.decreaseContainer(decreaseRequest.getContainerId(), absDelta);
|
||||
resourceDecreased = true;
|
||||
}
|
||||
}
|
||||
|
||||
// Notify parent
|
||||
if (getParent() != null) {
|
||||
if (resourceDecreased) {
|
||||
// Notify parent queue outside of leaf queue lock
|
||||
getParent().decreaseContainer(clusterResource, decreaseRequest, app);
|
||||
LOG.info("Application attempt " + app.getApplicationAttemptId()
|
||||
+ " decreased container:" + decreaseRequest.getContainerId()
|
||||
+ " from " + resourceBeforeDecrease + " to "
|
||||
+ decreaseRequest.getTargetCapacity());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -46,6 +46,7 @@ import org.apache.hadoop.yarn.api.records.QueueInfo;
|
|||
import org.apache.hadoop.yarn.api.records.QueueState;
|
||||
import org.apache.hadoop.yarn.api.records.QueueUserACLInfo;
|
||||
import org.apache.hadoop.yarn.api.records.Resource;
|
||||
import org.apache.hadoop.yarn.exceptions.InvalidResourceRequestException;
|
||||
import org.apache.hadoop.yarn.factories.RecordFactory;
|
||||
import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
|
||||
import org.apache.hadoop.yarn.security.AccessType;
|
||||
|
@ -656,7 +657,8 @@ public class ParentQueue extends AbstractCSQueue {
|
|||
|
||||
@Override
|
||||
public void decreaseContainer(Resource clusterResource,
|
||||
SchedContainerChangeRequest decreaseRequest, FiCaSchedulerApp app) {
|
||||
SchedContainerChangeRequest decreaseRequest, FiCaSchedulerApp app)
|
||||
throws InvalidResourceRequestException {
|
||||
// delta capacity is negative when it's a decrease request
|
||||
Resource absDeltaCapacity =
|
||||
Resources.negate(decreaseRequest.getDeltaCapacity());
|
||||
|
|
|
@ -21,8 +21,11 @@ package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity;
|
|||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse;
|
||||
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
|
||||
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
||||
|
@ -47,8 +50,12 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerStat
|
|||
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeImpl;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler
|
||||
.SchedulerApplicationAttempt;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica
|
||||
.FiCaSchedulerNode;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAttemptRemovedSchedulerEvent;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeUpdateSchedulerEvent;
|
||||
import org.apache.hadoop.yarn.util.resource.Resources;
|
||||
|
@ -57,12 +64,48 @@ import org.junit.Before;
|
|||
import org.junit.Test;
|
||||
|
||||
public class TestContainerResizing {
|
||||
private static final Log LOG = LogFactory.getLog(TestContainerResizing.class);
|
||||
private final int GB = 1024;
|
||||
|
||||
private YarnConfiguration conf;
|
||||
|
||||
RMNodeLabelsManager mgr;
|
||||
|
||||
class MyScheduler extends CapacityScheduler {
|
||||
/*
|
||||
* A Mock Scheduler to simulate the potential effect of deadlock between:
|
||||
* 1. The AbstractYarnScheduler.decreaseContainers() call (from
|
||||
* ApplicationMasterService thread)
|
||||
* 2. The CapacityScheduler.allocateContainersToNode() call (from the
|
||||
* scheduler thread)
|
||||
*/
|
||||
MyScheduler() {
|
||||
super();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void decreaseContainers(
|
||||
List<ContainerResourceChangeRequest> decreaseRequests,
|
||||
SchedulerApplicationAttempt attempt) {
|
||||
try {
|
||||
Thread.sleep(1000);
|
||||
} catch(InterruptedException e) {
|
||||
LOG.debug("Thread interrupted.");
|
||||
}
|
||||
super.decreaseContainers(decreaseRequests, attempt);
|
||||
}
|
||||
|
||||
@Override
|
||||
public synchronized void allocateContainersToNode(FiCaSchedulerNode node) {
|
||||
try {
|
||||
Thread.sleep(1000);
|
||||
} catch(InterruptedException e) {
|
||||
LOG.debug("Thread interrupted.");
|
||||
}
|
||||
super.allocateContainersToNode(node);
|
||||
}
|
||||
}
|
||||
|
||||
@Before
|
||||
public void setUp() throws Exception {
|
||||
conf = new YarnConfiguration();
|
||||
|
@ -958,6 +1001,50 @@ public class TestContainerResizing {
|
|||
rm1.close();
|
||||
}
|
||||
|
||||
@Test (timeout = 60000)
|
||||
public void testDecreaseContainerWillNotDeadlockContainerAllocation()
|
||||
throws Exception {
|
||||
// create and start MockRM with our MyScheduler
|
||||
MockRM rm = new MockRM() {
|
||||
@Override
|
||||
public ResourceScheduler createScheduler() {
|
||||
CapacityScheduler cs = new MyScheduler();
|
||||
cs.setConf(conf);
|
||||
return cs;
|
||||
}
|
||||
};
|
||||
rm.start();
|
||||
// register a node
|
||||
MockNM nm = rm.registerNode("h1:1234", 20 * GB);
|
||||
// submit an application -> app1
|
||||
RMApp app1 = rm.submitApp(3 * GB, "app", "user", null, "default");
|
||||
MockAM am1 = MockRM.launchAndRegisterAM(app1, rm, nm);
|
||||
// making sure resource is allocated
|
||||
checkUsedResource(rm, "default", 3 * GB, null);
|
||||
FiCaSchedulerApp app = getFiCaSchedulerApp(rm, app1.getApplicationId());
|
||||
Assert.assertEquals(3 * GB,
|
||||
app.getAppAttemptResourceUsage().getUsed().getMemory());
|
||||
// making sure container is launched
|
||||
ContainerId containerId1 =
|
||||
ContainerId.newContainerId(am1.getApplicationAttemptId(), 1);
|
||||
sentRMContainerLaunched(rm, containerId1);
|
||||
// submit allocation request for a new container
|
||||
am1.allocate(Collections.singletonList(ResourceRequest.newInstance(
|
||||
Priority.newInstance(1), "*", Resources.createResource(2 * GB), 1)),
|
||||
null);
|
||||
// nm reports status update and triggers container allocation
|
||||
nm.nodeHeartbeat(true);
|
||||
// *In the mean time*, am1 asks to decrease its AM container resource from
|
||||
// 3GB to 1GB
|
||||
AllocateResponse response = am1.sendContainerResizingRequest(null,
|
||||
Collections.singletonList(ContainerResourceChangeRequest
|
||||
.newInstance(containerId1, Resources.createResource(GB))));
|
||||
// verify that the containe resource is decreased
|
||||
verifyContainerDecreased(response, containerId1, GB);
|
||||
|
||||
rm.close();
|
||||
}
|
||||
|
||||
private void checkPendingResource(MockRM rm, String queueName, int memory,
|
||||
String label) {
|
||||
CapacityScheduler cs = (CapacityScheduler) rm.getResourceScheduler();
|
||||
|
|
Loading…
Reference in New Issue