YARN-3655. FairScheduler: potential livelock due to maxAMShare limitation and container reservation. (Zhihai Xu via kasha)

This commit is contained in:
Karthik Kambatla 2015-06-07 11:37:52 -07:00
parent b61b489995
commit bd69ea408f
5 changed files with 379 additions and 93 deletions

View File

@ -498,6 +498,9 @@ Release 2.8.0 - UNRELEASED
YARN-3766. Fixed the apps table column error of generic history web UI.
(Xuan Gong via zjshen)
YARN-3655. FairScheduler: potential livelock due to maxAMShare limitation
and container reservation. (Zhihai Xu via kasha)
Release 2.7.1 - UNRELEASED
INCOMPATIBLE CHANGES

View File

@ -541,9 +541,6 @@ public class FSAppAttempt extends SchedulerApplicationAttempt
}
return container.getResource();
} else {
if (!FairScheduler.fitsInMaxShare(getQueue(), capability)) {
return Resources.none();
}
// The desired container won't fit here, so reserve
@ -551,28 +548,29 @@ public class FSAppAttempt extends SchedulerApplicationAttempt
return FairScheduler.CONTAINER_RESERVED;
}
}
/**
 * Reports whether more than one resource request is registered at the
 * given priority.
 * NOTE(review): presumably a lone entry is the ANY request, so additional
 * entries imply node- or rack-local asks -- confirm against callers.
 *
 * @param priority priority whose resource requests are counted
 * @return true if more than one request exists at that priority
 */
private boolean hasNodeOrRackLocalRequests(Priority priority) {
  int requestCount = getResourceRequests(priority).size();
  return requestCount > 1;
}
private Resource assignContainer(FSSchedulerNode node, boolean reserved) {
if (LOG.isDebugEnabled()) {
LOG.debug("Node offered to app: " + getName() + " reserved: " + reserved);
}
/**
 * Tells whether starting this app's AM container would exceed the leaf
 * queue's maxAMShare limit.
 *
 * The check only applies while the AM is not yet running and the AM is not
 * unmanaged. In that case the app is considered over the limit when it has
 * no resource requests at all, or when the queue's {@code canRunAppAM}
 * rejects the capability of the first request.
 *
 * @return true if the app must not be scheduled because of the AM share
 *         limit, false otherwise
 */
private boolean isOverAMShareLimit() {
  // Nothing to check once the AM runs, or when the AM is unmanaged.
  if (isAmRunning() || getUnmanagedAM()) {
    return false;
  }
  List<ResourceRequest> pendingAsks = appSchedulingInfo.getAllResourceRequests();
  if (pendingAsks.isEmpty()) {
    return true;
  }
  // The first request's capability is used as the AM container size.
  Resource amCapability = pendingAsks.get(0).getCapability();
  return !getQueue().canRunAppAM(amCapability);
}
private Resource assignContainer(FSSchedulerNode node, boolean reserved) {
if (LOG.isDebugEnabled()) {
LOG.debug("Skipping allocation because maxAMShare limit would " +
"be exceeded");
}
return Resources.none();
}
LOG.debug("Node offered to app: " + getName() + " reserved: " + reserved);
}
Collection<Priority> prioritiesToTry = (reserved) ?
@ -584,8 +582,9 @@ public class FSAppAttempt extends SchedulerApplicationAttempt
// (not scheduled) in order to promote better locality.
synchronized (this) {
for (Priority priority : prioritiesToTry) {
if (getTotalRequiredResources(priority) <= 0 ||
!hasContainerForNode(priority, node)) {
// Skip it for reserved container, since
// we already check it in isValidReservation.
if (!reserved && !hasContainerForNode(priority, node)) {
continue;
}
@ -650,42 +649,11 @@ public class FSAppAttempt extends SchedulerApplicationAttempt
return Resources.none();
}
/**
 * Handles a node on which this application already holds a reservation.
 * Releases the reservation when nothing is required at the reserved
 * priority any more; otherwise tries to convert the reservation into an
 * allocation, provided the reserved resource currently fits on the node.
 *
 * @param node
 *     Node that the application has an existing reservation on
 * @return the result of the allocation attempt, or {@code Resources.none()}
 *     when the reservation was released or does not fit yet
 */
public Resource assignReservedContainer(FSSchedulerNode node) {
  RMContainer reservedContainer = node.getReservedContainer();
  Priority reservedPriority = reservedContainer.getReservedPriority();

  // Drop the reservation if nothing is needed at this priority any more.
  if (getTotalRequiredResources(reservedPriority) == 0) {
    unreserve(reservedPriority, node);
    return Resources.none();
  }

  // Bail out early when the reserved container cannot fit right now.
  // This relies on the assumption of a single container size per priority.
  Resource reservedResource =
      node.getReservedContainer().getReservedResource();
  boolean fitsNow =
      Resources.fitsIn(reservedResource, node.getAvailableResource());
  if (!fitsNow) {
    return Resources.none();
  }

  return assignContainer(node, true);
}
/**
* Whether this app has containers requests that could be satisfied on the
* given node, if the node had full space.
*/
public boolean hasContainerForNode(Priority prio, FSSchedulerNode node) {
private boolean hasContainerForNode(Priority prio, FSSchedulerNode node) {
ResourceRequest anyRequest = getResourceRequest(prio, ResourceRequest.ANY);
ResourceRequest rackRequest = getResourceRequest(prio, node.getRackName());
ResourceRequest nodeRequest = getResourceRequest(prio, node.getNodeName());
@ -703,9 +671,56 @@ public class FSAppAttempt extends SchedulerApplicationAttempt
(nodeRequest != null && nodeRequest.getNumContainers() > 0)) &&
// The requested container must be able to fit on the node:
Resources.lessThanOrEqual(RESOURCE_CALCULATOR, null,
anyRequest.getCapability(), node.getRMNode().getTotalCapability());
anyRequest.getCapability(),
node.getRMNode().getTotalCapability()) &&
// The requested container must fit in queue maximum share:
getQueue().fitsInMaxShare(anyRequest.getCapability());
}
/**
 * Checks whether the reservation this app holds on the given node is still
 * usable: the app must still have a container request satisfiable on the
 * node at the reserved priority, and it must not be over the AM share limit.
 *
 * @param node node carrying the reserved container
 * @return true if the reservation should be kept
 */
private boolean isValidReservation(FSSchedulerNode node) {
  Priority reservedPriority =
      node.getReservedContainer().getReservedPriority();
  if (!hasContainerForNode(reservedPriority, node)) {
    return false;
  }
  return !isOverAMShareLimit();
}
/**
 * Handles a node on which this application already holds a reservation.
 * An invalid reservation is released. A valid one is turned into an
 * allocation when the reserved resource currently fits into the node's
 * available resource; otherwise it is simply kept.
 *
 * @param node
 *     Node that the application has an existing reservation on
 * @return true if the reservation on the given node is valid (whether or
 *     not it could be fulfilled now), false if it was released
 */
public boolean assignReservedContainer(FSSchedulerNode node) {
  Priority reservedPriority =
      node.getReservedContainer().getReservedPriority();

  if (isValidReservation(node)) {
    // The reservation is still wanted; attempt to fulfill it.
    if (LOG.isDebugEnabled()) {
      LOG.debug("Trying to fulfill reservation for application "
          + getApplicationAttemptId() + " on node: " + node);
    }

    // Only attempt the allocation when the reserved container fits now.
    // This assumes a single container size per priority.
    Resource reservedResource =
        node.getReservedContainer().getReservedResource();
    if (Resources.fitsIn(reservedResource, node.getAvailableResource())) {
      assignContainer(node, true);
    }
    return true;
  }

  // The app can no longer use this reservation, so give it up.
  LOG.info("Releasing reservation that cannot be satisfied for " +
      "application " + getApplicationAttemptId() + " on node " + node);
  unreserve(reservedPriority, node);
  return false;
}
static class RMContainerComparator implements Comparator<RMContainer>,
Serializable {
@ -795,6 +810,13 @@ public class FSAppAttempt extends SchedulerApplicationAttempt
/**
 * Offers the given node to this application for a non-reserved allocation.
 * The offer is declined with {@code Resources.none()} while the app is over
 * the maxAMShare limit.
 *
 * @param node node being offered
 * @return the result of the allocation attempt, or {@code Resources.none()}
 *     when the AM share limit blocks scheduling
 */
@Override
public Resource assignContainer(FSSchedulerNode node) {
  if (!isOverAMShareLimit()) {
    return assignContainer(node, false);
  }

  if (LOG.isDebugEnabled()) {
    LOG.debug("Skipping allocation because maxAMShare limit would " +
        "be exceeded");
  }
  return Resources.none();
}

View File

@ -330,4 +330,19 @@ public abstract class FSQueue implements Queue, Schedulable {
@Override
public void decPendingResource(String nodeLabel, Resource resourceToDec) {
// No-op in this class: both arguments are ignored.
}
/**
 * Checks whether adding the given resource to the current usage stays
 * within the max share of this queue and of each of its ancestors, up to
 * the first queue that has no parent.
 *
 * @param additionalResource resource that would be added to the usage
 * @return true if this queue and all its ancestors can accommodate the
 *     additional resource, false as soon as one of them cannot
 */
public boolean fitsInMaxShare(Resource additionalResource) {
  Resource projectedUsage =
      Resources.add(getResourceUsage(), additionalResource);
  if (Resources.fitsIn(projectedUsage, getMaxShare())) {
    // This queue is fine; the answer now depends on the ancestors, if any.
    FSQueue ancestor = getParent();
    return ancestor == null || ancestor.fitsInMaxShare(additionalResource);
  }
  return false;
}
}

View File

@ -34,7 +34,6 @@ import org.apache.hadoop.yarn.api.records.Container;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.api.records.NodeId;
import org.apache.hadoop.yarn.api.records.Priority;
import org.apache.hadoop.yarn.api.records.QueueACL;
import org.apache.hadoop.yarn.api.records.QueueInfo;
import org.apache.hadoop.yarn.api.records.QueueUserACLInfo;
@ -1075,31 +1074,12 @@ public class FairScheduler extends
// 1. Check for reserved applications
// 2. Schedule if there are no reservations
boolean validReservation = false;
FSAppAttempt reservedAppSchedulable = node.getReservedAppSchedulable();
if (reservedAppSchedulable != null) {
Priority reservedPriority = node.getReservedContainer().getReservedPriority();
FSQueue queue = reservedAppSchedulable.getQueue();
if (!reservedAppSchedulable.hasContainerForNode(reservedPriority, node)
|| !fitsInMaxShare(queue,
node.getReservedContainer().getReservedResource())) {
// Don't hold the reservation if app can no longer use it
LOG.info("Releasing reservation that cannot be satisfied for application "
+ reservedAppSchedulable.getApplicationAttemptId()
+ " on node " + node);
reservedAppSchedulable.unreserve(reservedPriority, node);
reservedAppSchedulable = null;
} else {
// Reservation exists; try to fulfill the reservation
if (LOG.isDebugEnabled()) {
LOG.debug("Trying to fulfill reservation for application "
+ reservedAppSchedulable.getApplicationAttemptId()
+ " on node: " + node);
validReservation = reservedAppSchedulable.assignReservedContainer(node);
}
node.getReservedAppSchedulable().assignReservedContainer(node);
}
}
if (reservedAppSchedulable == null) {
if (!validReservation) {
// No reservation, schedule at queue which is farthest below fair share
int assignedContainers = 0;
while (node.getReservedContainer() == null) {
@ -1117,22 +1097,6 @@ public class FairScheduler extends
updateRootQueueMetrics();
}
/**
 * Checks whether adding the given resource to the usage of the given queue
 * stays within the max share of that queue and of each of its ancestors.
 *
 * @param queue queue at which the walk up the hierarchy starts
 * @param additionalResource resource that would be added to the usage
 * @return true if the queue and all its ancestors can accommodate the
 *     additional resource, false as soon as one of them cannot
 */
static boolean fitsInMaxShare(FSQueue queue, Resource
    additionalResource) {
  FSQueue current = queue;
  do {
    Resource projectedUsage =
        Resources.add(current.getResourceUsage(), additionalResource);
    if (!Resources.fitsIn(projectedUsage, current.getMaxShare())) {
      return false;
    }
    // Move one level up; the walk ends at the queue without a parent.
    current = current.getParent();
  } while (current != null);
  return true;
}
/**
 * Looks up the scheduler-side application attempt for the given id by
 * delegating to the superclass's {@code getApplicationAttempt}.
 *
 * @param appAttemptId id of the application attempt to look up
 * @return whatever the superclass lookup returns for that id
 */
public FSAppAttempt getSchedulerApp(ApplicationAttemptId appAttemptId) {
  FSAppAttempt attempt = super.getApplicationAttempt(appAttemptId);
  return attempt;
}

View File

@ -3701,6 +3701,288 @@ public class TestFairScheduler extends FairSchedulerTestBase {
0, queue2.getAmResourceUsage().getMemory());
}
/**
 * Verifies the interaction between the queue maxAMShare limit and container
 * reservation. The test checks that:
 * a container gets reserved when the app is not over maxAMShare,
 * a reserved container gets unreserved when the app is over maxAMShare,
 * a container doesn't get reserved when the app is over maxAMShare,
 * a reserved container is turned into an allocation, and
 * a superfluously reserved container gets unreserved.
 *
 * Scenario:
 * 1. Create three nodes: Node1 is 10G, Node2 is 10G and Node3 is 5G.
 * 2. APP1 is allocated 1G on Node1 and APP2 is allocated 1G on Node2.
 * 3. APP3 reserves 10G on Node1 and Node2.
 * 4. APP4 is allocated 5G on Node3, which makes APP3 over maxAMShare.
 * 5. Remove APP1 so that Node1 has 10G of available resource.
 * 6. APP3 unreserves its container on Node1 because it is over maxAMShare.
 * 7. APP5 is allocated 1G on Node1 after APP3 unreserved its container.
 * 8. Remove APP3.
 * 9. APP6 fails to reserve a 10G container on Node1 due to the AMShare limit.
 * 10. APP7 is allocated 1G on Node1.
 * 11. Remove APP4 and APP5.
 * 12. APP6 reserves 10G on Node1 and Node2.
 * 13. APP8 fails to allocate a 1G container on Node1 and Node2 because
 * APP6 reserved Node1 and Node2.
 * 14. Remove APP2.
 * 15. APP6 turns the 10G reservation into an allocation on Node2.
 * 16. APP6 unreserves its container on Node1, APP8 is allocated 1G on Node1.
 */
@Test
public void testQueueMaxAMShareWithContainerReservation() throws Exception {
conf.set(FairSchedulerConfiguration.ALLOCATION_FILE, ALLOC_FILE);
// Write an allocation file that sets maxAMShare to 0.5 for queue1.
PrintWriter out = new PrintWriter(new FileWriter(ALLOC_FILE));
out.println("<?xml version=\"1.0\"?>");
out.println("<allocations>");
out.println("<queue name=\"queue1\">");
out.println("<maxAMShare>0.5</maxAMShare>");
out.println("</queue>");
out.println("</allocations>");
out.close();
scheduler.init(conf);
scheduler.start();
scheduler.reinitialize(conf, resourceManager.getRMContext());
// Step 1: two 10G nodes and one 5G node.
RMNode node1 =
MockNodes.newNodeInfo(1, Resources.createResource(10240, 10),
1, "127.0.0.1");
RMNode node2 =
MockNodes.newNodeInfo(1, Resources.createResource(10240, 10),
2, "127.0.0.2");
RMNode node3 =
MockNodes.newNodeInfo(1, Resources.createResource(5120, 5),
3, "127.0.0.3");
// One "added" event and one reusable "update" (heartbeat) event per node.
NodeAddedSchedulerEvent nodeE1 = new NodeAddedSchedulerEvent(node1);
NodeUpdateSchedulerEvent updateE1 = new NodeUpdateSchedulerEvent(node1);
NodeAddedSchedulerEvent nodeE2 = new NodeAddedSchedulerEvent(node2);
NodeUpdateSchedulerEvent updateE2 = new NodeUpdateSchedulerEvent(node2);
NodeAddedSchedulerEvent nodeE3 = new NodeAddedSchedulerEvent(node3);
NodeUpdateSchedulerEvent updateE3 = new NodeUpdateSchedulerEvent(node3);
scheduler.handle(nodeE1);
scheduler.handle(nodeE2);
scheduler.handle(nodeE3);
scheduler.update();
FSLeafQueue queue1 = scheduler.getQueueManager().getLeafQueue("queue1",
true);
// AM container sizes for the eight applications used below.
Resource amResource1 = Resource.newInstance(1024, 1);
Resource amResource2 = Resource.newInstance(1024, 1);
Resource amResource3 = Resource.newInstance(10240, 1);
Resource amResource4 = Resource.newInstance(5120, 1);
Resource amResource5 = Resource.newInstance(1024, 1);
Resource amResource6 = Resource.newInstance(10240, 1);
Resource amResource7 = Resource.newInstance(1024, 1);
Resource amResource8 = Resource.newInstance(1024, 1);
int amPriority = RMAppAttemptImpl.AM_CONTAINER_PRIORITY.getPriority();
// Step 2: app1 and app2 each get a 1G AM container.
ApplicationAttemptId attId1 = createAppAttemptId(1, 1);
createApplicationWithAMResource(attId1, "queue1", "user1", amResource1);
createSchedulingRequestExistingApplication(1024, 1, amPriority, attId1);
FSAppAttempt app1 = scheduler.getSchedulerApp(attId1);
scheduler.update();
// Allocate app1's AM container on node1.
scheduler.handle(updateE1);
assertEquals("Application1's AM requests 1024 MB memory",
1024, app1.getAMResource().getMemory());
assertEquals("Application1's AM should be running",
1, app1.getLiveContainers().size());
assertEquals("Queue1's AM resource usage should be 1024 MB memory",
1024, queue1.getAmResourceUsage().getMemory());
ApplicationAttemptId attId2 = createAppAttemptId(2, 1);
createApplicationWithAMResource(attId2, "queue1", "user1", amResource2);
createSchedulingRequestExistingApplication(1024, 1, amPriority, attId2);
FSAppAttempt app2 = scheduler.getSchedulerApp(attId2);
scheduler.update();
// Allocate app2's AM container on node2.
scheduler.handle(updateE2);
assertEquals("Application2's AM requests 1024 MB memory",
1024, app2.getAMResource().getMemory());
assertEquals("Application2's AM should be running",
1, app2.getLiveContainers().size());
assertEquals("Queue1's AM resource usage should be 2048 MB memory",
2048, queue1.getAmResourceUsage().getMemory());
// Step 3: app3 asks for a 10G AM container and can only reserve.
ApplicationAttemptId attId3 = createAppAttemptId(3, 1);
createApplicationWithAMResource(attId3, "queue1", "user1", amResource3);
createSchedulingRequestExistingApplication(10240, 1, amPriority, attId3);
FSAppAttempt app3 = scheduler.getSchedulerApp(attId3);
scheduler.update();
// app3 reserves a container on node1 because node1's available resource
// is less than app3's AM container resource.
scheduler.handle(updateE1);
// Similarly app3 reserves a container on node2.
scheduler.handle(updateE2);
assertEquals("Application3's AM resource shouldn't be updated",
0, app3.getAMResource().getMemory());
assertEquals("Application3's AM should not be running",
0, app3.getLiveContainers().size());
assertEquals("Queue1's AM resource usage should be 2048 MB memory",
2048, queue1.getAmResourceUsage().getMemory());
// Step 4: app4 asks for a 5G AM container.
ApplicationAttemptId attId4 = createAppAttemptId(4, 1);
createApplicationWithAMResource(attId4, "queue1", "user1", amResource4);
createSchedulingRequestExistingApplication(5120, 1, amPriority, attId4);
FSAppAttempt app4 = scheduler.getSchedulerApp(attId4);
scheduler.update();
// app4 can't allocate its AM container on node1 because
// app3 already reserved its container on node1.
scheduler.handle(updateE1);
assertEquals("Application4's AM resource shouldn't be updated",
0, app4.getAMResource().getMemory());
assertEquals("Application4's AM should not be running",
0, app4.getLiveContainers().size());
assertEquals("Queue1's AM resource usage should be 2048 MB memory",
2048, queue1.getAmResourceUsage().getMemory());
scheduler.update();
// Allocate app4's AM container on node3.
scheduler.handle(updateE3);
assertEquals("Application4's AM requests 5120 MB memory",
5120, app4.getAMResource().getMemory());
assertEquals("Application4's AM should be running",
1, app4.getLiveContainers().size());
assertEquals("Queue1's AM resource usage should be 7168 MB memory",
7168, queue1.getAmResourceUsage().getMemory());
// Step 5: remove app1 to free up node1.
AppAttemptRemovedSchedulerEvent appRemovedEvent1 =
new AppAttemptRemovedSchedulerEvent(attId1,
RMAppAttemptState.FINISHED, false);
// Release app1's AM container on node1.
scheduler.handle(appRemovedEvent1);
assertEquals("Queue1's AM resource usage should be 6144 MB memory",
6144, queue1.getAmResourceUsage().getMemory());
// Steps 6-7: app3's reservation on node1 is dropped, so app5 can run there.
ApplicationAttemptId attId5 = createAppAttemptId(5, 1);
createApplicationWithAMResource(attId5, "queue1", "user1", amResource5);
createSchedulingRequestExistingApplication(1024, 1, amPriority, attId5);
FSAppAttempt app5 = scheduler.getSchedulerApp(attId5);
scheduler.update();
// app5 can allocate its AM container on node1 after
// app3 unreserves its container on node1 due to
// exceeding the queue maxAMShare limit.
scheduler.handle(updateE1);
assertEquals("Application5's AM requests 1024 MB memory",
1024, app5.getAMResource().getMemory());
assertEquals("Application5's AM should be running",
1, app5.getLiveContainers().size());
assertEquals("Queue1's AM resource usage should be 7168 MB memory",
7168, queue1.getAmResourceUsage().getMemory());
// Step 8: remove app3; it never ran an AM, so AM usage is unchanged.
AppAttemptRemovedSchedulerEvent appRemovedEvent3 =
new AppAttemptRemovedSchedulerEvent(attId3,
RMAppAttemptState.FINISHED, false);
// Remove app3.
scheduler.handle(appRemovedEvent3);
assertEquals("Queue1's AM resource usage should be 7168 MB memory",
7168, queue1.getAmResourceUsage().getMemory());
// Step 9: app6 asks for a 10G AM container while over the AM share limit.
ApplicationAttemptId attId6 = createAppAttemptId(6, 1);
createApplicationWithAMResource(attId6, "queue1", "user1", amResource6);
createSchedulingRequestExistingApplication(10240, 1, amPriority, attId6);
FSAppAttempt app6 = scheduler.getSchedulerApp(attId6);
scheduler.update();
// app6 can't reserve a container on node1 because
// it exceeds the queue maxAMShare limit.
scheduler.handle(updateE1);
assertEquals("Application6's AM resource shouldn't be updated",
0, app6.getAMResource().getMemory());
assertEquals("Application6's AM should not be running",
0, app6.getLiveContainers().size());
assertEquals("Queue1's AM resource usage should be 7168 MB memory",
7168, queue1.getAmResourceUsage().getMemory());
// Step 10: app7 gets a 1G AM container on node1.
ApplicationAttemptId attId7 = createAppAttemptId(7, 1);
createApplicationWithAMResource(attId7, "queue1", "user1", amResource7);
createSchedulingRequestExistingApplication(1024, 1, amPriority, attId7);
FSAppAttempt app7 = scheduler.getSchedulerApp(attId7);
scheduler.update();
// Allocate app7's AM container on node1 to prove
// app6 didn't reserve a container on node1.
scheduler.handle(updateE1);
assertEquals("Application7's AM requests 1024 MB memory",
1024, app7.getAMResource().getMemory());
assertEquals("Application7's AM should be running",
1, app7.getLiveContainers().size());
assertEquals("Queue1's AM resource usage should be 8192 MB memory",
8192, queue1.getAmResourceUsage().getMemory());
// Step 11: remove app4 and app5 to bring AM usage back down.
AppAttemptRemovedSchedulerEvent appRemovedEvent4 =
new AppAttemptRemovedSchedulerEvent(attId4,
RMAppAttemptState.FINISHED, false);
// Release app4's AM container on node3.
scheduler.handle(appRemovedEvent4);
assertEquals("Queue1's AM resource usage should be 3072 MB memory",
3072, queue1.getAmResourceUsage().getMemory());
AppAttemptRemovedSchedulerEvent appRemovedEvent5 =
new AppAttemptRemovedSchedulerEvent(attId5,
RMAppAttemptState.FINISHED, false);
// Release app5's AM container on node1.
scheduler.handle(appRemovedEvent5);
assertEquals("Queue1's AM resource usage should be 2048 MB memory",
2048, queue1.getAmResourceUsage().getMemory());
scheduler.update();
// Step 12:
// app6 reserves a container on node1 because node1's available resource
// is less than app6's AM container resource and
// app6 is not over the AMShare limit.
scheduler.handle(updateE1);
// Similarly app6 reserves a container on node2.
scheduler.handle(updateE2);
// Step 13: app8 asks for a 1G AM container but both nodes are reserved.
ApplicationAttemptId attId8 = createAppAttemptId(8, 1);
createApplicationWithAMResource(attId8, "queue1", "user1", amResource8);
createSchedulingRequestExistingApplication(1024, 1, amPriority, attId8);
FSAppAttempt app8 = scheduler.getSchedulerApp(attId8);
scheduler.update();
// app8 can't allocate a container on node1 because
// app6 already reserved a container on node1.
scheduler.handle(updateE1);
assertEquals("Application8's AM resource shouldn't be updated",
0, app8.getAMResource().getMemory());
assertEquals("Application8's AM should not be running",
0, app8.getLiveContainers().size());
assertEquals("Queue1's AM resource usage should be 2048 MB memory",
2048, queue1.getAmResourceUsage().getMemory());
scheduler.update();
// app8 can't allocate a container on node2 because
// app6 already reserved a container on node2.
scheduler.handle(updateE2);
assertEquals("Application8's AM resource shouldn't be updated",
0, app8.getAMResource().getMemory());
assertEquals("Application8's AM should not be running",
0, app8.getLiveContainers().size());
assertEquals("Queue1's AM resource usage should be 2048 MB memory",
2048, queue1.getAmResourceUsage().getMemory());
// Step 14: remove app2 to free up node2.
AppAttemptRemovedSchedulerEvent appRemovedEvent2 =
new AppAttemptRemovedSchedulerEvent(attId2,
RMAppAttemptState.FINISHED, false);
// Release app2's AM container on node2.
scheduler.handle(appRemovedEvent2);
assertEquals("Queue1's AM resource usage should be 1024 MB memory",
1024, queue1.getAmResourceUsage().getMemory());
scheduler.update();
// Step 15:
// app6 turns the reservation into an allocation on node2.
scheduler.handle(updateE2);
assertEquals("Application6's AM requests 10240 MB memory",
10240, app6.getAMResource().getMemory());
assertEquals("Application6's AM should be running",
1, app6.getLiveContainers().size());
assertEquals("Queue1's AM resource usage should be 11264 MB memory",
11264, queue1.getAmResourceUsage().getMemory());
scheduler.update();
// Step 16:
// app6 unreserves its container on node1 because
// it already got a container on node2.
// Now app8 can allocate its AM container on node1.
scheduler.handle(updateE1);
assertEquals("Application8's AM requests 1024 MB memory",
1024, app8.getAMResource().getMemory());
assertEquals("Application8's AM should be running",
1, app8.getLiveContainers().size());
assertEquals("Queue1's AM resource usage should be 12288 MB memory",
12288, queue1.getAmResourceUsage().getMemory());
}
@Test
public void testMaxRunningAppsHierarchicalQueues() throws Exception {
conf.set(FairSchedulerConfiguration.ALLOCATION_FILE, ALLOC_FILE);