From 30ff5bff1aa88545ab5bf56958951ba9525fd8ab Mon Sep 17 00:00:00 2001 From: Karthik Kambatla Date: Mon, 13 Feb 2017 11:26:30 -0800 Subject: [PATCH] YARN-3933. FairScheduler: Multiple calls to completedContainer are not safe. (Shiwei Guo and Miklos Szegedi via kasha) (cherry picked from commit 646c6d6509f515b1373288869fb92807fa2ddc9b) --- .../scheduler/fair/FSAppAttempt.java | 9 ++- .../scheduler/fair/TestFairScheduler.java | 55 ++++++++++++++++++- 2 files changed, 61 insertions(+), 3 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSAppAttempt.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSAppAttempt.java index 9333caacfa9..f67aa3c005d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSAppAttempt.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSAppAttempt.java @@ -139,6 +139,13 @@ public class FSAppAttempt extends SchedulerApplicationAttempt Container container = rmContainer.getContainer(); ContainerId containerId = container.getId(); + // Remove from the list of containers + if (liveContainers.remove(containerId) == null) { + LOG.info("Additional complete request on completed container " + + rmContainer.getContainerId()); + return; + } + // Remove from the list of newly allocated containers if found newlyAllocatedContainers.remove(rmContainer); @@ -150,8 +157,6 @@ public class FSAppAttempt extends SchedulerApplicationAttempt + " in state: " + rmContainer.getState() + " event:" + event); } - // Remove from the list of containers - liveContainers.remove(rmContainer.getContainerId()); untrackContainerForPreemption(rmContainer); Resource containerResource = rmContainer.getContainer().getResource(); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java index cc68b36f291..60d27ab5a88 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java @@ -90,12 +90,14 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptImpl; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer; +import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEventType; import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode; import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeResourceUpdateEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnScheduler; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerUtils; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.TestSchedulerUtils; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAddedSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAttemptAddedSchedulerEvent; @@ -3543,7 +3545,58 @@ public class TestFairScheduler extends FairSchedulerTestBase { verifyAppRunnable(attId5, false); verifyQueueNumRunnable("queue1", 2, 1); } - + + @Test + public void testMultipleCompletedEvent() throws Exception { + // Set up a fair scheduler + conf.set(FairSchedulerConfiguration.ALLOCATION_FILE, ALLOC_FILE); + + PrintWriter out = new PrintWriter(new FileWriter(ALLOC_FILE)); + out.println(""); + out.println(""); + out.println(""); + out.println("0.2"); + out.println(""); + out.println(""); + out.close(); + + scheduler.init(conf); + scheduler.start(); + scheduler.reinitialize(conf, resourceManager.getRMContext()); + + // Create a node + RMNode node = + MockNodes.newNodeInfo(1, Resources.createResource(20480, 20), + 0, "127.0.0.1"); + NodeAddedSchedulerEvent nodeEvent = new NodeAddedSchedulerEvent(node); + NodeUpdateSchedulerEvent updateEvent = new NodeUpdateSchedulerEvent(node); + scheduler.handle(nodeEvent); + scheduler.update(); + + // Launch an app + ApplicationAttemptId attId1 = createAppAttemptId(1, 1); + createApplicationWithAMResource( + attId1, "queue1", "user1", + Resource.newInstance(1024, 1)); + createSchedulingRequestExistingApplication( + 1024, 1, + RMAppAttemptImpl.AM_CONTAINER_PRIORITY.getPriority(), attId1); + FSAppAttempt app1 = scheduler.getSchedulerApp(attId1); + scheduler.update(); + scheduler.handle(updateEvent); + + RMContainer container = app1.getLiveContainersMap(). + values().iterator().next(); + scheduler.completedContainer(container, SchedulerUtils + .createAbnormalContainerStatus(container.getContainerId(), + SchedulerUtils.LOST_CONTAINER), RMContainerEventType.KILL); + scheduler.completedContainer(container, SchedulerUtils + .createAbnormalContainerStatus(container.getContainerId(), + SchedulerUtils.COMPLETED_APPLICATION), + RMContainerEventType.FINISHED); + assertEquals(Resources.none(), app1.getResourceUsage()); + } + @Test public void testQueueMaxAMShare() throws Exception { conf.set(FairSchedulerConfiguration.ALLOCATION_FILE, ALLOC_FILE);