From c60054743f2ffcb2eb9d2c4a3aca8c837ee7339b Mon Sep 17 00:00:00 2001 From: Karthik Kambatla Date: Thu, 21 May 2015 13:38:30 -0700 Subject: [PATCH] YARN-3675. FairScheduler: RM quits when node removal races with continuous-scheduling on the same node. (Anubhav Dhoot via kasha) (cherry picked from commit a8b50e46737c11936ba72c427da69b2365a07aac) (cherry picked from commit e8ac88d4fe8aaef9d2e5fb76e6bc50223ff0e495) --- hadoop-yarn-project/CHANGES.txt | 4 ++ .../scheduler/fair/FairScheduler.java | 14 ++++- .../scheduler/fair/TestFairScheduler.java | 54 +++++++++++++++++-- 3 files changed, 65 insertions(+), 7 deletions(-) diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index 6b95c410ad6..92f8c3fa8ec 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -83,6 +83,7 @@ Release 2.7.1 - UNRELEASED YARN-3493. RM fails to come up with error "Failed to load/recover state" when mem settings are changed. (Jian He via wangda) +<<<<<<< HEAD YARN-3626. On Windows localized resources are not moved to the front of the classpath when they should be. (Craig Welch via xgong) @@ -118,6 +119,9 @@ Release 2.7.1 - UNRELEASED YARN-3646. Applications are getting stuck some times in case of retry policy forever. (Raju Bairishetti via devaraj) + YARN-3675. FairScheduler: RM quits when node removal races with + continuous-scheduling on the same node. 
(Anubhav Dhoot via kasha) + Release 2.7.0 - 2015-04-20 INCOMPATIBLE CHANGES diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java index 1d97983778d..5fef8173e0f 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java @@ -1033,13 +1033,23 @@ public class FairScheduler extends nodes.get(n1).getAvailableResource()); } } - - private synchronized void attemptScheduling(FSSchedulerNode node) { + + @VisibleForTesting + synchronized void attemptScheduling(FSSchedulerNode node) { if (rmContext.isWorkPreservingRecoveryEnabled() && !rmContext.isSchedulerReadyForAllocatingContainers()) { return; } + final NodeId nodeID = node.getNodeID(); + if (!nodes.containsKey(nodeID)) { + // The node might have just been removed while this thread was waiting + // on the synchronized lock before it entered this synchronized method + LOG.info("Skipping scheduling as the node " + nodeID + + " has been removed"); + return; + } + // Assign new containers... // 1. Check for reserved applications // 2. 
Schedule if there are no reservations diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java index 6bc5379fd86..0e114e1e520 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java @@ -3949,7 +3949,7 @@ public class TestFairScheduler extends FairSchedulerTestBase { } @Test - public void testDontAllowUndeclaredPools() throws Exception{ + public void testDontAllowUndeclaredPools() throws Exception { conf.setBoolean(FairSchedulerConfiguration.ALLOW_UNDECLARED_POOLS, false); conf.set(FairSchedulerConfiguration.ALLOCATION_FILE, ALLOC_FILE); @@ -3965,10 +3965,10 @@ public class TestFairScheduler extends FairSchedulerTestBase { scheduler.start(); scheduler.reinitialize(conf, resourceManager.getRMContext()); QueueManager queueManager = scheduler.getQueueManager(); - + FSLeafQueue jerryQueue = queueManager.getLeafQueue("jerry", false); FSLeafQueue defaultQueue = queueManager.getLeafQueue("default", false); - + // Should get put into jerry createSchedulingRequest(1024, "jerry", "someuser"); assertEquals(1, jerryQueue.getNumRunnableApps()); @@ -3977,19 +3977,63 @@ public class TestFairScheduler extends FairSchedulerTestBase { createSchedulingRequest(1024, "newqueue", "someuser"); assertEquals(1, jerryQueue.getNumRunnableApps()); assertEquals(1, defaultQueue.getNumRunnableApps()); - + // Would get put into someuser because of 
user-as-default-queue, but should // be forced into default createSchedulingRequest(1024, "default", "someuser"); assertEquals(1, jerryQueue.getNumRunnableApps()); assertEquals(2, defaultQueue.getNumRunnableApps()); - + // Should get put into jerry because of user-as-default-queue createSchedulingRequest(1024, "default", "jerry"); assertEquals(2, jerryQueue.getNumRunnableApps()); assertEquals(2, defaultQueue.getNumRunnableApps()); } + @Test + public void testSchedulingOnRemovedNode() throws Exception { + // Disable continuous scheduling, will invoke continuous scheduling manually + scheduler.init(conf); + scheduler.start(); + Assert.assertTrue("Continuous scheduling should be disabled.", + !scheduler.isContinuousSchedulingEnabled()); + + ApplicationAttemptId id11 = createAppAttemptId(1, 1); + createMockRMApp(id11); + + scheduler.addApplication(id11.getApplicationId(), "root.queue1", "user1", + false); + scheduler.addApplicationAttempt(id11, false, false); + + List<ResourceRequest> ask1 = new ArrayList<>(); + ResourceRequest request1 = + createResourceRequest(1024, 8, ResourceRequest.ANY, 1, 1, true); + + ask1.add(request1); + scheduler.allocate(id11, ask1, new ArrayList<ContainerId>(), null, + null); + + String hostName = "127.0.0.1"; + RMNode node1 = MockNodes.newNodeInfo(1, + Resources.createResource(8 * 1024, 8), 1, hostName); + NodeAddedSchedulerEvent nodeEvent1 = new NodeAddedSchedulerEvent(node1); + scheduler.handle(nodeEvent1); + + FSSchedulerNode node = (FSSchedulerNode)scheduler.getSchedulerNode( + node1.getNodeID()); + + NodeRemovedSchedulerEvent removeNode1 = + new NodeRemovedSchedulerEvent(node1); + scheduler.handle(removeNode1); + + scheduler.attemptScheduling(node); + + AppAttemptRemovedSchedulerEvent appRemovedEvent1 = + new AppAttemptRemovedSchedulerEvent(id11, + RMAppAttemptState.FINISHED, false); + scheduler.handle(appRemovedEvent1); + } + @Test public void testDefaultRuleInitializesProperlyWhenPolicyNotConfigured() throws IOException {