From e8ac88d4fe8aaef9d2e5fb76e6bc50223ff0e495 Mon Sep 17 00:00:00 2001 From: Karthik Kambatla Date: Thu, 21 May 2015 13:38:30 -0700 Subject: [PATCH] YARN-3675. FairScheduler: RM quits when node removal races with continuous-scheduling on the same node. (Anubhav Dhoot via kasha) (cherry picked from commit a8b50e46737c11936ba72c427da69b2365a07aac) --- hadoop-yarn-project/CHANGES.txt | 4 ++ .../scheduler/fair/FairScheduler.java | 14 +++++- .../scheduler/fair/TestFairScheduler.java | 44 +++++++++++++++++++ 3 files changed, 60 insertions(+), 2 deletions(-) diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index 12cf92519e6..b7e66d547fe 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -464,6 +464,7 @@ Release 2.7.1 - UNRELEASED YARN-3493. RM fails to come up with error "Failed to load/recover state" when mem settings are changed. (Jian He via wangda) +<<<<<<< HEAD YARN-3626. On Windows localized resources are not moved to the front of the classpath when they should be. (Craig Welch via xgong) @@ -499,6 +500,9 @@ Release 2.7.1 - UNRELEASED YARN-3646. Applications are getting stuck some times in case of retry policy forever. (Raju Bairishetti via devaraj) + YARN-3675. FairScheduler: RM quits when node removal races with + continuous-scheduling on the same node. 
(Anubhav Dhoot via kasha) + Release 2.7.0 - 2015-04-20 INCOMPATIBLE CHANGES diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java index f481de56c02..07b32714a2a 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java @@ -1039,13 +1039,23 @@ public class FairScheduler extends nodes.get(n1).getAvailableResource()); } } - - private synchronized void attemptScheduling(FSSchedulerNode node) { + + @VisibleForTesting + synchronized void attemptScheduling(FSSchedulerNode node) { if (rmContext.isWorkPreservingRecoveryEnabled() && !rmContext.isSchedulerReadyForAllocatingContainers()) { return; } + final NodeId nodeID = node.getNodeID(); + if (!nodes.containsKey(nodeID)) { + // The node might have just been removed while this thread was waiting + // on the synchronized lock before it entered this synchronized method + LOG.info("Skipping scheduling as the node " + nodeID + + " has been removed"); + return; + } + // Assign new containers... // 1. Check for reserved applications // 2. 
Schedule if there are no reservations diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java index ee20863570e..0e6367d1ec3 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java @@ -3889,6 +3889,50 @@ public class TestFairScheduler extends FairSchedulerTestBase { } } + @Test + public void testSchedulingOnRemovedNode() throws Exception { + // Disable continuous scheduling, will invoke continuous scheduling manually + scheduler.init(conf); + scheduler.start(); + Assert.assertTrue("Continuous scheduling should be disabled.", + !scheduler.isContinuousSchedulingEnabled()); + + ApplicationAttemptId id11 = createAppAttemptId(1, 1); + createMockRMApp(id11); + + scheduler.addApplication(id11.getApplicationId(), "root.queue1", "user1", + false); + scheduler.addApplicationAttempt(id11, false, false); + + List<ResourceRequest> ask1 = new ArrayList<>(); + ResourceRequest request1 = + createResourceRequest(1024, 8, ResourceRequest.ANY, 1, 1, true); + + ask1.add(request1); + scheduler.allocate(id11, ask1, new ArrayList<ContainerId>(), null, + null); + + String hostName = "127.0.0.1"; + RMNode node1 = MockNodes.newNodeInfo(1, + Resources.createResource(8 * 1024, 8), 1, hostName); + NodeAddedSchedulerEvent nodeEvent1 = new NodeAddedSchedulerEvent(node1); + scheduler.handle(nodeEvent1); + + FSSchedulerNode node = (FSSchedulerNode)scheduler.getSchedulerNode( 
node1.getNodeID()); + + NodeRemovedSchedulerEvent removeNode1 = + new NodeRemovedSchedulerEvent(node1); + scheduler.handle(removeNode1); + + scheduler.attemptScheduling(node); + + AppAttemptRemovedSchedulerEvent appRemovedEvent1 = + new AppAttemptRemovedSchedulerEvent(id11, + RMAppAttemptState.FINISHED, false); + scheduler.handle(appRemovedEvent1); + } + @Test public void testDefaultRuleInitializesProperlyWhenPolicyNotConfigured() throws IOException {