YARN-3675. FairScheduler: RM quits when node removal races with continuous-scheduling on the same node. (Anubhav Dhoot via kasha)

(cherry picked from commit a8b50e46737c11936ba72c427da69b2365a07aac)
(cherry picked from commit e8ac88d4fe)
This commit is contained in:
Karthik Kambatla 2015-05-21 13:38:30 -07:00
parent c550825673
commit c60054743f
3 changed files with 65 additions and 7 deletions

View File

@ -83,6 +83,7 @@ Release 2.7.1 - UNRELEASED
YARN-3493. RM fails to come up with error "Failed to load/recover state" YARN-3493. RM fails to come up with error "Failed to load/recover state"
when mem settings are changed. (Jian He via wangda) when mem settings are changed. (Jian He via wangda)
<<<<<<< HEAD
YARN-3626. On Windows localized resources are not moved to the front YARN-3626. On Windows localized resources are not moved to the front
of the classpath when they should be. (Craig Welch via xgong) of the classpath when they should be. (Craig Welch via xgong)
@ -118,6 +119,9 @@ Release 2.7.1 - UNRELEASED
YARN-3646. Applications are getting stuck some times in case of retry YARN-3646. Applications are getting stuck some times in case of retry
policy forever. (Raju Bairishetti via devaraj) policy forever. (Raju Bairishetti via devaraj)
YARN-3675. FairScheduler: RM quits when node removal races with
continuous-scheduling on the same node. (Anubhav Dhoot via kasha)
Release 2.7.0 - 2015-04-20 Release 2.7.0 - 2015-04-20
INCOMPATIBLE CHANGES INCOMPATIBLE CHANGES

View File

@ -1034,12 +1034,22 @@ public class FairScheduler extends
} }
} }
private synchronized void attemptScheduling(FSSchedulerNode node) { @VisibleForTesting
synchronized void attemptScheduling(FSSchedulerNode node) {
if (rmContext.isWorkPreservingRecoveryEnabled() if (rmContext.isWorkPreservingRecoveryEnabled()
&& !rmContext.isSchedulerReadyForAllocatingContainers()) { && !rmContext.isSchedulerReadyForAllocatingContainers()) {
return; return;
} }
final NodeId nodeID = node.getNodeID();
if (!nodes.containsKey(nodeID)) {
// The node might have just been removed while this thread was waiting
// on the synchronized lock before it entered this synchronized method
LOG.info("Skipping scheduling as the node " + nodeID +
" has been removed");
return;
}
// Assign new containers... // Assign new containers...
// 1. Check for reserved applications // 1. Check for reserved applications
// 2. Schedule if there are no reservations // 2. Schedule if there are no reservations

View File

@ -3990,6 +3990,50 @@ public class TestFairScheduler extends FairSchedulerTestBase {
assertEquals(2, defaultQueue.getNumRunnableApps()); assertEquals(2, defaultQueue.getNumRunnableApps());
} }
/**
 * Regression test for YARN-3675: a scheduling attempt on a node that has
 * already been removed from the scheduler must be skipped rather than fail.
 *
 * The test registers an application attempt with an outstanding resource
 * request, adds a node, keeps a reference to its {@link FSSchedulerNode},
 * removes the node, and then calls {@code attemptScheduling} with the stale
 * reference. There is no explicit assertion on the outcome; the test passes
 * when neither the scheduling attempt nor the subsequent attempt removal
 * throws.
 */
@Test
public void testSchedulingOnRemovedNode() throws Exception {
  // Continuous scheduling is expected to be off (verified below); the
  // scheduling attempt is invoked manually further down instead.
  scheduler.init(conf);
  scheduler.start();
  Assert.assertFalse("Continuous scheduling should be disabled.",
      scheduler.isContinuousSchedulingEnabled());

  // Register an application attempt so there is something to schedule.
  ApplicationAttemptId id11 = createAppAttemptId(1, 1);
  createMockRMApp(id11);
  scheduler.addApplication(id11.getApplicationId(), "root.queue1", "user1",
      false);
  scheduler.addApplicationAttempt(id11, false, false);

  // Submit a single resource request on behalf of the attempt.
  List<ResourceRequest> ask1 = new ArrayList<>();
  ResourceRequest request1 =
      createResourceRequest(1024, 8, ResourceRequest.ANY, 1, 1, true);
  ask1.add(request1);
  scheduler.allocate(id11, ask1, new ArrayList<ContainerId>(), null,
      null);

  // Add a node and capture its scheduler-side handle before removing it.
  String hostName = "127.0.0.1";
  RMNode node1 = MockNodes.newNodeInfo(1,
      Resources.createResource(8 * 1024, 8), 1, hostName);
  NodeAddedSchedulerEvent nodeEvent1 = new NodeAddedSchedulerEvent(node1);
  scheduler.handle(nodeEvent1);

  FSSchedulerNode node = (FSSchedulerNode) scheduler.getSchedulerNode(
      node1.getNodeID());

  // Remove the node; 'node' is now a stale reference.
  NodeRemovedSchedulerEvent removeNode1 =
      new NodeRemovedSchedulerEvent(node1);
  scheduler.handle(removeNode1);

  // Scheduling on the removed node is the scenario under test.
  scheduler.attemptScheduling(node);

  // Removing the attempt afterwards must also complete without throwing.
  AppAttemptRemovedSchedulerEvent appRemovedEvent1 =
      new AppAttemptRemovedSchedulerEvent(id11,
          RMAppAttemptState.FINISHED, false);
  scheduler.handle(appRemovedEvent1);
}
@Test @Test
public void testDefaultRuleInitializesProperlyWhenPolicyNotConfigured() public void testDefaultRuleInitializesProperlyWhenPolicyNotConfigured()
throws IOException { throws IOException {