YARN-3675. FairScheduler: RM quits when node removal races with continuous-scheduling on the same node. (Anubhav Dhoot via kasha)
(cherry picked from commit a8b50e46737c11936ba72c427da69b2365a07aac)
(cherry picked from commit e8ac88d4fe)
This commit is contained in:
parent
c550825673
commit
c60054743f
|
@@ -83,6 +83,7 @@ Release 2.7.1 - UNRELEASED
|
|||
YARN-3493. RM fails to come up with error "Failed to load/recover state"
|
||||
when mem settings are changed. (Jian He via wangda)
|
||||
|
||||
<<<<<<< HEAD
|
||||
YARN-3626. On Windows localized resources are not moved to the front
|
||||
of the classpath when they should be. (Craig Welch via xgong)
|
||||
|
||||
|
@@ -118,6 +119,9 @@ Release 2.7.1 - UNRELEASED
|
|||
YARN-3646. Applications are getting stuck some times in case of retry
|
||||
policy forever. (Raju Bairishetti via devaraj)
|
||||
|
||||
YARN-3675. FairScheduler: RM quits when node removal races with
|
||||
continuous-scheduling on the same node. (Anubhav Dhoot via kasha)
|
||||
|
||||
Release 2.7.0 - 2015-04-20
|
||||
|
||||
INCOMPATIBLE CHANGES
|
||||
|
|
|
@@ -1033,13 +1033,23 @@ public class FairScheduler extends
|
|||
nodes.get(n1).getAvailableResource());
|
||||
}
|
||||
}
|
||||
|
||||
private synchronized void attemptScheduling(FSSchedulerNode node) {
|
||||
|
||||
@VisibleForTesting
|
||||
synchronized void attemptScheduling(FSSchedulerNode node) {
|
||||
if (rmContext.isWorkPreservingRecoveryEnabled()
|
||||
&& !rmContext.isSchedulerReadyForAllocatingContainers()) {
|
||||
return;
|
||||
}
|
||||
|
||||
final NodeId nodeID = node.getNodeID();
|
||||
if (!nodes.containsKey(nodeID)) {
|
||||
// The node might have just been removed while this thread was waiting
|
||||
// on the synchronized lock before it entered this synchronized method
|
||||
LOG.info("Skipping scheduling as the node " + nodeID +
|
||||
" has been removed");
|
||||
return;
|
||||
}
|
||||
|
||||
// Assign new containers...
|
||||
// 1. Check for reserved applications
|
||||
// 2. Schedule if there are no reservations
|
||||
|
|
|
@@ -3949,7 +3949,7 @@ public class TestFairScheduler extends FairSchedulerTestBase {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void testDontAllowUndeclaredPools() throws Exception{
|
||||
public void testDontAllowUndeclaredPools() throws Exception {
|
||||
conf.setBoolean(FairSchedulerConfiguration.ALLOW_UNDECLARED_POOLS, false);
|
||||
conf.set(FairSchedulerConfiguration.ALLOCATION_FILE, ALLOC_FILE);
|
||||
|
||||
|
@@ -3965,10 +3965,10 @@ public class TestFairScheduler extends FairSchedulerTestBase {
|
|||
scheduler.start();
|
||||
scheduler.reinitialize(conf, resourceManager.getRMContext());
|
||||
QueueManager queueManager = scheduler.getQueueManager();
|
||||
|
||||
|
||||
FSLeafQueue jerryQueue = queueManager.getLeafQueue("jerry", false);
|
||||
FSLeafQueue defaultQueue = queueManager.getLeafQueue("default", false);
|
||||
|
||||
|
||||
// Should get put into jerry
|
||||
createSchedulingRequest(1024, "jerry", "someuser");
|
||||
assertEquals(1, jerryQueue.getNumRunnableApps());
|
||||
|
@@ -3977,19 +3977,63 @@ public class TestFairScheduler extends FairSchedulerTestBase {
|
|||
createSchedulingRequest(1024, "newqueue", "someuser");
|
||||
assertEquals(1, jerryQueue.getNumRunnableApps());
|
||||
assertEquals(1, defaultQueue.getNumRunnableApps());
|
||||
|
||||
|
||||
// Would get put into someuser because of user-as-default-queue, but should
|
||||
// be forced into default
|
||||
createSchedulingRequest(1024, "default", "someuser");
|
||||
assertEquals(1, jerryQueue.getNumRunnableApps());
|
||||
assertEquals(2, defaultQueue.getNumRunnableApps());
|
||||
|
||||
|
||||
// Should get put into jerry because of user-as-default-queue
|
||||
createSchedulingRequest(1024, "default", "jerry");
|
||||
assertEquals(2, jerryQueue.getNumRunnableApps());
|
||||
assertEquals(2, defaultQueue.getNumRunnableApps());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSchedulingOnRemovedNode() throws Exception {
|
||||
// Disable continuous scheduling, will invoke continuous scheduling manually
|
||||
scheduler.init(conf);
|
||||
scheduler.start();
|
||||
Assert.assertTrue("Continuous scheduling should be disabled.",
|
||||
!scheduler.isContinuousSchedulingEnabled());
|
||||
|
||||
ApplicationAttemptId id11 = createAppAttemptId(1, 1);
|
||||
createMockRMApp(id11);
|
||||
|
||||
scheduler.addApplication(id11.getApplicationId(), "root.queue1", "user1",
|
||||
false);
|
||||
scheduler.addApplicationAttempt(id11, false, false);
|
||||
|
||||
List<ResourceRequest> ask1 = new ArrayList<>();
|
||||
ResourceRequest request1 =
|
||||
createResourceRequest(1024, 8, ResourceRequest.ANY, 1, 1, true);
|
||||
|
||||
ask1.add(request1);
|
||||
scheduler.allocate(id11, ask1, new ArrayList<ContainerId>(), null,
|
||||
null);
|
||||
|
||||
String hostName = "127.0.0.1";
|
||||
RMNode node1 = MockNodes.newNodeInfo(1,
|
||||
Resources.createResource(8 * 1024, 8), 1, hostName);
|
||||
NodeAddedSchedulerEvent nodeEvent1 = new NodeAddedSchedulerEvent(node1);
|
||||
scheduler.handle(nodeEvent1);
|
||||
|
||||
FSSchedulerNode node = (FSSchedulerNode)scheduler.getSchedulerNode(
|
||||
node1.getNodeID());
|
||||
|
||||
NodeRemovedSchedulerEvent removeNode1 =
|
||||
new NodeRemovedSchedulerEvent(node1);
|
||||
scheduler.handle(removeNode1);
|
||||
|
||||
scheduler.attemptScheduling(node);
|
||||
|
||||
AppAttemptRemovedSchedulerEvent appRemovedEvent1 =
|
||||
new AppAttemptRemovedSchedulerEvent(id11,
|
||||
RMAppAttemptState.FINISHED, false);
|
||||
scheduler.handle(appRemovedEvent1);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDefaultRuleInitializesProperlyWhenPolicyNotConfigured()
|
||||
throws IOException {
|
||||
|
|
Loading…
Reference in New Issue