YARN-8994. Fix race condition between move app and queue cleanup in Fair Scheduler. (Contributed by Wilfred Spiegelenburg)

This commit is contained in:
Haibo Chen 2018-12-05 14:38:15 -08:00
parent 1b790f4dd1
commit 2bba329c14
1 changed files with 31 additions and 21 deletions

View File

@ -1735,13 +1735,15 @@ public class FairScheduler extends
String queueName) throws YarnException {
writeLock.lock();
try {
// app could have finished between pre check and now
SchedulerApplication<FSAppAttempt> app = applications.get(appId);
if (app == null) {
throw new YarnException("App to be moved " + appId + " not found.");
}
FSAppAttempt attempt = (FSAppAttempt) app.getCurrentAppAttempt();
// To serialize with FairScheduler#allocate, synchronize on app attempt
FSLeafQueue targetQueue = null;
// To serialize with FairScheduler#allocate, synchronize on app attempt
FSAppAttempt attempt = app.getCurrentAppAttempt();
attempt.getWriteLock().lock();
try {
FSLeafQueue oldQueue = (FSLeafQueue) app.getQueue();
@ -1753,7 +1755,9 @@ public class FairScheduler extends
+ " is stopped and can't be moved!");
}
String destQueueName = handleMoveToPlanQueue(queueName);
FSLeafQueue targetQueue = queueMgr.getLeafQueue(destQueueName, false);
// Prevent removal of the queue while the move is in progress by
// registering the app as submitted to the queue.
targetQueue = queueMgr.getLeafQueue(destQueueName, false, appId);
if (targetQueue == null) {
throw new YarnException("Target queue " + queueName
+ " not found or is not a leaf queue.");
@ -1766,9 +1770,14 @@ public class FairScheduler extends
verifyMoveDoesNotViolateConstraints(attempt, oldQueue, targetQueue);
}
// The move itself will clean up the app submit registration.
executeMove(app, attempt, oldQueue, targetQueue);
return targetQueue.getQueueName();
} finally {
// Cleanup the submit registration in case of move failure.
if (targetQueue != null) {
targetQueue.removeAssignedApp(appId);
}
attempt.getWriteLock().unlock();
}
} finally {
@ -1776,6 +1785,17 @@ public class FairScheduler extends
}
}
/**
* Perform pre-checks while moving the application. This should not check any
* application values that can change since the check is not part of an
* atomic action. During a move the scheduler can still assign containers and
* the app can still be updated until the real move is performed under
* proper locking.
*
* @param appId The ID of the app to be moved
* @param newQueue The name of the queue the app should move to
* @throws YarnException if the validate fails
*/
@Override
public void preValidateMoveApplication(ApplicationId appId, String newQueue)
throws YarnException {
@ -1786,24 +1806,14 @@ public class FairScheduler extends
throw new YarnException("App to be moved " + appId + " not found.");
}
FSAppAttempt attempt = app.getCurrentAppAttempt();
// To serialize with FairScheduler#allocate, synchronize on app attempt
attempt.getWriteLock().lock();
try {
FSLeafQueue oldQueue = (FSLeafQueue) app.getQueue();
String destQueueName = handleMoveToPlanQueue(newQueue);
FSLeafQueue targetQueue = queueMgr.getLeafQueue(destQueueName, false);
if (targetQueue == null) {
throw new YarnException("Target queue " + newQueue
+ " not found or is not a leaf queue.");
}
if (oldQueue.isRunnableApp(attempt)) {
verifyMoveDoesNotViolateConstraints(attempt, oldQueue, targetQueue);
}
} finally {
attempt.getWriteLock().unlock();
// Do not register the app on the new queue: lots of things can still
// change between this check and the real move and unregistering the move
// becomes a problem.
String destQueueName = handleMoveToPlanQueue(newQueue);
FSLeafQueue targetQueue = queueMgr.getLeafQueue(destQueueName, false);
if (targetQueue == null) {
throw new YarnException("Target queue " + newQueue
+ " not found or is not a leaf queue.");
}
} finally {
writeLock.unlock();