YARN-7913. Improve error handling when application recovery fails with exception. Contributed by Wilfred Spiegelenburg

This commit is contained in:
Szilard Nemeth 2020-01-20 13:10:32 +01:00
parent 6d52bbbfcf
commit 581072a8f0
3 changed files with 181 additions and 17 deletions

View File

@ -480,27 +480,39 @@ protected void addApplication(ApplicationId applicationId,
try {
// Assign the app to the queue creating and prevent queue delete.
// This will re-create the queue on restore, however this could fail if
// the config was changed.
FSLeafQueue queue = queueMgr.getLeafQueue(queueName, true,
if (queue == null) {
queueName + " is not a leaf queue");
if (!isAppRecovering) {
queueName + " is not a leaf queue");
// The app is recovering: we do not want to fail it now, as it was there
// before we started the recovery. Add it to the recovery queue instead:
// a dynamic queue directly under root, no ACL needed (auto clean up).
queueName = "root.recovery";
queue = queueMgr.getLeafQueue(queueName, true, applicationId);
// Enforce ACLs: 2nd check, there could be a time lapse between the app
// creation in the RMAppManager and getting here. That means we could
// have a configuration change (prevent race condition)
UserGroupInformation userUgi = UserGroupInformation.createRemoteUser(
if (!queue.hasAccess(QueueACL.SUBMIT_APPLICATIONS, userUgi) &&
!queue.hasAccess(QueueACL.ADMINISTER_QUEUE, userUgi)) {
String msg = "User " + user + " does not have permission to submit " +
applicationId + " to queue " + queueName;
rejectApplicationWithMessage(applicationId, msg);
// Skip ACL check for recovering applications: they have been accepted
// in the queue already, and recovery should not break that.
if (!isAppRecovering) {
// Enforce ACLs: 2nd check, there could be a time lapse between the app
// creation in the RMAppManager and getting here. That means we could
// have a configuration change (prevent race condition)
UserGroupInformation userUgi = UserGroupInformation.createRemoteUser(
if (!queue.hasAccess(QueueACL.SUBMIT_APPLICATIONS, userUgi) &&
!queue.hasAccess(QueueACL.ADMINISTER_QUEUE, userUgi)) {
String msg = "User " + user + " does not have permission to submit "
+ applicationId + " to queue " + queueName;
rejectApplicationWithMessage(applicationId, msg);
RMApp rmApp = rmContext.getRMApps().get(applicationId);
@ -511,7 +523,11 @@ protected void addApplication(ApplicationId applicationId,
" to set queue name on");
if (rmApp != null && rmApp.getAMResourceRequests() != null) {
// When recovering, the NMs might not have registered yet and we could have
// no resources in the queue. The app is already running and has thus
// passed all these checks, so skip them now.
if (!isAppRecovering && rmApp != null &&
rmApp.getAMResourceRequests() != null) {
// Resources.fitsIn would always return false when queueMaxShare is 0
// for any resource, but only using Resources.fitsIn is not enough
// as it would return false for such cases when the requested

View File

@ -270,6 +270,30 @@ protected void createSchedulingRequestExistingApplication(
protected ApplicationAttemptId createRecoveringApplication(
Resource amResource, String queueId, String userId) {
ApplicationAttemptId id =
createAppAttemptId(this.APP_ID++, this.ATTEMPT_ID++);
// On restore the app is already created but we need to check the AM
// resource, make sure it is set for test
ResourceRequest amRequest = createResourceRequest(
// cast to int as we're not testing large values so it is safe
(int)amResource.getMemorySize(), amResource.getVirtualCores(),
ResourceRequest.ANY, 1, 1, true);
List<ResourceRequest> amReqs = new ArrayList<>();
createApplicationWithAMResourceInternal(id, queueId, userId, amResource,
// This fakes the placement which is not part of the scheduler anymore
ApplicationPlacementContext placementCtx =
new ApplicationPlacementContext(queueId);
scheduler.addApplication(id.getApplicationId(), queueId, userId, true,
return id;
protected void createApplicationWithAMResource(ApplicationAttemptId attId,
String queue, String user, Resource amResource) {
createApplicationWithAMResourceInternal(attId, queue, user, amResource,

View File

@ -5525,4 +5525,128 @@ private void testSchedulingAllowedToQueueZeroCapacityOfResource(
createSchedulingRequest(memory, vCores, "queueA", "user1", 1, 2);
public void testRestoreToExistingQueue() throws IOException {
conf.set(FairSchedulerConfiguration.ALLOCATION_FILE, ALLOC_FILE);
scheduler.reinitialize(conf, resourceManager.getRMContext());
// no nodes so the resource total should be zero for all queues
// AM is using resources
Resource amResource = Resources.createResource(1024, 1);
// Add the app and the attempt
ApplicationAttemptId appAttemptId = null;
String queueId = "root.parent.queueA";
try {
appAttemptId = createRecoveringApplication(amResource, queueId, "user1");
} catch (Exception e) {
fail("The exception is not expected. Exception message: "
+ e.getMessage());
scheduler.addApplicationAttempt(appAttemptId, false, true);
List<ApplicationAttemptId> appsInQueue =
assertEquals("Number of apps in queue 'root.parent.queueA' should be one!",
1, appsInQueue.size());
appAttemptId = scheduler.getAppsInQueue(queueId).get(0);
assertNotNull("Scheduler app for appAttemptId " + appAttemptId
+ " should not be null!", scheduler.getSchedulerApp(appAttemptId));
FSAppAttempt schedulerApp = scheduler.getSchedulerApp(appAttemptId);
assertNotNull("Scheduler app queueInfo for appAttemptId " + appAttemptId
+ " should not be null!", schedulerApp.getAppSchedulingInfo());
assertTrue("There should be no requests accepted", schedulerApp
// Restore an application with a user that has no access to the queue
try {
appAttemptId = createRecoveringApplication(amResource, queueId,
} catch (Exception e) {
fail("The exception is not expected. Exception message: "
+ e.getMessage());
scheduler.addApplicationAttempt(appAttemptId, false, true);
appsInQueue = scheduler.getAppsInQueue(queueId);
assertEquals("Number of apps in queue 'root.parent.queueA' should be two!",
2, appsInQueue.size());
appAttemptId = scheduler.getAppsInQueue(queueId).get(1);
assertNotNull("Scheduler app for appAttemptId " + appAttemptId
+ " should not be null!", scheduler.getSchedulerApp(appAttemptId));
schedulerApp = scheduler.getSchedulerApp(appAttemptId);
assertNotNull("Scheduler app queueInfo for appAttemptId " + appAttemptId
+ " should not be null!", schedulerApp.getAppSchedulingInfo());
assertTrue("There should be no requests accepted", schedulerApp
public void testRestoreToParentQueue() throws IOException {
conf.set(FairSchedulerConfiguration.ALLOCATION_FILE, ALLOC_FILE);
scheduler.reinitialize(conf, resourceManager.getRMContext());
// no nodes so the resource total should be zero for all queues
// AM is using resources
Resource amResource = Resources.createResource(1024, 1);
// Add the app and the attempt, use a queue that is now a parent
ApplicationAttemptId appAttemptId = null;
String queueId = "root.parent";
try {
appAttemptId = createRecoveringApplication(amResource, queueId, "user1");
} catch (Exception e) {
fail("The exception is not expected. Exception message: "
+ e.getMessage());
scheduler.addApplicationAttempt(appAttemptId, false, true);
String recoveredQueue = "root.recovery";
List<ApplicationAttemptId> appsInQueue =
assertEquals("Number of apps in queue 'root.recovery' should be one!",
1, appsInQueue.size());
appAttemptId =
assertNotNull("Scheduler app for appAttemptId " + appAttemptId
+ " should not be null!", scheduler.getSchedulerApp(appAttemptId));
FSAppAttempt schedulerApp = scheduler.getSchedulerApp(appAttemptId);
assertNotNull("Scheduler app queueInfo for appAttemptId " + appAttemptId
+ " should not be null!", schedulerApp.getAppSchedulingInfo());
assertTrue("There should be no requests accepted", schedulerApp
private void generateAllocationFilePercentageResource() {
.addQueue(new AllocationFileQueue.Builder("root")
.aclSubmitApps(" ")
.aclAdministerApps(" ")
.subQueue(new AllocationFileQueue.Builder("parent")
.maxChildResources("memory-mb=15.0%, vcores=15.0%")
.subQueue(new AllocationFileQueue.Builder("queueA")