YARN-5932. Retrospect moveApplicationToQueue in align with YARN-5611. Contributed by Sunil G.
This commit is contained in:
parent
d2656dc5a6
commit
602c998443
|
@ -152,7 +152,6 @@ import org.apache.hadoop.yarn.server.resourcemanager.reservation.ReservationSyst
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.reservation.exceptions.PlanningException;
|
import org.apache.hadoop.yarn.server.resourcemanager.reservation.exceptions.PlanningException;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
|
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppKillByClientEvent;
|
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppKillByClientEvent;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppMoveEvent;
|
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppState;
|
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppState;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt;
|
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptEvent;
|
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptEvent;
|
||||||
|
@ -174,8 +173,6 @@ import org.apache.hadoop.yarn.util.Records;
|
||||||
import org.apache.hadoop.yarn.util.UTCClock;
|
import org.apache.hadoop.yarn.util.UTCClock;
|
||||||
|
|
||||||
import com.google.common.annotations.VisibleForTesting;
|
import com.google.common.annotations.VisibleForTesting;
|
||||||
import com.google.common.util.concurrent.Futures;
|
|
||||||
import com.google.common.util.concurrent.SettableFuture;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The client interface to the Resource Manager. This module handles all the rpc
|
* The client interface to the Resource Manager. This module handles all the rpc
|
||||||
|
@ -1166,23 +1163,19 @@ public class ClientRMService extends AbstractService implements
|
||||||
}
|
}
|
||||||
|
|
||||||
// Moves only allowed when app is in a state that means it is tracked by
|
// Moves only allowed when app is in a state that means it is tracked by
|
||||||
// the scheduler
|
// the scheduler. Introducing SUBMITTED state also to this list as there
|
||||||
if (EnumSet.of(RMAppState.NEW, RMAppState.NEW_SAVING, RMAppState.FAILED,
|
// could be a corner scenario that app may not be in Scheduler in SUBMITTED
|
||||||
RMAppState.FINAL_SAVING, RMAppState.FINISHING, RMAppState.FINISHED,
|
// state.
|
||||||
RMAppState.KILLED, RMAppState.KILLING, RMAppState.FAILED)
|
if (!ACTIVE_APP_STATES.contains(application.getState())) {
|
||||||
.contains(application.getState())) {
|
|
||||||
String msg = "App in " + application.getState() + " state cannot be moved.";
|
String msg = "App in " + application.getState() + " state cannot be moved.";
|
||||||
RMAuditLogger.logFailure(callerUGI.getShortUserName(),
|
RMAuditLogger.logFailure(callerUGI.getShortUserName(),
|
||||||
AuditConstants.MOVE_APP_REQUEST, "UNKNOWN", "ClientRMService", msg);
|
AuditConstants.MOVE_APP_REQUEST, "UNKNOWN", "ClientRMService", msg);
|
||||||
throw new YarnException(msg);
|
throw new YarnException(msg);
|
||||||
}
|
}
|
||||||
|
|
||||||
SettableFuture<Object> future = SettableFuture.create();
|
|
||||||
this.rmContext.getDispatcher().getEventHandler().handle(
|
|
||||||
new RMAppMoveEvent(applicationId, request.getTargetQueue(), future));
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
Futures.get(future, YarnException.class);
|
this.rmAppManager.moveApplicationAcrossQueue(applicationId,
|
||||||
|
request.getTargetQueue());
|
||||||
} catch (YarnException ex) {
|
} catch (YarnException ex) {
|
||||||
RMAuditLogger.logFailure(callerUGI.getShortUserName(),
|
RMAuditLogger.logFailure(callerUGI.getShortUserName(),
|
||||||
AuditConstants.MOVE_APP_REQUEST, "UNKNOWN", "ClientRMService",
|
AuditConstants.MOVE_APP_REQUEST, "UNKNOWN", "ClientRMService",
|
||||||
|
|
|
@ -490,17 +490,26 @@ public class RMAppManager implements EventHandler<RMAppManagerEvent>,
|
||||||
ApplicationId applicationId = event.getApplicationId();
|
ApplicationId applicationId = event.getApplicationId();
|
||||||
LOG.debug("RMAppManager processing event for "
|
LOG.debug("RMAppManager processing event for "
|
||||||
+ applicationId + " of type " + event.getType());
|
+ applicationId + " of type " + event.getType());
|
||||||
switch(event.getType()) {
|
switch (event.getType()) {
|
||||||
case APP_COMPLETED:
|
case APP_COMPLETED :
|
||||||
{
|
finishApplication(applicationId);
|
||||||
finishApplication(applicationId);
|
logApplicationSummary(applicationId);
|
||||||
logApplicationSummary(applicationId);
|
checkAppNumCompletedLimit();
|
||||||
checkAppNumCompletedLimit();
|
break;
|
||||||
|
case APP_MOVE :
|
||||||
|
// moveAllApps from scheduler will fire this event for each of
|
||||||
|
// those applications which needed to be moved to a new queue.
|
||||||
|
// Use the standard move application api to do the same.
|
||||||
|
try {
|
||||||
|
moveApplicationAcrossQueue(applicationId,
|
||||||
|
event.getTargetQueueForMove());
|
||||||
|
} catch (YarnException e) {
|
||||||
|
LOG.warn("Move Application has failed: " + e.getMessage());
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
default:
|
default :
|
||||||
LOG.error("Invalid eventtype " + event.getType() + ". Ignoring!");
|
LOG.error("Invalid eventtype " + event.getType() + ". Ignoring!");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// transaction method.
|
// transaction method.
|
||||||
|
@ -579,4 +588,87 @@ public class RMAppManager implements EventHandler<RMAppManagerEvent>,
|
||||||
rmContext.getSystemMetricsPublisher().appUpdated(app,
|
rmContext.getSystemMetricsPublisher().appUpdated(app,
|
||||||
System.currentTimeMillis());
|
System.currentTimeMillis());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* moveToQueue will invoke scheduler api to perform move queue operation.
|
||||||
|
*
|
||||||
|
* @param applicationId
|
||||||
|
* Application Id.
|
||||||
|
* @param targetQueue
|
||||||
|
* Target queue to which this app has to be moved.
|
||||||
|
* @throws YarnException
|
||||||
|
* Handle exceptions.
|
||||||
|
*/
|
||||||
|
public void moveApplicationAcrossQueue(ApplicationId applicationId, String targetQueue)
|
||||||
|
throws YarnException {
|
||||||
|
RMApp app = this.rmContext.getRMApps().get(applicationId);
|
||||||
|
|
||||||
|
// Capacity scheduler will directly follow below approach.
|
||||||
|
// 1. Do a pre-validate check to ensure that changes are fine.
|
||||||
|
// 2. Update this information to state-store
|
||||||
|
// 3. Perform real move operation and update in-memory data structures.
|
||||||
|
synchronized (applicationId) {
|
||||||
|
if (app.isAppInCompletedStates()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
String sourceQueue = app.getQueue();
|
||||||
|
// 1. pre-validate move application request to check for any access
|
||||||
|
// violations or other errors. If there are any violations, YarnException
|
||||||
|
// will be thrown.
|
||||||
|
rmContext.getScheduler().preValidateMoveApplication(applicationId,
|
||||||
|
targetQueue);
|
||||||
|
|
||||||
|
// 2. Update to state store with new queue and throw exception is failed.
|
||||||
|
updateAppDataToStateStore(targetQueue, app, false);
|
||||||
|
|
||||||
|
// 3. Perform the real move application
|
||||||
|
String queue = "";
|
||||||
|
try {
|
||||||
|
queue = rmContext.getScheduler().moveApplication(applicationId,
|
||||||
|
targetQueue);
|
||||||
|
} catch (YarnException e) {
|
||||||
|
// Revert to source queue since in-memory move has failed. Chances
|
||||||
|
// of this is very rare as we have already done the pre-validation.
|
||||||
|
updateAppDataToStateStore(sourceQueue, app, true);
|
||||||
|
throw e;
|
||||||
|
}
|
||||||
|
|
||||||
|
// update in-memory
|
||||||
|
if (queue != null && !queue.isEmpty()) {
|
||||||
|
app.setQueue(queue);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
rmContext.getSystemMetricsPublisher().appUpdated(app,
|
||||||
|
System.currentTimeMillis());
|
||||||
|
}
|
||||||
|
|
||||||
|
private void updateAppDataToStateStore(String queue, RMApp app,
|
||||||
|
boolean toSuppressException) throws YarnException {
|
||||||
|
// Create a future object to capture exceptions from StateStore.
|
||||||
|
SettableFuture<Object> future = SettableFuture.create();
|
||||||
|
|
||||||
|
// Update new queue in Submission Context to update to StateStore.
|
||||||
|
app.getApplicationSubmissionContext().setQueue(queue);
|
||||||
|
|
||||||
|
ApplicationStateData appState = ApplicationStateData.newInstance(
|
||||||
|
app.getSubmitTime(), app.getStartTime(),
|
||||||
|
app.getApplicationSubmissionContext(), app.getUser(),
|
||||||
|
app.getCallerContext());
|
||||||
|
appState.setApplicationTimeouts(app.getApplicationTimeouts());
|
||||||
|
rmContext.getStateStore().updateApplicationStateSynchronously(appState,
|
||||||
|
false, future);
|
||||||
|
|
||||||
|
try {
|
||||||
|
Futures.get(future, YarnException.class);
|
||||||
|
} catch (YarnException ex) {
|
||||||
|
if (!toSuppressException) {
|
||||||
|
throw ex;
|
||||||
|
}
|
||||||
|
LOG.error("Statestore update failed for move application '"
|
||||||
|
+ app.getApplicationId() + "' to queue '" + queue
|
||||||
|
+ "' with below exception:" + ex.getMessage());
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -24,13 +24,24 @@ import org.apache.hadoop.yarn.event.AbstractEvent;
|
||||||
public class RMAppManagerEvent extends AbstractEvent<RMAppManagerEventType> {
|
public class RMAppManagerEvent extends AbstractEvent<RMAppManagerEventType> {
|
||||||
|
|
||||||
private final ApplicationId appId;
|
private final ApplicationId appId;
|
||||||
|
private final String targetQueueForMove;
|
||||||
|
|
||||||
public RMAppManagerEvent(ApplicationId appId, RMAppManagerEventType type) {
|
public RMAppManagerEvent(ApplicationId appId, RMAppManagerEventType type) {
|
||||||
|
this(appId, "", type);
|
||||||
|
}
|
||||||
|
|
||||||
|
public RMAppManagerEvent(ApplicationId appId, String targetQueueForMove,
|
||||||
|
RMAppManagerEventType type) {
|
||||||
super(type);
|
super(type);
|
||||||
this.appId = appId;
|
this.appId = appId;
|
||||||
|
this.targetQueueForMove = targetQueueForMove;
|
||||||
}
|
}
|
||||||
|
|
||||||
public ApplicationId getApplicationId() {
|
public ApplicationId getApplicationId() {
|
||||||
return this.appId;
|
return this.appId;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String getTargetQueueForMove() {
|
||||||
|
return this.targetQueueForMove;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -19,5 +19,6 @@
|
||||||
package org.apache.hadoop.yarn.server.resourcemanager;
|
package org.apache.hadoop.yarn.server.resourcemanager;
|
||||||
|
|
||||||
public enum RMAppManagerEventType {
|
public enum RMAppManagerEventType {
|
||||||
APP_COMPLETED
|
APP_COMPLETED,
|
||||||
|
APP_MOVE
|
||||||
}
|
}
|
||||||
|
|
|
@ -23,7 +23,6 @@ public enum RMAppEventType {
|
||||||
START,
|
START,
|
||||||
RECOVER,
|
RECOVER,
|
||||||
KILL,
|
KILL,
|
||||||
MOVE, // Move app to a new queue
|
|
||||||
|
|
||||||
// Source: Scheduler and RMAppManager
|
// Source: Scheduler and RMAppManager
|
||||||
APP_REJECTED,
|
APP_REJECTED,
|
||||||
|
|
|
@ -71,7 +71,6 @@ import org.apache.hadoop.yarn.api.records.YarnApplicationState;
|
||||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||||
import org.apache.hadoop.yarn.event.Dispatcher;
|
import org.apache.hadoop.yarn.event.Dispatcher;
|
||||||
import org.apache.hadoop.yarn.event.EventHandler;
|
import org.apache.hadoop.yarn.event.EventHandler;
|
||||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
|
||||||
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
|
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
|
||||||
import org.apache.hadoop.yarn.security.AMRMTokenIdentifier;
|
import org.apache.hadoop.yarn.security.AMRMTokenIdentifier;
|
||||||
import org.apache.hadoop.yarn.security.client.ClientToAMTokenIdentifier;
|
import org.apache.hadoop.yarn.security.client.ClientToAMTokenIdentifier;
|
||||||
|
@ -241,14 +240,10 @@ public class RMAppImpl implements RMApp, Recoverable {
|
||||||
RMAppEventType.APP_REJECTED,
|
RMAppEventType.APP_REJECTED,
|
||||||
new FinalSavingTransition(new AppRejectedTransition(),
|
new FinalSavingTransition(new AppRejectedTransition(),
|
||||||
RMAppState.FAILED))
|
RMAppState.FAILED))
|
||||||
.addTransition(RMAppState.NEW_SAVING, RMAppState.NEW_SAVING,
|
|
||||||
RMAppEventType.MOVE, new RMAppMoveTransition())
|
|
||||||
|
|
||||||
// Transitions from SUBMITTED state
|
// Transitions from SUBMITTED state
|
||||||
.addTransition(RMAppState.SUBMITTED, RMAppState.SUBMITTED,
|
.addTransition(RMAppState.SUBMITTED, RMAppState.SUBMITTED,
|
||||||
RMAppEventType.NODE_UPDATE, new RMAppNodeUpdateTransition())
|
RMAppEventType.NODE_UPDATE, new RMAppNodeUpdateTransition())
|
||||||
.addTransition(RMAppState.SUBMITTED, RMAppState.SUBMITTED,
|
|
||||||
RMAppEventType.MOVE, new RMAppMoveTransition())
|
|
||||||
.addTransition(RMAppState.SUBMITTED, RMAppState.FINAL_SAVING,
|
.addTransition(RMAppState.SUBMITTED, RMAppState.FINAL_SAVING,
|
||||||
RMAppEventType.APP_REJECTED,
|
RMAppEventType.APP_REJECTED,
|
||||||
new FinalSavingTransition(
|
new FinalSavingTransition(
|
||||||
|
@ -263,8 +258,6 @@ public class RMAppImpl implements RMApp, Recoverable {
|
||||||
// Transitions from ACCEPTED state
|
// Transitions from ACCEPTED state
|
||||||
.addTransition(RMAppState.ACCEPTED, RMAppState.ACCEPTED,
|
.addTransition(RMAppState.ACCEPTED, RMAppState.ACCEPTED,
|
||||||
RMAppEventType.NODE_UPDATE, new RMAppNodeUpdateTransition())
|
RMAppEventType.NODE_UPDATE, new RMAppNodeUpdateTransition())
|
||||||
.addTransition(RMAppState.ACCEPTED, RMAppState.ACCEPTED,
|
|
||||||
RMAppEventType.MOVE, new RMAppMoveTransition())
|
|
||||||
.addTransition(RMAppState.ACCEPTED, RMAppState.RUNNING,
|
.addTransition(RMAppState.ACCEPTED, RMAppState.RUNNING,
|
||||||
RMAppEventType.ATTEMPT_REGISTERED, new RMAppStateUpdateTransition(
|
RMAppEventType.ATTEMPT_REGISTERED, new RMAppStateUpdateTransition(
|
||||||
YarnApplicationState.RUNNING))
|
YarnApplicationState.RUNNING))
|
||||||
|
@ -290,8 +283,6 @@ public class RMAppImpl implements RMApp, Recoverable {
|
||||||
// Transitions from RUNNING state
|
// Transitions from RUNNING state
|
||||||
.addTransition(RMAppState.RUNNING, RMAppState.RUNNING,
|
.addTransition(RMAppState.RUNNING, RMAppState.RUNNING,
|
||||||
RMAppEventType.NODE_UPDATE, new RMAppNodeUpdateTransition())
|
RMAppEventType.NODE_UPDATE, new RMAppNodeUpdateTransition())
|
||||||
.addTransition(RMAppState.RUNNING, RMAppState.RUNNING,
|
|
||||||
RMAppEventType.MOVE, new RMAppMoveTransition())
|
|
||||||
.addTransition(RMAppState.RUNNING, RMAppState.FINAL_SAVING,
|
.addTransition(RMAppState.RUNNING, RMAppState.FINAL_SAVING,
|
||||||
RMAppEventType.ATTEMPT_UNREGISTERED,
|
RMAppEventType.ATTEMPT_UNREGISTERED,
|
||||||
new FinalSavingTransition(
|
new FinalSavingTransition(
|
||||||
|
@ -324,7 +315,7 @@ public class RMAppImpl implements RMApp, Recoverable {
|
||||||
// ignorable transitions
|
// ignorable transitions
|
||||||
.addTransition(RMAppState.FINAL_SAVING, RMAppState.FINAL_SAVING,
|
.addTransition(RMAppState.FINAL_SAVING, RMAppState.FINAL_SAVING,
|
||||||
EnumSet.of(RMAppEventType.NODE_UPDATE, RMAppEventType.KILL,
|
EnumSet.of(RMAppEventType.NODE_UPDATE, RMAppEventType.KILL,
|
||||||
RMAppEventType.APP_NEW_SAVED, RMAppEventType.MOVE))
|
RMAppEventType.APP_NEW_SAVED))
|
||||||
|
|
||||||
// Transitions from FINISHING state
|
// Transitions from FINISHING state
|
||||||
.addTransition(RMAppState.FINISHING, RMAppState.FINISHED,
|
.addTransition(RMAppState.FINISHING, RMAppState.FINISHED,
|
||||||
|
@ -337,7 +328,7 @@ public class RMAppImpl implements RMApp, Recoverable {
|
||||||
EnumSet.of(RMAppEventType.NODE_UPDATE,
|
EnumSet.of(RMAppEventType.NODE_UPDATE,
|
||||||
// ignore Kill/Move as we have already saved the final Finished state
|
// ignore Kill/Move as we have already saved the final Finished state
|
||||||
// in state store.
|
// in state store.
|
||||||
RMAppEventType.KILL, RMAppEventType.MOVE))
|
RMAppEventType.KILL))
|
||||||
|
|
||||||
// Transitions from KILLING state
|
// Transitions from KILLING state
|
||||||
.addTransition(RMAppState.KILLING, RMAppState.KILLING,
|
.addTransition(RMAppState.KILLING, RMAppState.KILLING,
|
||||||
|
@ -365,7 +356,7 @@ public class RMAppImpl implements RMApp, Recoverable {
|
||||||
RMAppEventType.NODE_UPDATE,
|
RMAppEventType.NODE_UPDATE,
|
||||||
RMAppEventType.ATTEMPT_REGISTERED,
|
RMAppEventType.ATTEMPT_REGISTERED,
|
||||||
RMAppEventType.APP_UPDATE_SAVED,
|
RMAppEventType.APP_UPDATE_SAVED,
|
||||||
RMAppEventType.KILL, RMAppEventType.MOVE))
|
RMAppEventType.KILL))
|
||||||
|
|
||||||
// Transitions from FINISHED state
|
// Transitions from FINISHED state
|
||||||
// ignorable transitions
|
// ignorable transitions
|
||||||
|
@ -377,7 +368,7 @@ public class RMAppImpl implements RMApp, Recoverable {
|
||||||
RMAppEventType.NODE_UPDATE,
|
RMAppEventType.NODE_UPDATE,
|
||||||
RMAppEventType.ATTEMPT_UNREGISTERED,
|
RMAppEventType.ATTEMPT_UNREGISTERED,
|
||||||
RMAppEventType.ATTEMPT_FINISHED,
|
RMAppEventType.ATTEMPT_FINISHED,
|
||||||
RMAppEventType.KILL, RMAppEventType.MOVE))
|
RMAppEventType.KILL))
|
||||||
|
|
||||||
// Transitions from FAILED state
|
// Transitions from FAILED state
|
||||||
// ignorable transitions
|
// ignorable transitions
|
||||||
|
@ -385,8 +376,7 @@ public class RMAppImpl implements RMApp, Recoverable {
|
||||||
RMAppEventType.APP_RUNNING_ON_NODE,
|
RMAppEventType.APP_RUNNING_ON_NODE,
|
||||||
new AppRunningOnNodeTransition())
|
new AppRunningOnNodeTransition())
|
||||||
.addTransition(RMAppState.FAILED, RMAppState.FAILED,
|
.addTransition(RMAppState.FAILED, RMAppState.FAILED,
|
||||||
EnumSet.of(RMAppEventType.KILL, RMAppEventType.NODE_UPDATE,
|
EnumSet.of(RMAppEventType.KILL, RMAppEventType.NODE_UPDATE))
|
||||||
RMAppEventType.MOVE))
|
|
||||||
|
|
||||||
// Transitions from KILLED state
|
// Transitions from KILLED state
|
||||||
// ignorable transitions
|
// ignorable transitions
|
||||||
|
@ -399,7 +389,7 @@ public class RMAppImpl implements RMApp, Recoverable {
|
||||||
EnumSet.of(RMAppEventType.APP_ACCEPTED,
|
EnumSet.of(RMAppEventType.APP_ACCEPTED,
|
||||||
RMAppEventType.APP_REJECTED, RMAppEventType.KILL,
|
RMAppEventType.APP_REJECTED, RMAppEventType.KILL,
|
||||||
RMAppEventType.ATTEMPT_FINISHED, RMAppEventType.ATTEMPT_FAILED,
|
RMAppEventType.ATTEMPT_FINISHED, RMAppEventType.ATTEMPT_FAILED,
|
||||||
RMAppEventType.NODE_UPDATE, RMAppEventType.MOVE))
|
RMAppEventType.NODE_UPDATE))
|
||||||
|
|
||||||
.installTopology();
|
.installTopology();
|
||||||
|
|
||||||
|
@ -992,32 +982,6 @@ public class RMAppImpl implements RMApp, Recoverable {
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Move an app to a new queue.
|
|
||||||
* This transition must set the result on the Future in the RMAppMoveEvent,
|
|
||||||
* either as an exception for failure or null for success, or the client will
|
|
||||||
* be left waiting forever.
|
|
||||||
*/
|
|
||||||
private static final class RMAppMoveTransition extends RMAppTransition {
|
|
||||||
public void transition(RMAppImpl app, RMAppEvent event) {
|
|
||||||
RMAppMoveEvent moveEvent = (RMAppMoveEvent) event;
|
|
||||||
try {
|
|
||||||
app.queue = app.scheduler.moveApplication(app.applicationId,
|
|
||||||
moveEvent.getTargetQueue());
|
|
||||||
} catch (YarnException ex) {
|
|
||||||
moveEvent.getResult().setException(ex);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
app.rmContext.getSystemMetricsPublisher().appUpdated(app,
|
|
||||||
app.systemClock.getTime());
|
|
||||||
|
|
||||||
// TODO: Write out change to state store (YARN-1558)
|
|
||||||
// Also take care of RM failover
|
|
||||||
moveEvent.getResult().set(null);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// synchronously recover attempt to ensure any incoming external events
|
// synchronously recover attempt to ensure any incoming external events
|
||||||
// to be processed after the attempt processes the recover event.
|
// to be processed after the attempt processes the recover event.
|
||||||
private void recoverAppAttempts() {
|
private void recoverAppAttempts() {
|
||||||
|
|
|
@ -1,44 +0,0 @@
|
||||||
/**
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one
|
|
||||||
* or more contributor license agreements. See the NOTICE file
|
|
||||||
* distributed with this work for additional information
|
|
||||||
* regarding copyright ownership. The ASF licenses this file
|
|
||||||
* to you under the Apache License, Version 2.0 (the
|
|
||||||
* "License"); you may not use this file except in compliance
|
|
||||||
* with the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package org.apache.hadoop.yarn.server.resourcemanager.rmapp;
|
|
||||||
|
|
||||||
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
|
||||||
|
|
||||||
import com.google.common.util.concurrent.SettableFuture;
|
|
||||||
|
|
||||||
public class RMAppMoveEvent extends RMAppEvent {
|
|
||||||
private String targetQueue;
|
|
||||||
private SettableFuture<Object> result;
|
|
||||||
|
|
||||||
public RMAppMoveEvent(ApplicationId id, String newQueue,
|
|
||||||
SettableFuture<Object> resultFuture) {
|
|
||||||
super(id, RMAppEventType.MOVE);
|
|
||||||
this.targetQueue = newQueue;
|
|
||||||
this.result = resultFuture;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String getTargetQueue() {
|
|
||||||
return targetQueue;
|
|
||||||
}
|
|
||||||
|
|
||||||
public SettableFuture<Object> getResult() {
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -56,6 +56,8 @@ import org.apache.hadoop.yarn.exceptions.InvalidResourceRequestException;
|
||||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||||
import org.apache.hadoop.yarn.proto.YarnServiceProtos.SchedulerResourceTypes;
|
import org.apache.hadoop.yarn.proto.YarnServiceProtos.SchedulerResourceTypes;
|
||||||
import org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus;
|
import org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus;
|
||||||
|
import org.apache.hadoop.yarn.server.resourcemanager.RMAppManagerEvent;
|
||||||
|
import org.apache.hadoop.yarn.server.resourcemanager.RMAppManagerEventType;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger;
|
import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger.AuditConstants;
|
import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger.AuditConstants;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
|
import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
|
||||||
|
@ -63,7 +65,6 @@ import org.apache.hadoop.yarn.server.resourcemanager.ResourceManager;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
|
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppEvent;
|
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppEvent;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppEventType;
|
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppEventType;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppMoveEvent;
|
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt;
|
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
|
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEventType;
|
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEventType;
|
||||||
|
@ -359,6 +360,13 @@ public abstract class AbstractYarnScheduler
|
||||||
+ " does not support moving apps between queues");
|
+ " does not support moving apps between queues");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void preValidateMoveApplication(ApplicationId appId,
|
||||||
|
String newQueue) throws YarnException {
|
||||||
|
throw new YarnException(getClass().getSimpleName()
|
||||||
|
+ " does not support pre-validation of moving apps between queues");
|
||||||
|
}
|
||||||
|
|
||||||
public void removeQueue(String queueName) throws YarnException {
|
public void removeQueue(String queueName) throws YarnException {
|
||||||
throw new YarnException(getClass().getSimpleName()
|
throw new YarnException(getClass().getSimpleName()
|
||||||
+ " does not support removing queues");
|
+ " does not support removing queues");
|
||||||
|
@ -672,10 +680,10 @@ public abstract class AbstractYarnScheduler
|
||||||
throw new YarnException(errMsg);
|
throw new YarnException(errMsg);
|
||||||
}
|
}
|
||||||
// generate move events for each pending/running app
|
// generate move events for each pending/running app
|
||||||
for (ApplicationAttemptId app : apps) {
|
for (ApplicationAttemptId appAttemptId : apps) {
|
||||||
SettableFuture<Object> future = SettableFuture.create();
|
this.rmContext.getDispatcher().getEventHandler()
|
||||||
this.rmContext.getDispatcher().getEventHandler().handle(
|
.handle(new RMAppManagerEvent(appAttemptId.getApplicationId(),
|
||||||
new RMAppMoveEvent(app.getApplicationId(), destQueue, future));
|
destQueue, RMAppManagerEventType.APP_MOVE));
|
||||||
}
|
}
|
||||||
} finally {
|
} finally {
|
||||||
writeLock.unlock();
|
writeLock.unlock();
|
||||||
|
|
|
@ -229,6 +229,17 @@ public interface YarnScheduler extends EventHandler<SchedulerEvent> {
|
||||||
public String moveApplication(ApplicationId appId, String newQueue)
|
public String moveApplication(ApplicationId appId, String newQueue)
|
||||||
throws YarnException;
|
throws YarnException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param appId Application ID
|
||||||
|
* @param newQueue Target QueueName
|
||||||
|
* @throws YarnException if the pre-validation for move cannot be carried out
|
||||||
|
*/
|
||||||
|
@LimitedPrivate("yarn")
|
||||||
|
@Evolving
|
||||||
|
public void preValidateMoveApplication(ApplicationId appId,
|
||||||
|
String newQueue) throws YarnException;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Completely drain sourceQueue of applications, by moving all of them to
|
* Completely drain sourceQueue of applications, by moving all of them to
|
||||||
* destQueue.
|
* destQueue.
|
||||||
|
|
|
@ -32,8 +32,10 @@ import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
import org.apache.hadoop.classification.InterfaceAudience.Private;
|
import org.apache.hadoop.classification.InterfaceAudience.Private;
|
||||||
import org.apache.hadoop.ipc.Server;
|
import org.apache.hadoop.ipc.Server;
|
||||||
|
import org.apache.hadoop.security.AccessControlException;
|
||||||
import org.apache.hadoop.security.UserGroupInformation;
|
import org.apache.hadoop.security.UserGroupInformation;
|
||||||
import org.apache.hadoop.security.authorize.AccessControlList;
|
import org.apache.hadoop.security.authorize.AccessControlList;
|
||||||
|
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
||||||
import org.apache.hadoop.yarn.api.records.Priority;
|
import org.apache.hadoop.yarn.api.records.Priority;
|
||||||
import org.apache.hadoop.yarn.api.records.QueueACL;
|
import org.apache.hadoop.yarn.api.records.QueueACL;
|
||||||
import org.apache.hadoop.yarn.api.records.QueueInfo;
|
import org.apache.hadoop.yarn.api.records.QueueInfo;
|
||||||
|
@ -67,6 +69,7 @@ import org.apache.hadoop.yarn.util.resource.Resources;
|
||||||
import com.google.common.collect.Sets;
|
import com.google.common.collect.Sets;
|
||||||
|
|
||||||
public abstract class AbstractCSQueue implements CSQueue {
|
public abstract class AbstractCSQueue implements CSQueue {
|
||||||
|
|
||||||
private static final Log LOG = LogFactory.getLog(AbstractCSQueue.class);
|
private static final Log LOG = LogFactory.getLog(AbstractCSQueue.class);
|
||||||
volatile CSQueue parent;
|
volatile CSQueue parent;
|
||||||
final String queueName;
|
final String queueName;
|
||||||
|
@ -837,4 +840,10 @@ public abstract class AbstractCSQueue implements CSQueue {
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void validateSubmitApplication(ApplicationId applicationId,
|
||||||
|
String userName, String queue) throws AccessControlException {
|
||||||
|
// Dummy implementation
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -362,4 +362,14 @@ extends org.apache.hadoop.yarn.server.resourcemanager.scheduler.Queue {
|
||||||
* @return readLock of corresponding queue.
|
* @return readLock of corresponding queue.
|
||||||
*/
|
*/
|
||||||
public ReentrantReadWriteLock.ReadLock getReadLock();
|
public ReentrantReadWriteLock.ReadLock getReadLock();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Validate submitApplication api so that moveApplication do a pre-check.
|
||||||
|
* @param applicationId Application ID
|
||||||
|
* @param userName User Name
|
||||||
|
* @param queue Queue Name
|
||||||
|
* @throws AccessControlException if any acl violation is there.
|
||||||
|
*/
|
||||||
|
public void validateSubmitApplication(ApplicationId applicationId,
|
||||||
|
String userName, String queue) throws AccessControlException;
|
||||||
}
|
}
|
||||||
|
|
|
@ -2050,9 +2050,8 @@ public class CapacityScheduler extends
|
||||||
sourceQueueName);
|
sourceQueueName);
|
||||||
String destQueueName = handleMoveToPlanQueue(targetQueueName);
|
String destQueueName = handleMoveToPlanQueue(targetQueueName);
|
||||||
LeafQueue dest = this.queueManager.getAndCheckLeafQueue(destQueueName);
|
LeafQueue dest = this.queueManager.getAndCheckLeafQueue(destQueueName);
|
||||||
// Validation check - ACLs, submission limits for user & queue
|
|
||||||
String user = app.getUser();
|
String user = app.getUser();
|
||||||
checkQueuePartition(app, dest);
|
|
||||||
try {
|
try {
|
||||||
dest.submitApplication(appId, user, destQueueName);
|
dest.submitApplication(appId, user, destQueueName);
|
||||||
} catch (AccessControlException e) {
|
} catch (AccessControlException e) {
|
||||||
|
@ -2080,6 +2079,30 @@ public class CapacityScheduler extends
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void preValidateMoveApplication(ApplicationId appId, String newQueue)
|
||||||
|
throws YarnException {
|
||||||
|
try {
|
||||||
|
writeLock.lock();
|
||||||
|
FiCaSchedulerApp app = getApplicationAttempt(
|
||||||
|
ApplicationAttemptId.newInstance(appId, 0));
|
||||||
|
String sourceQueueName = app.getQueue().getQueueName();
|
||||||
|
this.queueManager.getAndCheckLeafQueue(sourceQueueName);
|
||||||
|
String destQueueName = handleMoveToPlanQueue(newQueue);
|
||||||
|
LeafQueue dest = this.queueManager.getAndCheckLeafQueue(destQueueName);
|
||||||
|
// Validation check - ACLs, submission limits for user & queue
|
||||||
|
String user = app.getUser();
|
||||||
|
checkQueuePartition(app, dest);
|
||||||
|
try {
|
||||||
|
dest.validateSubmitApplication(appId, user, destQueueName);
|
||||||
|
} catch (AccessControlException e) {
|
||||||
|
throw new YarnException(e);
|
||||||
|
}
|
||||||
|
} finally {
|
||||||
|
writeLock.unlock();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Check application can be moved to queue with labels enabled. All labels in
|
* Check application can be moved to queue with labels enabled. All labels in
|
||||||
* application life time will be checked
|
* application life time will be checked
|
||||||
|
|
|
@ -573,6 +573,21 @@ public class LeafQueue extends AbstractCSQueue {
|
||||||
public void submitApplication(ApplicationId applicationId, String userName,
|
public void submitApplication(ApplicationId applicationId, String userName,
|
||||||
String queue) throws AccessControlException {
|
String queue) throws AccessControlException {
|
||||||
// Careful! Locking order is important!
|
// Careful! Locking order is important!
|
||||||
|
validateSubmitApplication(applicationId, userName, queue);
|
||||||
|
|
||||||
|
// Inform the parent queue
|
||||||
|
try {
|
||||||
|
getParent().submitApplication(applicationId, userName, queue);
|
||||||
|
} catch (AccessControlException ace) {
|
||||||
|
LOG.info("Failed to submit application to parent-queue: " +
|
||||||
|
getParent().getQueuePath(), ace);
|
||||||
|
throw ace;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public void validateSubmitApplication(ApplicationId applicationId,
|
||||||
|
String userName, String queue) throws AccessControlException {
|
||||||
try {
|
try {
|
||||||
writeLock.lock();
|
writeLock.lock();
|
||||||
// Check if the queue is accepting jobs
|
// Check if the queue is accepting jobs
|
||||||
|
@ -607,15 +622,13 @@ public class LeafQueue extends AbstractCSQueue {
|
||||||
writeLock.unlock();
|
writeLock.unlock();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Inform the parent queue
|
|
||||||
try {
|
try {
|
||||||
getParent().submitApplication(applicationId, userName, queue);
|
getParent().validateSubmitApplication(applicationId, userName, queue);
|
||||||
} catch (AccessControlException ace) {
|
} catch (AccessControlException ace) {
|
||||||
LOG.info("Failed to submit application to parent-queue: " +
|
LOG.info("Failed to submit application to parent-queue: " +
|
||||||
getParent().getQueuePath(), ace);
|
getParent().getQueuePath(), ace);
|
||||||
throw ace;
|
throw ace;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public Resource getAMResourceLimit() {
|
public Resource getAMResourceLimit() {
|
||||||
|
|
|
@ -340,16 +340,7 @@ public class ParentQueue extends AbstractCSQueue {
|
||||||
try {
|
try {
|
||||||
writeLock.lock();
|
writeLock.lock();
|
||||||
// Sanity check
|
// Sanity check
|
||||||
if (queue.equals(queueName)) {
|
validateSubmitApplication(applicationId, user, queue);
|
||||||
throw new AccessControlException(
|
|
||||||
"Cannot submit application " + "to non-leaf queue: " + queueName);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (state != QueueState.RUNNING) {
|
|
||||||
throw new AccessControlException("Queue " + getQueuePath()
|
|
||||||
+ " is STOPPED. Cannot accept submission of application: "
|
|
||||||
+ applicationId);
|
|
||||||
}
|
|
||||||
|
|
||||||
addApplication(applicationId, user);
|
addApplication(applicationId, user);
|
||||||
} finally {
|
} finally {
|
||||||
|
@ -369,6 +360,24 @@ public class ParentQueue extends AbstractCSQueue {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void validateSubmitApplication(ApplicationId applicationId,
|
||||||
|
String userName, String queue) throws AccessControlException {
|
||||||
|
try {
|
||||||
|
writeLock.lock();
|
||||||
|
if (queue.equals(queueName)) {
|
||||||
|
throw new AccessControlException(
|
||||||
|
"Cannot submit application " + "to non-leaf queue: " + queueName);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (state != QueueState.RUNNING) {
|
||||||
|
throw new AccessControlException("Queue " + getQueuePath()
|
||||||
|
+ " is STOPPED. Cannot accept submission of application: "
|
||||||
|
+ applicationId);
|
||||||
|
}
|
||||||
|
} finally {
|
||||||
|
writeLock.unlock();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void submitApplicationAttempt(FiCaSchedulerApp application,
|
public void submitApplicationAttempt(FiCaSchedulerApp application,
|
||||||
|
|
|
@ -1697,6 +1697,40 @@ public class FairScheduler extends
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void preValidateMoveApplication(ApplicationId appId, String newQueue)
|
||||||
|
throws YarnException {
|
||||||
|
try {
|
||||||
|
writeLock.lock();
|
||||||
|
SchedulerApplication<FSAppAttempt> app = applications.get(appId);
|
||||||
|
if (app == null) {
|
||||||
|
throw new YarnException("App to be moved " + appId + " not found.");
|
||||||
|
}
|
||||||
|
|
||||||
|
FSAppAttempt attempt = app.getCurrentAppAttempt();
|
||||||
|
// To serialize with FairScheduler#allocate, synchronize on app attempt
|
||||||
|
|
||||||
|
try {
|
||||||
|
attempt.getWriteLock().lock();
|
||||||
|
FSLeafQueue oldQueue = (FSLeafQueue) app.getQueue();
|
||||||
|
String destQueueName = handleMoveToPlanQueue(newQueue);
|
||||||
|
FSLeafQueue targetQueue = queueMgr.getLeafQueue(destQueueName, false);
|
||||||
|
if (targetQueue == null) {
|
||||||
|
throw new YarnException("Target queue " + newQueue
|
||||||
|
+ " not found or is not a leaf queue.");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (oldQueue.isRunnableApp(attempt)) {
|
||||||
|
verifyMoveDoesNotViolateConstraints(attempt, oldQueue, targetQueue);
|
||||||
|
}
|
||||||
|
} finally {
|
||||||
|
attempt.getWriteLock().unlock();
|
||||||
|
}
|
||||||
|
} finally {
|
||||||
|
writeLock.unlock();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private void verifyMoveDoesNotViolateConstraints(FSAppAttempt app,
|
private void verifyMoveDoesNotViolateConstraints(FSAppAttempt app,
|
||||||
FSLeafQueue oldQueue, FSLeafQueue targetQueue) throws YarnException {
|
FSLeafQueue oldQueue, FSLeafQueue targetQueue) throws YarnException {
|
||||||
String queueName = targetQueue.getQueueName();
|
String queueName = targetQueue.getQueueName();
|
||||||
|
|
|
@ -87,7 +87,7 @@ public class TestMoveApplication {
|
||||||
application.getApplicationId(), "newqueue"));
|
application.getApplicationId(), "newqueue"));
|
||||||
fail("Should have hit exception");
|
fail("Should have hit exception");
|
||||||
} catch (YarnException ex) {
|
} catch (YarnException ex) {
|
||||||
assertEquals("Move not supported", ex.getCause().getMessage());
|
assertEquals("Move not supported", ex.getMessage());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -178,5 +178,13 @@ public class TestMoveApplication {
|
||||||
QueueACL acl, String queueName) {
|
QueueACL acl, String queueName) {
|
||||||
return acl != QueueACL.ADMINISTER_QUEUE;
|
return acl != QueueACL.ADMINISTER_QUEUE;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void preValidateMoveApplication(ApplicationId appId, String newQueue)
|
||||||
|
throws YarnException {
|
||||||
|
if (failMove) {
|
||||||
|
throw new YarnException("Move not supported");
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -457,6 +457,7 @@ public class TestCapacitySchedulerNodeLabelUpdate {
|
||||||
CapacityScheduler scheduler =
|
CapacityScheduler scheduler =
|
||||||
((CapacityScheduler) rm.getResourceScheduler());
|
((CapacityScheduler) rm.getResourceScheduler());
|
||||||
try {
|
try {
|
||||||
|
scheduler.preValidateMoveApplication(app1.getApplicationId(), "a2");
|
||||||
scheduler.moveApplication(app1.getApplicationId(), "a2");
|
scheduler.moveApplication(app1.getApplicationId(), "a2");
|
||||||
fail("Should throw exception since target queue doesnt have "
|
fail("Should throw exception since target queue doesnt have "
|
||||||
+ "required labels");
|
+ "required labels");
|
||||||
|
|
Loading…
Reference in New Issue