YARN-1816. Fixed ResourceManager to get RMApp correctly handle ATTEMPT_FINISHED event at ACCEPTED state that can happen after RM restarts. Contributed by Jian He.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1576911 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Vinod Kumar Vavilapalli 2014-03-12 21:18:55 +00:00
parent 4de17c6052
commit 197217c95d
4 changed files with 84 additions and 9 deletions

View File

@ -457,6 +457,10 @@ Release 2.4.0 - UNRELEASED
and thus recover app itself synchronously and avoid races with resyncing and thus recover app itself synchronously and avoid races with resyncing
NodeManagers. (Jian He via vinodkv) NodeManagers. (Jian He via vinodkv)
YARN-1816. Fixed ResourceManager to get RMApp correctly handle
ATTEMPT_FINISHED event at ACCEPTED state that can happen after RM restarts.
(Jian He via vinodkv)
Release 2.3.1 - UNRELEASED Release 2.3.1 - UNRELEASED
INCOMPATIBLE CHANGES INCOMPATIBLE CHANGES

View File

@ -189,11 +189,14 @@ public class RMAppImpl implements RMApp, Recoverable {
RMAppEventType.ATTEMPT_REGISTERED) RMAppEventType.ATTEMPT_REGISTERED)
.addTransition(RMAppState.ACCEPTED, .addTransition(RMAppState.ACCEPTED,
EnumSet.of(RMAppState.ACCEPTED, RMAppState.FINAL_SAVING), EnumSet.of(RMAppState.ACCEPTED, RMAppState.FINAL_SAVING),
// ACCEPTED state is possible to receive ATTEMPT_FAILED event because // ACCEPTED state is possible to receive ATTEMPT_FAILED/ATTEMPT_FINISHED
// RMAppRecoveredTransition is returning ACCEPTED state directly and // event because RMAppRecoveredTransition is returning ACCEPTED state
// waiting for the previous AM to exit. // directly and waiting for the previous AM to exit.
RMAppEventType.ATTEMPT_FAILED, RMAppEventType.ATTEMPT_FAILED,
new AttemptFailedTransition(RMAppState.ACCEPTED)) new AttemptFailedTransition(RMAppState.ACCEPTED))
.addTransition(RMAppState.ACCEPTED, RMAppState.FINAL_SAVING,
RMAppEventType.ATTEMPT_FINISHED,
new FinalSavingTransition(FINISHED_TRANSITION, RMAppState.FINISHED))
.addTransition(RMAppState.ACCEPTED, RMAppState.KILLING, .addTransition(RMAppState.ACCEPTED, RMAppState.KILLING,
RMAppEventType.KILL, new KillAttemptTransition()) RMAppEventType.KILL, new KillAttemptTransition())
// ACCECPTED state can once again receive APP_ACCEPTED event, because on // ACCECPTED state can once again receive APP_ACCEPTED event, because on
@ -725,11 +728,7 @@ public class RMAppImpl implements RMApp, Recoverable {
@Override @Override
public RMAppState transition(RMAppImpl app, RMAppEvent event) { public RMAppState transition(RMAppImpl app, RMAppEvent event) {
/*
* If last attempt recovered final state is null .. it means attempt was
* started but AM container may or may not have started / finished.
* Therefore we should wait for it to finish.
*/
for (RMAppAttempt attempt : app.getAppAttempts().values()) { for (RMAppAttempt attempt : app.getAppAttempts().values()) {
// synchronously recover attempt to ensure any incoming external events // synchronously recover attempt to ensure any incoming external events
// to be processed after the attempt processes the recover event. // to be processed after the attempt processes the recover event.
@ -744,6 +743,17 @@ public class RMAppImpl implements RMApp, Recoverable {
return app.recoveredFinalState; return app.recoveredFinalState;
} }
// Last attempt is in final state, do not add to scheduler and just return
// ACCEPTED waiting for last RMAppAttempt to send finished or failed event
// back.
if (app.currentAttempt != null
&& (app.currentAttempt.getState() == RMAppAttemptState.KILLED
|| app.currentAttempt.getState() == RMAppAttemptState.FINISHED
|| (app.currentAttempt.getState() == RMAppAttemptState.FAILED
&& app.attempts.size() == app.maxAppAttempts))) {
return RMAppState.ACCEPTED;
}
// Notify scheduler about the app on recovery // Notify scheduler about the app on recovery
new AddApplicationToSchedulerTransition().transition(app, event); new AddApplicationToSchedulerTransition().transition(app, event);

View File

@ -872,6 +872,11 @@ public class RMAppAttemptImpl implements RMAppAttempt, Recoverable {
@Override @Override
public RMAppAttemptState transition(RMAppAttemptImpl appAttempt, public RMAppAttemptState transition(RMAppAttemptImpl appAttempt,
RMAppAttemptEvent event) { RMAppAttemptEvent event) {
/*
* If last attempt recovered final state is null .. it means attempt was
* started but AM container may or may not have started / finished.
* Therefore we should wait for it to finish.
*/
if (appAttempt.recoveredFinalState != null) { if (appAttempt.recoveredFinalState != null) {
appAttempt.progress = 1.0f; appAttempt.progress = 1.0f;
RMApp rmApp =appAttempt.rmContext.getRMApps().get( RMApp rmApp =appAttempt.rmContext.getRMApps().get(
@ -1598,7 +1603,7 @@ public class RMAppAttemptImpl implements RMAppAttempt, Recoverable {
ExitUtil.terminate(1, storeEvent.getStoredException()); ExitUtil.terminate(1, storeEvent.getStoredException());
} }
} }
private void storeAttempt() { private void storeAttempt() {
// store attempt data in a non-blocking manner to prevent dispatcher // store attempt data in a non-blocking manner to prevent dispatcher
// thread starvation and wait for state to be saved // thread starvation and wait for state to be saved

View File

@ -601,7 +601,63 @@ public class TestRMRestart {
RMAppAttemptState.SCHEDULED); RMAppAttemptState.SCHEDULED);
Assert.assertEquals(RMAppAttemptState.SCHEDULED, app2 Assert.assertEquals(RMAppAttemptState.SCHEDULED, app2
.getCurrentAppAttempt().getAppAttemptState()); .getCurrentAppAttempt().getAppAttemptState());
}
// Test RM restarts after previous attempt succeeded and was saved into state
// store but before the RMAppAttempt notifies RMApp that it has succeeded. On
// recovery, RMAppAttempt should send the AttemptFinished event to RMApp so
// that RMApp can recover its state.
@Test
public void testRMRestartWaitForPreviousSucceededAttempt() throws Exception {
conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 2);
MemoryRMStateStore memStore = new MemoryRMStateStore() {
int count = 0;
@Override
public void updateApplicationStateInternal(ApplicationId appId,
ApplicationStateDataPBImpl appStateData) throws Exception {
if (count == 0) {
// do nothing; simulate app final state is not saved.
LOG.info(appId + " final state is not saved.");
count++;
} else {
super.updateApplicationStateInternal(appId, appStateData);
}
}
};
memStore.init(conf);
RMState rmState = memStore.getState();
Map<ApplicationId, ApplicationState> rmAppState =
rmState.getApplicationState();
// start RM
MockRM rm1 = new MockRM(conf, memStore);
rm1.start();
MockNM nm1 = rm1.registerNode("127.0.0.1:1234", 15120);
RMApp app0 = rm1.submitApp(200);
MockAM am0 = MockRM.launchAndRegisterAM(app0, rm1, nm1);
FinishApplicationMasterRequest req =
FinishApplicationMasterRequest.newInstance(
FinalApplicationStatus.SUCCEEDED, "", "");
am0.unregisterAppAttempt(req, true);
am0.waitForState(RMAppAttemptState.FINISHING);
// app final state is not saved. This guarantees that RMApp cannot be
// recovered via its own saved state, but only via the event notification
// from the RMAppAttempt on recovery.
Assert.assertNull(rmAppState.get(app0.getApplicationId()).getState());
// start RM
MockRM rm2 = new MockRM(conf, memStore);
nm1.setResourceTrackerService(rm2.getResourceTrackerService());
rm2.start();
rm2.waitForState(app0.getCurrentAppAttempt().getAppAttemptId(),
RMAppAttemptState.FINISHED);
rm2.waitForState(app0.getApplicationId(), RMAppState.FINISHED);
// app final state is saved via the finish event from attempt.
Assert.assertEquals(RMAppState.FINISHED,
rmAppState.get(app0.getApplicationId()).getState());
} }
@Test @Test