YARN-4347. Resource manager fails with Null pointer exception. (Jian He via wangda)

This commit is contained in:
Wangda Tan 2015-11-12 11:23:40 -08:00
parent 796638d9bc
commit 7f55a18071
4 changed files with 67 additions and 4 deletions

View File

@ -1065,6 +1065,8 @@ Release 2.7.3 - UNRELEASED
BUG FIXES BUG FIXES
YARN-4347. Resource manager fails with Null pointer exception. (Jian He via wangda)
Release 2.7.2 - UNRELEASED Release 2.7.2 - UNRELEASED
INCOMPATIBLE CHANGES INCOMPATIBLE CHANGES

View File

@ -1382,7 +1382,7 @@ public class RMAppImpl implements RMApp, Recoverable {
|| appState == RMAppState.KILLED; || appState == RMAppState.KILLED;
} }
private RMAppState getRecoveredFinalState() { public RMAppState getRecoveredFinalState() {
return this.recoveredFinalState; return this.recoveredFinalState;
} }

View File

@ -83,6 +83,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppEvent;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppEventType; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppEventType;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppFailedAttemptEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppFailedAttemptEvent;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppState;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAttemptContainerFinishedEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAttemptContainerFinishedEvent;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAttemptRegistrationEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAttemptRegistrationEvent;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAttemptStatusupdateEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAttemptStatusupdateEvent;
@ -1102,6 +1103,9 @@ public class RMAppAttemptImpl implements RMAppAttempt, Recoverable {
@Override @Override
public RMAppAttemptState transition(RMAppAttemptImpl appAttempt, public RMAppAttemptState transition(RMAppAttemptImpl appAttempt,
RMAppAttemptEvent event) { RMAppAttemptEvent event) {
RMApp rmApp = appAttempt.rmContext.getRMApps().get(
appAttempt.getAppAttemptId().getApplicationId());
/* /*
* If last attempt recovered final state is null .. it means attempt was * If last attempt recovered final state is null .. it means attempt was
* started but AM container may or may not have started / finished. * started but AM container may or may not have started / finished.
@ -1109,8 +1113,6 @@ public class RMAppAttemptImpl implements RMAppAttempt, Recoverable {
*/ */
if (appAttempt.recoveredFinalState != null) { if (appAttempt.recoveredFinalState != null) {
appAttempt.progress = 1.0f; appAttempt.progress = 1.0f;
RMApp rmApp =appAttempt.rmContext.getRMApps().get(
appAttempt.getAppAttemptId().getApplicationId());
if (appAttempt.submissionContext if (appAttempt.submissionContext
.getKeepContainersAcrossApplicationAttempts() .getKeepContainersAcrossApplicationAttempts()
@ -1130,7 +1132,24 @@ public class RMAppAttemptImpl implements RMAppAttempt, Recoverable {
appAttempt, event); appAttempt, event);
} }
return appAttempt.recoveredFinalState; return appAttempt.recoveredFinalState;
} else { } else if (RMAppImpl.isAppInFinalState(rmApp)) {
// Somehow attempt final state was not saved but app final state was saved.
// Skip adding the attempt into scheduler
RMAppState appState = ((RMAppImpl) rmApp).getRecoveredFinalState();
LOG.warn(rmApp.getApplicationId() + " final state (" + appState
+ ") was recorded, but " + appAttempt.applicationAttemptId
+ " final state (" + appAttempt.recoveredFinalState
+ ") was not recorded.");
switch (appState) {
case FINISHED:
return RMAppAttemptState.FINISHED;
case FAILED:
return RMAppAttemptState.FAILED;
case KILLED:
return RMAppAttemptState.KILLED;
}
return RMAppAttemptState.FAILED;
} else{
// Add the current attempt to the scheduler. // Add the current attempt to the scheduler.
if (appAttempt.rmContext.isWorkPreservingRecoveryEnabled()) { if (appAttempt.rmContext.isWorkPreservingRecoveryEnabled()) {
// Need to register an app attempt before AM can register // Need to register an app attempt before AM can register
@ -1164,6 +1183,7 @@ public class RMAppAttemptImpl implements RMAppAttempt, Recoverable {
} }
} }
private void rememberTargetTransitions(RMAppAttemptEvent event, private void rememberTargetTransitions(RMAppAttemptEvent event,
Object transitionToDo, RMAppAttemptState targetFinalState) { Object transitionToDo, RMAppAttemptState targetFinalState) {
transitionTodo = transitionToDo; transitionTodo = transitionToDo;

View File

@ -59,6 +59,7 @@ import org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus;
import org.apache.hadoop.yarn.server.resourcemanager.TestRMRestart.TestSecurityMockRM; import org.apache.hadoop.yarn.server.resourcemanager.TestRMRestart.TestSecurityMockRM;
import org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore; import org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore;
import org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore.RMState; import org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore.RMState;
import org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationAttemptStateData;
import org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData; import org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData;
import org.apache.hadoop.yarn.server.resourcemanager.reservation.ReservationSystemTestUtil; import org.apache.hadoop.yarn.server.resourcemanager.reservation.ReservationSystemTestUtil;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
@ -1367,4 +1368,44 @@ public class TestWorkPreservingRMRestart extends ParameterizedSchedulerTestBase
assertEquals(1,loadedAttempt1.getJustFinishedContainers().size()); assertEquals(1,loadedAttempt1.getJustFinishedContainers().size());
} }
/**
 * Recovery scenario: the application's final state reached the state store
 * but the attempt's final state did not. A second RM started on the same
 * store must come up, report the application as FINISHED, and report its
 * current attempt as FINISHED too.
 */
@Test (timeout = 20000)
public void testAppStateSavedButAttemptStateNotSaved() throws Exception {
  // State store whose attempt-update hook is a no-op, so no attempt state
  // update ever reaches the store.
  MemoryRMStateStore stateStore = new MemoryRMStateStore() {
    @Override
    public synchronized void updateApplicationAttemptStateInternal(
        ApplicationAttemptId attemptId,
        ApplicationAttemptStateData attemptData) {
      // Intentionally empty: simulates losing the attempt's final state.
    }
  };
  stateStore.init(conf);

  // First RM: run one application through to completion.
  rm1 = new MockRM(conf, stateStore);
  rm1.start();
  MockNM nodeManager =
      new MockNM("127.0.0.1:1234", 15120, rm1.getResourceTrackerService());
  nodeManager.registerNode();
  RMApp submittedApp = rm1.submitApp(200);
  MockAM appMaster =
      MockRM.launchAndRegisterAM(submittedApp, rm1, nodeManager);
  MockRM.finishAMAndVerifyAppState(submittedApp, rm1, nodeManager, appMaster);

  // The store holds the app's final state ...
  ApplicationStateData storedApp = stateStore.getState()
      .getApplicationState().get(submittedApp.getApplicationId());
  assertEquals(RMAppState.FINISHED, storedApp.getState());
  // ... but no state was recorded for the attempt.
  ApplicationAttemptStateData storedAttempt =
      storedApp.getAttempt(appMaster.getApplicationAttemptId());
  assertNull(storedAttempt.getState());

  // Second RM: recover from the same store.
  rm2 = new MockRM(conf, stateStore);
  rm2.start();
  RMApp recoveredApp =
      rm2.getRMContext().getRMApps().get(submittedApp.getApplicationId());
  assertEquals(RMAppState.FINISHED, recoveredApp.getState());
  // The recovered attempt must also end up FINISHED.
  assertEquals(RMAppAttemptState.FINISHED,
      recoveredApp.getCurrentAppAttempt().getState());
}
} }