YARN-4347. Resource manager fails with Null pointer exception. (Jian He via wangda)
This commit is contained in:
parent
adede3e53d
commit
ddce4c8245
|
@ -18,6 +18,8 @@ Release 2.7.3 - UNRELEASED
|
||||||
YARN-3840. Resource Manager web ui issue when sorting application by id (with
|
YARN-3840. Resource Manager web ui issue when sorting application by id (with
|
||||||
application having id > 9999) (Mohammad Shahid Khan via jianhe)
|
application having id > 9999) (Mohammad Shahid Khan via jianhe)
|
||||||
|
|
||||||
|
YARN-4347. Resource manager fails with Null pointer exception. (Jian He via wangda)
|
||||||
|
|
||||||
Release 2.7.2 - UNRELEASED
|
Release 2.7.2 - UNRELEASED
|
||||||
|
|
||||||
INCOMPATIBLE CHANGES
|
INCOMPATIBLE CHANGES
|
||||||
|
|
|
@ -1282,7 +1282,7 @@ public class RMAppImpl implements RMApp, Recoverable {
|
||||||
|| appState == RMAppState.KILLED;
|
|| appState == RMAppState.KILLED;
|
||||||
}
|
}
|
||||||
|
|
||||||
private RMAppState getRecoveredFinalState() {
|
public RMAppState getRecoveredFinalState() {
|
||||||
return this.recoveredFinalState;
|
return this.recoveredFinalState;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -81,6 +81,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppEvent;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppEventType;
|
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppEventType;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppFailedAttemptEvent;
|
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppFailedAttemptEvent;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl;
|
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl;
|
||||||
|
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppState;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAttemptContainerFinishedEvent;
|
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAttemptContainerFinishedEvent;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAttemptRegistrationEvent;
|
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAttemptRegistrationEvent;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAttemptStatusupdateEvent;
|
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAttemptStatusupdateEvent;
|
||||||
|
@ -1039,6 +1040,9 @@ public class RMAppAttemptImpl implements RMAppAttempt, Recoverable {
|
||||||
@Override
|
@Override
|
||||||
public RMAppAttemptState transition(RMAppAttemptImpl appAttempt,
|
public RMAppAttemptState transition(RMAppAttemptImpl appAttempt,
|
||||||
RMAppAttemptEvent event) {
|
RMAppAttemptEvent event) {
|
||||||
|
RMApp rmApp = appAttempt.rmContext.getRMApps().get(
|
||||||
|
appAttempt.getAppAttemptId().getApplicationId());
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If last attempt recovered final state is null .. it means attempt was
|
* If last attempt recovered final state is null .. it means attempt was
|
||||||
* started but AM container may or may not have started / finished.
|
* started but AM container may or may not have started / finished.
|
||||||
|
@ -1046,8 +1050,6 @@ public class RMAppAttemptImpl implements RMAppAttempt, Recoverable {
|
||||||
*/
|
*/
|
||||||
if (appAttempt.recoveredFinalState != null) {
|
if (appAttempt.recoveredFinalState != null) {
|
||||||
appAttempt.progress = 1.0f;
|
appAttempt.progress = 1.0f;
|
||||||
RMApp rmApp =appAttempt.rmContext.getRMApps().get(
|
|
||||||
appAttempt.getAppAttemptId().getApplicationId());
|
|
||||||
// We will replay the final attempt only if last attempt is in final
|
// We will replay the final attempt only if last attempt is in final
|
||||||
// state but application is not in final state.
|
// state but application is not in final state.
|
||||||
if (rmApp.getCurrentAppAttempt() == appAttempt
|
if (rmApp.getCurrentAppAttempt() == appAttempt
|
||||||
|
@ -1060,7 +1062,24 @@ public class RMAppAttemptImpl implements RMAppAttempt, Recoverable {
|
||||||
appAttempt, event);
|
appAttempt, event);
|
||||||
}
|
}
|
||||||
return appAttempt.recoveredFinalState;
|
return appAttempt.recoveredFinalState;
|
||||||
} else {
|
} else if (RMAppImpl.isAppInFinalState(rmApp)) {
|
||||||
|
// The attempt's final state was not saved, but the app's final state was saved.
|
||||||
|
// Skip adding the attempt to the scheduler.
|
||||||
|
RMAppState appState = ((RMAppImpl) rmApp).getRecoveredFinalState();
|
||||||
|
LOG.warn(rmApp.getApplicationId() + " final state (" + appState
|
||||||
|
+ ") was recorded, but " + appAttempt.applicationAttemptId
|
||||||
|
+ " final state (" + appAttempt.recoveredFinalState
|
||||||
|
+ ") was not recorded.");
|
||||||
|
switch (appState) {
|
||||||
|
case FINISHED:
|
||||||
|
return RMAppAttemptState.FINISHED;
|
||||||
|
case FAILED:
|
||||||
|
return RMAppAttemptState.FAILED;
|
||||||
|
case KILLED:
|
||||||
|
return RMAppAttemptState.KILLED;
|
||||||
|
}
|
||||||
|
return RMAppAttemptState.FAILED;
|
||||||
|
} else{
|
||||||
// Add the current attempt to the scheduler.
|
// Add the current attempt to the scheduler.
|
||||||
if (appAttempt.rmContext.isWorkPreservingRecoveryEnabled()) {
|
if (appAttempt.rmContext.isWorkPreservingRecoveryEnabled()) {
|
||||||
// Need to register an app attempt before AM can register
|
// Need to register an app attempt before AM can register
|
||||||
|
@ -1094,6 +1113,7 @@ public class RMAppAttemptImpl implements RMAppAttempt, Recoverable {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void rememberTargetTransitions(RMAppAttemptEvent event,
|
private void rememberTargetTransitions(RMAppAttemptEvent event,
|
||||||
Object transitionToDo, RMAppAttemptState targetFinalState) {
|
Object transitionToDo, RMAppAttemptState targetFinalState) {
|
||||||
transitionTodo = transitionToDo;
|
transitionTodo = transitionToDo;
|
||||||
|
|
|
@ -60,6 +60,7 @@ import org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.TestRMRestart.TestSecurityMockRM;
|
import org.apache.hadoop.yarn.server.resourcemanager.TestRMRestart.TestSecurityMockRM;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore;
|
import org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore.RMState;
|
import org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore.RMState;
|
||||||
|
import org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationAttemptStateData;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData;
|
import org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
|
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppState;
|
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppState;
|
||||||
|
@ -1160,4 +1161,45 @@ public class TestWorkPreservingRMRestart extends ParameterizedSchedulerTestBase
|
||||||
nm1.setResourceTrackerService(rm2.getResourceTrackerService());
|
nm1.setResourceTrackerService(rm2.getResourceTrackerService());
|
||||||
rm2.start();
|
rm2.start();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Test that if the application state was saved but the attempt state was not,
|
||||||
|
// RM should start correctly.
|
||||||
|
@Test (timeout = 20000)
|
||||||
|
public void testAppStateSavedButAttemptStateNotSaved() throws Exception {
|
||||||
|
MemoryRMStateStore memStore = new MemoryRMStateStore() {
|
||||||
|
@Override public synchronized void updateApplicationAttemptStateInternal(
|
||||||
|
ApplicationAttemptId appAttemptId,
|
||||||
|
ApplicationAttemptStateData attemptState) {
|
||||||
|
// do nothing;
|
||||||
|
// Simulate a failure in which the attempt's final state is not saved.
|
||||||
|
}
|
||||||
|
};
|
||||||
|
memStore.init(conf);
|
||||||
|
rm1 = new MockRM(conf, memStore);
|
||||||
|
rm1.start();
|
||||||
|
|
||||||
|
MockNM nm1 = new MockNM("127.0.0.1:1234", 15120, rm1.getResourceTrackerService());
|
||||||
|
nm1.registerNode();
|
||||||
|
|
||||||
|
RMApp app1 = rm1.submitApp(200);
|
||||||
|
MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
|
||||||
|
MockRM.finishAMAndVerifyAppState(app1, rm1, nm1, am1);
|
||||||
|
|
||||||
|
ApplicationStateData appSavedState =
|
||||||
|
memStore.getState().getApplicationState().get(app1.getApplicationId());
|
||||||
|
|
||||||
|
// check that app state is saved.
|
||||||
|
assertEquals(RMAppState.FINISHED, appSavedState.getState());
|
||||||
|
// check that attempt state is not saved.
|
||||||
|
assertNull(appSavedState.getAttempt(am1.getApplicationAttemptId()).getState());
|
||||||
|
|
||||||
|
rm2 = new MockRM(conf, memStore);
|
||||||
|
rm2.start();
|
||||||
|
RMApp recoveredApp1 =
|
||||||
|
rm2.getRMContext().getRMApps().get(app1.getApplicationId());
|
||||||
|
|
||||||
|
assertEquals(RMAppState.FINISHED, recoveredApp1.getState());
|
||||||
|
// check that attempt state is recovered correctly.
|
||||||
|
assertEquals(RMAppAttemptState.FINISHED, recoveredApp1.getCurrentAppAttempt().getState());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue