YARN-2459. RM crashes if App gets rejected for any reason and HA is enabled. Contributed by Jian He

This commit is contained in:
XUAN 2014-09-10 11:51:57 -07:00
parent b4b59ef749
commit b91ef0c502
5 changed files with 77 additions and 4 deletions

View File

@ -285,6 +285,9 @@ Release 2.6.0 - UNRELEASED
YARN-1458. FairScheduler: Zero weight can lead to livelock.
(Zhihai Xu via kasha)
YARN-2459. RM crashes if App gets rejected for any reason
and HA is enabled. (Jian He via xgong)
Release 2.5.1 - UNRELEASED
INCOMPATIBLE CHANGES

View File

@ -401,7 +401,7 @@ public class RMAppManager implements EventHandler<RMAppManagerEvent>,
}
}
private Credentials parseCredentials(ApplicationSubmissionContext application)
protected Credentials parseCredentials(ApplicationSubmissionContext application)
throws IOException {
Credentials credentials = new Credentials();
DataInputByteBuffer dibb = new DataInputByteBuffer();

View File

@ -150,8 +150,10 @@ public class RMAppImpl implements RMApp, Recoverable {
RMAppEventType.RECOVER, new RMAppRecoveredTransition())
.addTransition(RMAppState.NEW, RMAppState.KILLED, RMAppEventType.KILL,
new AppKilledTransition())
.addTransition(RMAppState.NEW, RMAppState.FAILED,
RMAppEventType.APP_REJECTED, new AppRejectedTransition())
.addTransition(RMAppState.NEW, RMAppState.FINAL_SAVING,
RMAppEventType.APP_REJECTED,
new FinalSavingTransition(new AppRejectedTransition(),
RMAppState.FAILED))
// Transitions from NEW_SAVING state
.addTransition(RMAppState.NEW_SAVING, RMAppState.NEW_SAVING,

View File

@ -65,6 +65,7 @@ import org.apache.hadoop.yarn.api.records.ApplicationAccessType;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ApplicationReport;
import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext;
import org.apache.hadoop.yarn.api.records.Container;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerState;
@ -92,6 +93,8 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppState;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.YarnScheduler;
import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
import org.apache.hadoop.yarn.server.utils.BuilderUtils;
import org.apache.hadoop.yarn.util.ConverterUtils;
import org.apache.log4j.Level;
@ -1606,6 +1609,53 @@ public class TestRMRestart {
Assert.assertEquals(2, ((TestMemoryRMStateStore) memStore).updateApp);
}
// Test Application that fails on submission is saved in state store.
@Test (timeout = 20000)
public void testAppFailedOnSubmissionSavedInStateStore() throws Exception {
conf.set(CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTHENTICATION,
"kerberos");
UserGroupInformation.setConfiguration(conf);
MemoryRMStateStore memStore = new MemoryRMStateStore();
memStore.init(conf);
MockRM rm1 = new TestSecurityMockRM(conf, memStore) {
@Override
protected RMAppManager createRMAppManager() {
return new TestRMAppManager(this.rmContext, this.scheduler,
this.masterService, this.applicationACLsManager, conf);
}
class TestRMAppManager extends RMAppManager {
public TestRMAppManager(RMContext context, YarnScheduler scheduler,
ApplicationMasterService masterService,
ApplicationACLsManager applicationACLsManager, Configuration conf) {
super(context, scheduler, masterService, applicationACLsManager, conf);
}
@Override
protected Credentials parseCredentials(
ApplicationSubmissionContext application) throws IOException {
throw new IOException("Parsing credential error.");
}
}
};
rm1.start();
RMApp app1 =
rm1.submitApp(200, "name", "user",
new HashMap<ApplicationAccessType, String>(), false, "default", -1,
null, "MAPREDUCE", false);
rm1.waitForState(app1.getApplicationId(), RMAppState.FAILED);
// Check app staet is saved in state store.
Assert.assertEquals(RMAppState.FAILED, memStore.getState()
.getApplicationState().get(app1.getApplicationId()).getState());
MockRM rm2 = new TestSecurityMockRM(conf, memStore);
rm2.start();
// Restarted RM has the failed app info too.
rm2.waitForState(app1.getApplicationId(), RMAppState.FAILED);
}
@SuppressWarnings("resource")
@Test (timeout = 60000)
public void testQueueMetricsOnRMRestart() throws Exception {

View File

@ -526,10 +526,28 @@ public class TestRMAppTransitions {
rmDispatcher.await();
sendAppUpdateSavedEvent(application);
assertFailed(application, rejectedText);
assertAppFinalStateNotSaved(application);
assertAppFinalStateSaved(application);
verifyApplicationFinished(RMAppState.FAILED);
}
@Test (timeout = 30000)
public void testAppNewRejectAddToStore() throws IOException {
LOG.info("--- START: testAppNewRejectAddToStore ---");
RMApp application = createNewTestApp(null);
// NEW => FAILED event RMAppEventType.APP_REJECTED
String rejectedText = "Test Application Rejected";
RMAppEvent event =
new RMAppRejectedEvent(application.getApplicationId(), rejectedText);
application.handle(event);
rmDispatcher.await();
sendAppUpdateSavedEvent(application);
assertFailed(application, rejectedText);
assertAppFinalStateSaved(application);
verifyApplicationFinished(RMAppState.FAILED);
rmContext.getStateStore().removeApplication(application);
}
@Test (timeout = 30000)
public void testAppNewSavingKill() throws IOException {
LOG.info("--- START: testAppNewSavingKill ---");