From 3efcd51c3b3eb667d83e08b500bb7a7ea559fabe Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Tue, 22 Aug 2017 12:56:09 -0500 Subject: [PATCH] YARN-2416. InvalidStateTransitonException in ResourceManager if AMLauncher does not receive response for startContainers() call in time. Contributed by Jonathan Eagles --- .../rmapp/attempt/RMAppAttemptImpl.java | 25 ++++++++++++--- .../attempt/TestRMAppAttemptTransitions.java | 32 ++++++++++++------- 2 files changed, 41 insertions(+), 16 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java index 7d453bdfb33..d748860d874 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java @@ -184,7 +184,10 @@ public class RMAppAttemptImpl implements RMAppAttempt, Recoverable { new ExpiredTransition(); private static final AttemptFailedTransition FAILED_TRANSITION = new AttemptFailedTransition(); - + private static final AMRegisteredTransition REGISTERED_TRANSITION = + new AMRegisteredTransition(); + private static final AMLaunchedTransition LAUNCHED_TRANSITION = + new AMLaunchedTransition(); private RMAppAttemptEvent eventCausingFinalSaving; private RMAppAttemptState targetedFinalState; private RMAppAttemptState recoveredFinalState; @@ -314,7 +317,7 @@ public class RMAppAttemptImpl implements RMAppAttempt, Recoverable { // Transitions from ALLOCATED State .addTransition(RMAppAttemptState.ALLOCATED, RMAppAttemptState.LAUNCHED, - RMAppAttemptEventType.LAUNCHED, new AMLaunchedTransition()) + RMAppAttemptEventType.LAUNCHED, LAUNCHED_TRANSITION) .addTransition(RMAppAttemptState.ALLOCATED, RMAppAttemptState.FINAL_SAVING, RMAppAttemptEventType.LAUNCH_FAILED, new FinalSavingTransition(new LaunchFailedTransition(), @@ -328,6 +331,8 @@ public class RMAppAttemptImpl implements RMAppAttempt, Recoverable { RMAppAttemptEventType.FAIL, new FinalSavingTransition(FAILED_TRANSITION, RMAppAttemptState.FAILED)) + .addTransition(RMAppAttemptState.ALLOCATED, RMAppAttemptState.RUNNING, + RMAppAttemptEventType.REGISTERED, REGISTERED_TRANSITION) .addTransition(RMAppAttemptState.ALLOCATED, RMAppAttemptState.FINAL_SAVING, RMAppAttemptEventType.CONTAINER_FINISHED, new FinalSavingTransition( @@ -335,7 +340,7 @@ public class RMAppAttemptImpl implements RMAppAttempt, Recoverable { // Transitions from LAUNCHED State .addTransition(RMAppAttemptState.LAUNCHED, RMAppAttemptState.RUNNING, - RMAppAttemptEventType.REGISTERED, new AMRegisteredTransition()) + RMAppAttemptEventType.REGISTERED, REGISTERED_TRANSITION) .addTransition(RMAppAttemptState.LAUNCHED, EnumSet.of(RMAppAttemptState.LAUNCHED, RMAppAttemptState.FINAL_SAVING), RMAppAttemptEventType.CONTAINER_FINISHED, @@ -357,6 +362,8 @@ public class RMAppAttemptImpl implements RMAppAttempt, Recoverable { RMAppAttemptState.FAILED)) // Transitions from RUNNING State + .addTransition(RMAppAttemptState.RUNNING, RMAppAttemptState.RUNNING, + RMAppAttemptEventType.LAUNCHED) .addTransition(RMAppAttemptState.RUNNING, RMAppAttemptState.FINAL_SAVING, RMAppAttemptEventType.UNREGISTERED, new AMUnregisteredTransition()) .addTransition(RMAppAttemptState.RUNNING, RMAppAttemptState.RUNNING, @@ -421,6 +428,7 @@ public class RMAppAttemptImpl implements RMAppAttempt, Recoverable { RMAppAttemptState.FAILED, RMAppAttemptState.FAILED, EnumSet.of( + RMAppAttemptEventType.LAUNCHED, RMAppAttemptEventType.EXPIRE, RMAppAttemptEventType.KILL, RMAppAttemptEventType.FAIL, @@ -438,6 +446,7 @@ public class RMAppAttemptImpl implements RMAppAttempt, Recoverable { new FinalTransition(RMAppAttemptState.FINISHED)) .addTransition(RMAppAttemptState.FINISHING, RMAppAttemptState.FINISHING, EnumSet.of( + RMAppAttemptEventType.LAUNCHED, RMAppAttemptEventType.UNREGISTERED, RMAppAttemptEventType.STATUS_UPDATE, RMAppAttemptEventType.CONTAINER_ALLOCATED, @@ -451,6 +460,7 @@ public class RMAppAttemptImpl implements RMAppAttempt, Recoverable { RMAppAttemptState.FINISHED, RMAppAttemptState.FINISHED, EnumSet.of( + RMAppAttemptEventType.LAUNCHED, RMAppAttemptEventType.EXPIRE, RMAppAttemptEventType.UNREGISTERED, RMAppAttemptEventType.CONTAINER_ALLOCATED, @@ -1291,7 +1301,7 @@ public class RMAppAttemptImpl implements RMAppAttempt, Recoverable { * 2) OR AMLivelinessMonitor expires this attempt (when am doesn't * heart beat back). */ - (new AMLaunchedTransition()).transition(appAttempt, event); + LAUNCHED_TRANSITION.transition(appAttempt, event); return RMAppAttemptState.LAUNCHED; } } @@ -1516,7 +1526,8 @@ public class RMAppAttemptImpl implements RMAppAttempt, Recoverable { @Override public void transition(RMAppAttemptImpl appAttempt, RMAppAttemptEvent event) { - if (event.getType() == RMAppAttemptEventType.LAUNCHED) { + if (event.getType() == RMAppAttemptEventType.LAUNCHED + || event.getType() == RMAppAttemptEventType.REGISTERED) { appAttempt.launchAMEndTime = System.currentTimeMillis(); long delay = appAttempt.launchAMEndTime - appAttempt.launchAMStartTime; @@ -1651,6 +1662,10 @@ public class RMAppAttemptImpl implements RMAppAttempt, Recoverable { @Override public void transition(RMAppAttemptImpl appAttempt, RMAppAttemptEvent event) { + if (!RMAppAttemptState.LAUNCHED.equals(appAttempt.getState())) { + // registered received before launch + LAUNCHED_TRANSITION.transition(appAttempt, event); + } long delay = System.currentTimeMillis() - appAttempt.launchAMEndTime; ClusterMetrics.getMetrics().addAMRegisterDelay(delay); RMAppAttemptRegistrationEvent registrationEvent diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/TestRMAppAttemptTransitions.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/TestRMAppAttemptTransitions.java index 7702ab194b2..f6406ff3abd 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/TestRMAppAttemptTransitions.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/TestRMAppAttemptTransitions.java @@ -526,12 +526,9 @@ public class TestRMAppAttemptTransitions { verifyApplicationAttemptFinished(RMAppAttemptState.FAILED); } - /** - * {@link RMAppAttemptState#LAUNCHED} - */ - private void testAppAttemptLaunchedState(Container container) { - assertEquals(RMAppAttemptState.LAUNCHED, - applicationAttempt.getAppAttemptState()); + private void testAppAttemptLaunchedState(Container container, + RMAppAttemptState state) { + assertEquals(state, applicationAttempt.getAppAttemptState()); assertEquals(container, applicationAttempt.getMasterContainer()); if (UserGroupInformation.isSecurityEnabled()) { // ClientTokenMasterKey has been registered in SecretManager, it's able to @@ -686,13 +683,18 @@ public class TestRMAppAttemptTransitions { } private void launchApplicationAttempt(Container container) { + launchApplicationAttempt(container, RMAppAttemptState.LAUNCHED); + } + + private void launchApplicationAttempt(Container container, + RMAppAttemptState state) { applicationAttempt.handle( - new RMAppAttemptEvent(applicationAttempt.getAppAttemptId(), + new RMAppAttemptEvent(applicationAttempt.getAppAttemptId(), RMAppAttemptEventType.LAUNCHED)); - testAppAttemptLaunchedState(container); + testAppAttemptLaunchedState(container, state); } - + private void runApplicationAttempt(Container container, String host, int rpcPort, @@ -723,7 +725,7 @@ public class TestRMAppAttemptTransitions { when(submissionContext.getUnmanagedAM()).thenReturn(true); // submit AM and check it goes to LAUNCHED state scheduleApplicationAttempt(); - testAppAttemptLaunchedState(null); + testAppAttemptLaunchedState(null, RMAppAttemptState.LAUNCHED); verify(amLivelinessMonitor, times(1)).register( applicationAttempt.getAppAttemptId()); @@ -930,7 +932,15 @@ public class TestRMAppAttemptTransitions { applicationAttempt.createApplicationAttemptState()); testAppAttemptFailedState(amContainer, diagnostics); } - + + @Test(timeout = 10000) + public void testAllocatedToRunning() { + Container amContainer = allocateApplicationAttempt(); + // Register attempt event arrives before launched attempt event + runApplicationAttempt(amContainer, "host", 8042, "oldtrackingurl", false); + launchApplicationAttempt(amContainer, RMAppAttemptState.RUNNING); + } + @Test(timeout = 10000) public void testCreateAppAttemptReport() { RMAppAttemptState[] attemptStates = RMAppAttemptState.values();