From fe7cb2d84ac160c5fed00640d85e2c5c4c6d2412 Mon Sep 17 00:00:00 2001 From: Wangda Tan Date: Thu, 17 Jan 2019 14:20:10 -0800 Subject: [PATCH] YARN-9194. Invalid event: REGISTERED and LAUNCH_FAILED at FAILED, and NullPointerException happens in RM while shutdown a NM. (lujie via wangda) Change-Id: I4359f59a73a278a941f4bb9d106dd38c9cb471fe (cherry picked from commit 6d7eedfd28cc1712690db2f6ca8a281b0901ee28) --- .../rmapp/attempt/RMAppAttemptImpl.java | 14 +++- .../attempt/TestRMAppAttemptTransitions.java | 80 ++++++++++++++++++- 2 files changed, 90 insertions(+), 4 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java index 3ec9c498188..03039daae96 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java @@ -437,9 +437,11 @@ public class RMAppAttemptImpl implements RMAppAttempt, Recoverable { RMAppAttemptState.FAILED, EnumSet.of( RMAppAttemptEventType.LAUNCHED, + RMAppAttemptEventType.LAUNCH_FAILED, RMAppAttemptEventType.EXPIRE, RMAppAttemptEventType.KILL, RMAppAttemptEventType.FAIL, + RMAppAttemptEventType.REGISTERED, RMAppAttemptEventType.UNREGISTERED, RMAppAttemptEventType.STATUS_UPDATE, RMAppAttemptEventType.CONTAINER_ALLOCATED)) @@ -1203,10 +1205,16 @@ public class RMAppAttemptImpl implements RMAppAttempt, Recoverable { } // Set the masterContainer - 
appAttempt.setMasterContainer(amContainerAllocation.getContainers() - .get(0)); + Container amContainer = amContainerAllocation.getContainers().get(0); RMContainerImpl rmMasterContainer = (RMContainerImpl)appAttempt.scheduler - .getRMContainer(appAttempt.getMasterContainer().getId()); + .getRMContainer(amContainer.getId()); + // When an NM is removed, the scheduler cleans up its containers, so the + // RMContainer may already be gone. The following CONTAINER_FINISHED + // event handles the cleaned container, so just stay in SCHEDULED. + if (rmMasterContainer == null) { + return RMAppAttemptState.SCHEDULED; + } + appAttempt.setMasterContainer(amContainer); rmMasterContainer.setAMContainer(true); // The node set in NMTokenSecrentManager is used for marking whether the // NMToken has been issued for this node to the AM. diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/TestRMAppAttemptTransitions.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/TestRMAppAttemptTransitions.java index 4a5c6719691..faecdb46607 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/TestRMAppAttemptTransitions.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/TestRMAppAttemptTransitions.java @@ -986,7 +986,7 @@ public class TestRMAppAttemptTransitions { public void testAttemptAddedAtFinalSaving() { submitApplicationAttempt(); - // SUBNITED->FINAL_SAVING + // SUBMITTED->FINAL_SAVING applicationAttempt.handle(new RMAppAttemptEvent(applicationAttempt .getAppAttemptId(), RMAppAttemptEventType.KILL)); assertEquals(RMAppAttemptState.FINAL_SAVING, 
@@ -999,6 +999,56 @@ public class TestRMAppAttemptTransitions { applicationAttempt.getAppAttemptState()); } + @Test(timeout = 10000) + public void testAttemptRegisteredAtFailed() { + Container amContainer = allocateApplicationAttempt(); + launchApplicationAttempt(amContainer); + + //send CONTAINER_FINISHED event + NodeId anyNodeId = NodeId.newInstance("host", 1234); + applicationAttempt.handle(new RMAppAttemptContainerFinishedEvent( + applicationAttempt.getAppAttemptId(), BuilderUtils.newContainerStatus( + amContainer.getId(), ContainerState.COMPLETE, "", 0, + amContainer.getResource()), anyNodeId)); + assertEquals(RMAppAttemptState.FINAL_SAVING, + applicationAttempt.getAppAttemptState()); + + sendAttemptUpdateSavedEvent(applicationAttempt); + assertEquals(RMAppAttemptState.FAILED, + applicationAttempt.getAppAttemptState()); + + //send REGISTERED event + applicationAttempt.handle(new RMAppAttemptEvent(applicationAttempt + .getAppAttemptId(), RMAppAttemptEventType.REGISTERED)); + + assertEquals(RMAppAttemptState.FAILED, + applicationAttempt.getAppAttemptState()); + } + + @Test + public void testAttemptLaunchFailedAtFailed() { + Container amContainer = allocateApplicationAttempt(); + launchApplicationAttempt(amContainer); + //send CONTAINER_FINISHED event + NodeId anyNodeId = NodeId.newInstance("host", 1234); + applicationAttempt.handle(new RMAppAttemptContainerFinishedEvent( + applicationAttempt.getAppAttemptId(), BuilderUtils.newContainerStatus( + amContainer.getId(), ContainerState.COMPLETE, "", 0, + amContainer.getResource()), anyNodeId)); + assertEquals(RMAppAttemptState.FINAL_SAVING, + applicationAttempt.getAppAttemptState()); + sendAttemptUpdateSavedEvent(applicationAttempt); + assertEquals(RMAppAttemptState.FAILED, + applicationAttempt.getAppAttemptState()); + + //send LAUNCH_FAILED event + applicationAttempt.handle(new RMAppAttemptEvent(applicationAttempt + .getAppAttemptId(), RMAppAttemptEventType.LAUNCH_FAILED)); + + assertEquals(RMAppAttemptState.FAILED, 
+ applicationAttempt.getAppAttemptState()); + } + @Test public void testAMCrashAtAllocated() { Container amContainer = allocateApplicationAttempt(); @@ -1598,6 +1648,34 @@ public class TestRMAppAttemptTransitions { assertTrue(found); } + @Test + public void testContainerRemovedBeforeAllocate() { + scheduleApplicationAttempt(); + + // Mock the allocation of AM container + Container container = mock(Container.class); + Resource resource = BuilderUtils.newResource(2048, 1); + when(container.getId()).thenReturn( + BuilderUtils.newContainerId(applicationAttempt.getAppAttemptId(), 1)); + when(container.getResource()).thenReturn(resource); + Allocation allocation = mock(Allocation.class); + when(allocation.getContainers()). + thenReturn(Collections.singletonList(container)); + when(scheduler.allocate(any(ApplicationAttemptId.class), any(List.class), + any(List.class), any(List.class), any(List.class), any(List.class), + any(ContainerUpdates.class))). + thenReturn(allocation); + + //container removed, so return null + when(scheduler.getRMContainer(container.getId())). + thenReturn(null); + + applicationAttempt.handle( + new RMAppAttemptEvent(applicationAttempt.getAppAttemptId(), + RMAppAttemptEventType.CONTAINER_ALLOCATED)); + assertEquals(RMAppAttemptState.SCHEDULED, + applicationAttempt.getAppAttemptState()); + } @SuppressWarnings("deprecation") @Test