From 5d86ce7b0d416ba18a0be3a07195de4dc28b5bc5 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Thu, 16 Mar 2017 09:31:20 -0500 Subject: [PATCH] YARN-4051. ContainerKillEvent lost when container is still recovering and application finishes. Contributed by sandflee (cherry picked from commit a16ba4296e163d5cb4caed129f2f1612a69a8d84) Conflicts: hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/Container.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java --- .../ContainerManagerImpl.java | 53 ++++++++++++++++--- .../application/ApplicationImpl.java | 4 +- .../containermanager/container/Container.java | 1 + .../container/ContainerImpl.java | 8 +++ .../TestContainerManagerRecovery.java | 18 ++++--- .../nodemanager/webapp/MockContainer.java | 5 ++ 6 files changed, 73 insertions(+), 16 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java index 57272fbcf32..cca73acc504 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java @@ -340,15 +340,15 @@ public class ContainerManagerImpl extends CompositeService implements LOG.info("Recovering " + containerId + " in state " + rcs.getStatus() + " with exit code " + rcs.getExitCode()); - if (context.getApplications().containsKey(appId)) { + Application app = context.getApplications().get(appId); + if (app != null) { Credentials credentials = YarnServerSecurityUtils.parseCredentials(launchContext); Container container = new ContainerImpl(getConfig(), dispatcher, req.getContainerLaunchContext(), credentials, metrics, token, context, rcs); context.getContainers().put(containerId, container); - dispatcher.getEventHandler().handle( - new ApplicationContainerInitEvent(container)); + app.handle(new ApplicationContainerInitEvent(container)); } else { if (rcs.getStatus() != RecoveredContainerStatus.COMPLETED) { LOG.warn(containerId + " has no corresponding application!"); @@ -1168,6 +1168,10 @@ public class ContainerManagerImpl extends CompositeService implements + " is not handled by this NodeManager"); } } else { + if (container.isRecovering()) { + throw new NMNotYetReadyException("Container " + containerIDStr + + " is recovering, try later"); + } context.getNMStateStore().storeContainerKilled(containerID); dispatcher.getEventHandler().handle( new ContainerKillEvent(containerID, @@ -1313,6 +1317,21 @@ public class ContainerManagerImpl extends CompositeService implements + " FINISH_APPS event"); continue; } + + boolean shouldDropEvent = false; + for (Container container : app.getContainers().values()) { + if (container.isRecovering()) { + LOG.info("drop FINISH_APPS event to " + appID + " because " + + "container " + container.getContainerId() + + " is recovering"); + shouldDropEvent = true; + break; + } + } + if (shouldDropEvent) { + continue; + } + String diagnostic = ""; if (appsFinishedEvent.getReason() == CMgrCompletedAppsEvent.Reason.ON_SHUTDOWN) { diagnostic = "Application killed on shutdown"; @@ -1327,10 +1346,32 @@ public class ContainerManagerImpl extends CompositeService implements case FINISH_CONTAINERS: CMgrCompletedContainersEvent containersFinishedEvent = (CMgrCompletedContainersEvent) event; - for (ContainerId container : containersFinishedEvent + for (ContainerId containerId : containersFinishedEvent .getContainersToCleanup()) { - this.dispatcher.getEventHandler().handle( - new ContainerKillEvent(container, + ApplicationId appId = + containerId.getApplicationAttemptId().getApplicationId(); + Application app = this.context.getApplications().get(appId); + if (app == null) { + LOG.warn("couldn't find app " + appId + " while processing" + + " FINISH_CONTAINERS event"); + continue; + } + + Container container = app.getContainers().get(containerId); + if (container == null) { + LOG.warn("couldn't find container " + containerId + + " while processing FINISH_CONTAINERS event"); + continue; + } + + if (container.isRecovering()) { + LOG.info("drop FINISH_CONTAINERS event to " + containerId + + " because container is recovering"); + continue; + } + + this.dispatcher.getEventHandler().handle( + new ContainerKillEvent(containerId, ContainerExitStatus.KILLED_BY_RESOURCEMANAGER, "Container Killed by ResourceManager")); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/application/ApplicationImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/application/ApplicationImpl.java index efa258a467c..d614fc6c760 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/application/ApplicationImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/application/ApplicationImpl.java @@ -20,8 +20,8 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.application; import java.io.IOException; import java.util.EnumSet; -import java.util.HashMap; import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.locks.ReentrantReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock.ReadLock; import java.util.concurrent.locks.ReentrantReadWriteLock.WriteLock; @@ -77,7 +77,7 @@ public class ApplicationImpl implements Application { private LogAggregationContext logAggregationContext; Map containers = - new HashMap(); + new ConcurrentHashMap<>(); public ApplicationImpl(Dispatcher dispatcher, String user, ApplicationId appId, Credentials credentials, Context context) { diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/Container.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/Container.java index 1d2ec5687b8..8c63c9548c6 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/Container.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/Container.java @@ -57,4 +57,5 @@ public interface Container extends EventHandler { String toString(); + boolean isRecovering(); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java index 052fe8d7bc4..9d1d3c77d7a 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java @@ -1188,4 +1188,12 @@ public class ContainerImpl implements Container { LocalResourceRequest resource) { return container.resourcesUploadPolicies.get(resource); } + + @Override + public boolean isRecovering() { + boolean isRecovering = ( + recoveredStatus != RecoveredContainerStatus.REQUESTED && + getContainerState() == ContainerState.NEW); + return isRecovering; + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java index 6cfa9141ac6..ee863c7c373 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java @@ -87,6 +87,7 @@ import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationEvent; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationEventType; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationFinishEvent; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationImpl; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationState; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; @@ -246,8 +247,8 @@ public class TestContainerManagerRecovery extends BaseContainerManagerTest { // simulate application completion List finishedApps = new ArrayList(); finishedApps.add(appId); - cm.handle(new CMgrCompletedAppsEvent(finishedApps, - CMgrCompletedAppsEvent.Reason.BY_RESOURCEMANAGER)); + app.handle(new ApplicationFinishEvent( + appId, "Application killed by ResourceManager")); waitForAppState(app, ApplicationState.APPLICATION_RESOURCES_CLEANINGUP); // restart and verify app is marked for finishing @@ -261,8 +262,8 @@ public class TestContainerManagerRecovery extends BaseContainerManagerTest { assertNotNull(app); // no longer saving FINISH_APP event in NM stateStore, // simulate by resending FINISH_APP event - cm.handle(new CMgrCompletedAppsEvent(finishedApps, - CMgrCompletedAppsEvent.Reason.BY_RESOURCEMANAGER)); + app.handle(new ApplicationFinishEvent( + appId, "Application killed by ResourceManager")); waitForAppState(app, ApplicationState.APPLICATION_RESOURCES_CLEANINGUP); assertTrue(context.getApplicationACLsManager().checkAccess( UserGroupInformation.createRemoteUser(modUser), @@ -333,8 +334,8 @@ public class TestContainerManagerRecovery extends BaseContainerManagerTest { // simulate application completion List finishedApps = new ArrayList(); finishedApps.add(appId); - cm.handle(new CMgrCompletedAppsEvent(finishedApps, - CMgrCompletedAppsEvent.Reason.BY_RESOURCEMANAGER)); + app.handle(new ApplicationFinishEvent( + appId, "Application killed by ResourceManager")); waitForAppState(app, ApplicationState.APPLICATION_RESOURCES_CLEANINGUP); app.handle(new ApplicationEvent(app.getAppId(), @@ -355,8 +356,9 @@ public class TestContainerManagerRecovery extends BaseContainerManagerTest { // no longer saving FINISH_APP event in NM stateStore, // simulate by resending FINISH_APP event - cm.handle(new CMgrCompletedAppsEvent(finishedApps, - CMgrCompletedAppsEvent.Reason.BY_RESOURCEMANAGER)); + app.handle(new ApplicationFinishEvent( + appId, "Application killed by ResourceManager")); + waitForAppState(app, ApplicationState.APPLICATION_RESOURCES_CLEANINGUP); // TODO need to figure out why additional APPLICATION_RESOURCES_CLEANEDUP // is needed. diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java index 700d2e7ccc2..716fabb366e 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java @@ -144,4 +144,9 @@ public class MockContainer implements Container { public NMContainerStatus getNMContainerStatus() { return null; } + + @Override + public boolean isRecovering() { + return false; + } }