From 3dd299a77083155a75e2c2fe2290c983aea5d5d1 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Thu, 9 Aug 2018 10:17:34 -0500 Subject: [PATCH] YARN-8331. Race condition in NM container launched after done. Contributed by Pradeep Ambati (cherry picked from commit cd04e954d2db27f0a15b7d1c492b7cdb656a51db) --- .../container/ContainerImpl.java | 13 +++++- .../launcher/ContainerLaunch.java | 12 ++--- .../launcher/ContainersLauncher.java | 14 +++++- .../container/TestContainer.java | 46 +++++++++++++++++-- 4 files changed, 71 insertions(+), 14 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java index a51e33dccfb..24c833ed05d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java @@ -384,7 +384,7 @@ public class ContainerImpl implements Container { UPDATE_DIAGNOSTICS_TRANSITION) .addTransition(ContainerState.SCHEDULED, ContainerState.KILLING, ContainerEventType.KILL_CONTAINER, - new KillBeforeRunningTransition()) + new KillTransition()) .addTransition(ContainerState.SCHEDULED, ContainerState.SCHEDULED, ContainerEventType.UPDATE_CONTAINER_TOKEN, new NotifyContainerSchedulerOfUpdateTransition()) @@ -618,6 +618,9 @@ public class ContainerImpl implements Container { .addTransition(ContainerState.EXITED_WITH_SUCCESS, ContainerState.EXITED_WITH_SUCCESS, ContainerEventType.UPDATE_CONTAINER_TOKEN) + .addTransition(ContainerState.EXITED_WITH_SUCCESS, + ContainerState.EXITED_WITH_SUCCESS, + ContainerEventType.CONTAINER_KILLED_ON_REQUEST) // From EXITED_WITH_FAILURE State .addTransition(ContainerState.EXITED_WITH_FAILURE, ContainerState.DONE, @@ -635,6 +638,9 @@ public class ContainerImpl implements Container { .addTransition(ContainerState.EXITED_WITH_FAILURE, ContainerState.EXITED_WITH_FAILURE, ContainerEventType.UPDATE_CONTAINER_TOKEN) + .addTransition(ContainerState.EXITED_WITH_FAILURE, + ContainerState.EXITED_WITH_FAILURE, + ContainerEventType.CONTAINER_KILLED_ON_REQUEST) // From KILLING State. .addTransition(ContainerState.KILLING, @@ -694,6 +700,9 @@ public class ContainerImpl implements Container { .addTransition(ContainerState.CONTAINER_CLEANEDUP_AFTER_KILL, ContainerState.CONTAINER_CLEANEDUP_AFTER_KILL, ContainerEventType.UPDATE_CONTAINER_TOKEN) + .addTransition(ContainerState.CONTAINER_CLEANEDUP_AFTER_KILL, + ContainerState.CONTAINER_CLEANEDUP_AFTER_KILL, + ContainerEventType.CONTAINER_KILLED_ON_REQUEST) // From DONE .addTransition(ContainerState.DONE, ContainerState.DONE, @@ -714,6 +723,8 @@ public class ContainerImpl implements Container { // No transition - assuming container is on its way to completion .addTransition(ContainerState.DONE, ContainerState.DONE, ContainerEventType.UPDATE_CONTAINER_TOKEN) + .addTransition(ContainerState.DONE, ContainerState.DONE, + ContainerEventType.CONTAINER_KILLED_ON_REQUEST) // create the topology tables .installTopology(); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java index 550863a20ad..eb25ff2e1b6 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java @@ -563,14 +563,10 @@ public class ContainerLaunch implements Callable { || exitCode == ExitCode.TERMINATED.getExitCode()) { // If the process was killed, Send container_cleanedup_after_kill and // just break out of this method. - - // If Container was killed before starting... NO need to do this. - if (!killedBeforeStart) { - dispatcher.getEventHandler().handle( - new ContainerExitEvent(containerId, - ContainerEventType.CONTAINER_KILLED_ON_REQUEST, exitCode, - diagnosticInfo.toString())); - } + dispatcher.getEventHandler().handle( + new ContainerExitEvent(containerId, + ContainerEventType.CONTAINER_KILLED_ON_REQUEST, exitCode, + diagnosticInfo.toString())); } else if (exitCode != 0) { handleContainerExitWithFailure(containerId, exitCode, containerLogDir, diagnosticInfo); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainersLauncher.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainersLauncher.java index cfd5d6a95f3..7870f86471f 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainersLauncher.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainersLauncher.java @@ -23,6 +23,11 @@ import java.util.Collections; import java.util.HashMap; import java.util.Map; import java.util.concurrent.ExecutorService; + +import org.apache.hadoop.util.Shell; +import org.apache.hadoop.yarn.api.records.ContainerExitStatus; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEventType; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerExitEvent; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -151,7 +156,14 @@ public class ContainersLauncher extends AbstractService case CLEANUP_CONTAINER_FOR_REINIT: ContainerLaunch launcher = running.remove(containerId); if (launcher == null) { - // Container not launched. So nothing needs to be done. + // Container not launched. + // triggering KILLING to CONTAINER_CLEANEDUP_AFTER_KILL transition. + dispatcher.getEventHandler().handle( + new ContainerExitEvent(containerId, + ContainerEventType.CONTAINER_KILLED_ON_REQUEST, + Shell.WINDOWS ? ContainerExecutor.ExitCode.FORCE_KILLED.getExitCode() : + ContainerExecutor.ExitCode.TERMINATED.getExitCode(), + "Container terminated before launch.")); return; } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java index 1a263eea197..daccfceab98 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java @@ -23,6 +23,7 @@ import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; import static org.mockito.Matchers.any; import static org.mockito.Matchers.argThat; +import static org.mockito.Matchers.refEq; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.reset; import static org.mockito.Mockito.times; @@ -665,6 +666,17 @@ public class TestContainer { ContainerLaunch launcher = wc.launcher.running.get(wc.c.getContainerId()); wc.killContainer(); assertEquals(ContainerState.KILLING, wc.c.getContainerState()); + + // check that container cleanup hasn't started at this point. + LocalizationCleanupMatcher cleanupResources = + new LocalizationCleanupMatcher(wc.c); + verify(wc.localizerBus, times(0)).handle(argThat(cleanupResources)); + + // check if containerlauncher cleans up the container launch. + verify(wc.launcherBus) + .handle(refEq(new ContainersLauncherEvent(wc.c, + ContainersLauncherEventType.CLEANUP_CONTAINER), "timestamp")); + launcher.call(); wc.drainDispatcherEvents(); assertEquals(ContainerState.CONTAINER_CLEANEDUP_AFTER_KILL, @@ -677,6 +689,7 @@ public class TestContainer { assertEquals(ContainerState.DONE, wc.c.getContainerState()); assertEquals(killed + 1, metrics.getKilledContainers()); assertEquals(0, metrics.getRunningContainers()); + assertEquals(0, wc.launcher.running.size()); } finally { if (wc != null) { wc.finished(); @@ -1146,7 +1159,7 @@ public class TestContainer { ResourcesReleasedMatcher matchesReq = new ResourcesReleasedMatcher(wc.localResources, EnumSet.of( LocalResourceVisibility.PUBLIC, LocalResourceVisibility.PRIVATE, - LocalResourceVisibility.APPLICATION)); + LocalResourceVisibility.APPLICATION), wc.c); verify(wc.localizerBus, atLeastOnce()).handle(argThat(matchesReq)); } @@ -1162,13 +1175,35 @@ public class TestContainer { wc.c.getContainerId().toString()))); } - private static class ResourcesReleasedMatcher extends + // Argument matcher for matching container localization cleanup event. + private static class LocalizationCleanupMatcher extends ArgumentMatcher { + Container c; + + LocalizationCleanupMatcher(Container c){ + this.c = c; + } + + @Override + public boolean matches(Object o) { + if (!(o instanceof ContainerLocalizationCleanupEvent)) { + return false; + } + ContainerLocalizationCleanupEvent evt = + (ContainerLocalizationCleanupEvent) o; + + return (evt.getContainer() == c); + } + } + + private static class ResourcesReleasedMatcher extends + LocalizationCleanupMatcher { final HashSet resources = new HashSet(); ResourcesReleasedMatcher(Map allResources, - EnumSet vis) throws URISyntaxException { + EnumSet vis, Container c) throws URISyntaxException { + super(c); for (Entry e : allResources.entrySet()) { if (vis.contains(e.getValue().getVisibility())) { resources.add(new LocalResourceRequest(e.getValue())); @@ -1178,9 +1213,12 @@ public class TestContainer { @Override public boolean matches(Object o) { - if (!(o instanceof ContainerLocalizationCleanupEvent)) { + // match event type and container. + if(!super.matches(o)){ return false; } + + // match resources. ContainerLocalizationCleanupEvent evt = (ContainerLocalizationCleanupEvent) o; final HashSet expected =