YARN-1070. Fixed race conditions in NodeManager during container-kill. Contributed by Zhijie Shen.
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1527827 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
84e4e4b143
commit
74d20250ff
|
@ -101,7 +101,11 @@ Release 2.1.2 - UNRELEASED
|
|||
YARN-1221. With Fair Scheduler, reserved MB reported in RM web UI increases
|
||||
indefinitely (Siqi Li via Sandy Ryza)
|
||||
|
||||
YARN-1247. test-container-executor has gotten out of sync with the changes to container-executor. (rvs via tucu)
|
||||
YARN-1247. test-container-executor has gotten out of sync with the changes to
|
||||
container-executor. (rvs via tucu)
|
||||
|
||||
YARN-1070. Fixed race conditions in NodeManager during container-kill.
|
||||
(Zhijie Shen via vinodkv)
|
||||
|
||||
Release 2.1.1-beta - 2013-09-23
|
||||
|
||||
|
|
|
@ -68,6 +68,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Cont
|
|||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEvent;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEventType;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerExitEvent;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.util.ProcessIdFileReader;
|
||||
|
@ -133,10 +134,22 @@ public class ContainerLaunch implements Callable<Integer> {
|
|||
final List<String> command = launchContext.getCommands();
|
||||
int ret = -1;
|
||||
|
||||
// CONTAINER_KILLED_ON_REQUEST should not be missed if the container
|
||||
// is already at KILLING
|
||||
if (container.getContainerState() == ContainerState.KILLING) {
|
||||
dispatcher.getEventHandler().handle(
|
||||
new ContainerExitEvent(containerID,
|
||||
ContainerEventType.CONTAINER_KILLED_ON_REQUEST,
|
||||
Shell.WINDOWS ? ExitCode.FORCE_KILLED.getExitCode() :
|
||||
ExitCode.TERMINATED.getExitCode(),
|
||||
"Container terminated before launch."));
|
||||
return 0;
|
||||
}
|
||||
|
||||
try {
|
||||
localResources = container.getLocalizedResources();
|
||||
if (localResources == null) {
|
||||
RPCUtil.getRemoteException(
|
||||
throw RPCUtil.getRemoteException(
|
||||
"Unable to get local resources when Container " + containerID +
|
||||
" is at " + container.getContainerState());
|
||||
}
|
||||
|
|
|
@ -75,20 +75,9 @@ public class ContainersLauncher extends AbstractService
|
|||
new ThreadFactoryBuilder()
|
||||
.setNameFormat("ContainersLauncher #%d")
|
||||
.build());
|
||||
private final Map<ContainerId,RunningContainer> running =
|
||||
Collections.synchronizedMap(new HashMap<ContainerId,RunningContainer>());
|
||||
|
||||
private static final class RunningContainer {
|
||||
public RunningContainer(Future<Integer> submit,
|
||||
ContainerLaunch launcher) {
|
||||
this.runningcontainer = submit;
|
||||
this.launcher = launcher;
|
||||
}
|
||||
|
||||
Future<Integer> runningcontainer;
|
||||
ContainerLaunch launcher;
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
public final Map<ContainerId, ContainerLaunch> running =
|
||||
Collections.synchronizedMap(new HashMap<ContainerId, ContainerLaunch>());
|
||||
|
||||
public ContainersLauncher(Context context, Dispatcher dispatcher,
|
||||
ContainerExecutor exec, LocalDirsHandlerService dirsHandler,
|
||||
|
@ -133,38 +122,20 @@ public class ContainersLauncher extends AbstractService
|
|||
ContainerLaunch launch =
|
||||
new ContainerLaunch(context, getConfig(), dispatcher, exec, app,
|
||||
event.getContainer(), dirsHandler, containerManager);
|
||||
running.put(containerId,
|
||||
new RunningContainer(containerLauncher.submit(launch),
|
||||
launch));
|
||||
containerLauncher.submit(launch);
|
||||
running.put(containerId, launch);
|
||||
break;
|
||||
case CLEANUP_CONTAINER:
|
||||
RunningContainer rContainerDatum = running.remove(containerId);
|
||||
if (rContainerDatum == null) {
|
||||
ContainerLaunch launcher = running.remove(containerId);
|
||||
if (launcher == null) {
|
||||
// Container not launched. So nothing needs to be done.
|
||||
return;
|
||||
}
|
||||
Future<Integer> rContainer = rContainerDatum.runningcontainer;
|
||||
if (rContainer != null
|
||||
&& !rContainer.isDone()) {
|
||||
// Cancel the future so that it won't be launched if it isn't already.
|
||||
// If it is going to be canceled, make sure CONTAINER_KILLED_ON_REQUEST
|
||||
// will not be missed if the container is already at KILLING
|
||||
if (rContainer.cancel(false)) {
|
||||
if (container.getContainerState() == ContainerState.KILLING) {
|
||||
dispatcher.getEventHandler().handle(
|
||||
new ContainerExitEvent(containerId,
|
||||
ContainerEventType.CONTAINER_KILLED_ON_REQUEST,
|
||||
Shell.WINDOWS ? ExitCode.FORCE_KILLED.getExitCode() :
|
||||
ExitCode.TERMINATED.getExitCode(),
|
||||
"Container terminated before launch."));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Cleanup a container whether it is running/killed/completed, so that
|
||||
// no sub-processes are alive.
|
||||
try {
|
||||
rContainerDatum.launcher.cleanupContainer();
|
||||
launcher.cleanupContainer();
|
||||
} catch (IOException e) {
|
||||
LOG.warn("Got exception while cleaning container " + containerId
|
||||
+ ". Ignoring.");
|
||||
|
|
|
@ -65,6 +65,7 @@ import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
|||
import org.apache.hadoop.yarn.event.DrainDispatcher;
|
||||
import org.apache.hadoop.yarn.event.EventHandler;
|
||||
import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.ExitCode;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.Context;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.AuxServicesEvent;
|
||||
|
@ -72,6 +73,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.AuxServicesEve
|
|||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationEvent;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationEventType;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainersLauncher;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainersLauncherEvent;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainersLauncherEventType;
|
||||
|
@ -296,8 +298,7 @@ public class TestContainer {
|
|||
wc.launchContainer();
|
||||
reset(wc.localizerBus);
|
||||
wc.killContainer();
|
||||
assertEquals(ContainerState.CONTAINER_CLEANEDUP_AFTER_KILL,
|
||||
wc.c.getContainerState());
|
||||
assertEquals(ContainerState.KILLING, wc.c.getContainerState());
|
||||
assertNull(wc.c.getLocalizedResources());
|
||||
wc.containerKilledOnRequest();
|
||||
|
||||
|
@ -330,14 +331,18 @@ public class TestContainer {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void testKillOnLocalized() throws Exception {
|
||||
public void testKillOnLocalizedWhenContainerNotLaunched() throws Exception {
|
||||
WrappedContainer wc = null;
|
||||
try {
|
||||
wc = new WrappedContainer(17, 314159265358979L, 4344, "yak");
|
||||
wc.initContainer();
|
||||
wc.localizeResources();
|
||||
assertEquals(ContainerState.LOCALIZED, wc.c.getContainerState());
|
||||
ContainerLaunch launcher = wc.launcher.running.get(wc.c.getContainerId());
|
||||
wc.killContainer();
|
||||
assertEquals(ContainerState.KILLING, wc.c.getContainerState());
|
||||
launcher.call();
|
||||
wc.drainDispatcherEvents();
|
||||
assertEquals(ContainerState.CONTAINER_CLEANEDUP_AFTER_KILL,
|
||||
wc.c.getContainerState());
|
||||
assertNull(wc.c.getLocalizedResources());
|
||||
|
@ -348,6 +353,31 @@ public class TestContainer {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testKillOnLocalizedWhenContainerLaunched() throws Exception {
|
||||
WrappedContainer wc = null;
|
||||
try {
|
||||
wc = new WrappedContainer(17, 314159265358979L, 4344, "yak");
|
||||
wc.initContainer();
|
||||
wc.localizeResources();
|
||||
assertEquals(ContainerState.LOCALIZED, wc.c.getContainerState());
|
||||
ContainerLaunch launcher = wc.launcher.running.get(wc.c.getContainerId());
|
||||
launcher.call();
|
||||
wc.drainDispatcherEvents();
|
||||
assertEquals(ContainerState.EXITED_WITH_FAILURE,
|
||||
wc.c.getContainerState());
|
||||
wc.killContainer();
|
||||
assertEquals(ContainerState.EXITED_WITH_FAILURE,
|
||||
wc.c.getContainerState());
|
||||
assertNull(wc.c.getLocalizedResources());
|
||||
verifyCleanupCall(wc);
|
||||
} finally {
|
||||
if (wc != null) {
|
||||
wc.finished();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testResourceLocalizedOnLocalizationFailed() throws Exception {
|
||||
|
@ -472,12 +502,10 @@ public class TestContainer {
|
|||
wc.initContainer();
|
||||
wc.localizeResources();
|
||||
wc.killContainer();
|
||||
assertEquals(ContainerState.CONTAINER_CLEANEDUP_AFTER_KILL,
|
||||
wc.c.getContainerState());
|
||||
assertEquals(ContainerState.KILLING, wc.c.getContainerState());
|
||||
assertNull(wc.c.getLocalizedResources());
|
||||
wc.launchContainer();
|
||||
assertEquals(ContainerState.CONTAINER_CLEANEDUP_AFTER_KILL,
|
||||
wc.c.getContainerState());
|
||||
assertEquals(ContainerState.KILLING, wc.c.getContainerState());
|
||||
assertNull(wc.c.getLocalizedResources());
|
||||
wc.containerKilledOnRequest();
|
||||
verifyCleanupCall(wc);
|
||||
|
@ -650,7 +678,9 @@ public class TestContainer {
|
|||
Context context = mock(Context.class);
|
||||
when(context.getApplications()).thenReturn(
|
||||
new ConcurrentHashMap<ApplicationId, Application>());
|
||||
launcher = new ContainersLauncher(context, dispatcher, null, null, null);
|
||||
ContainerExecutor executor = mock(ContainerExecutor.class);
|
||||
launcher =
|
||||
new ContainersLauncher(context, dispatcher, executor, null, null);
|
||||
// create a mock ExecutorService, which will not really launch
|
||||
// ContainerLaunch at all.
|
||||
launcher.containerLauncher = mock(ExecutorService.class);
|
||||
|
|
|
@ -682,9 +682,8 @@ public class TestContainerLaunch extends BaseContainerManagerTest {
|
|||
ContainerStatus containerStatus =
|
||||
containerManager.getContainerStatuses(gcsRequest)
|
||||
.getContainerStatuses().get(0);
|
||||
int expectedExitCode = Shell.WINDOWS ? ExitCode.FORCE_KILLED.getExitCode() :
|
||||
ExitCode.TERMINATED.getExitCode();
|
||||
Assert.assertEquals(expectedExitCode, containerStatus.getExitStatus());
|
||||
Assert.assertEquals(ExitCode.FORCE_KILLED.getExitCode(),
|
||||
containerStatus.getExitStatus());
|
||||
|
||||
// Now verify the contents of the file. Script generates a message when it
|
||||
// receives a sigterm so we look for that. We cannot perform this check on
|
||||
|
|
Loading…
Reference in New Issue