From c5a46d4c8ca236ff641a309f256bbbdf4dd56db5 Mon Sep 17 00:00:00 2001 From: Vinod Kumar Vavilapalli Date: Mon, 3 Nov 2014 16:38:55 -0800 Subject: [PATCH] YARN-1922. Fixed NodeManager to kill process-trees correctly in the presence of races between the launch and the stop-container call and when root processes crash. Contributed by Billie Rinaldi. --- hadoop-yarn-project/CHANGES.txt | 4 + .../launcher/ContainerLaunch.java | 5 +- .../launcher/TestContainerLaunch.java | 111 ++++++++++++++++++ 3 files changed, 117 insertions(+), 3 deletions(-) diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index 25d03f4e8e5..f7c0dfa0605 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -852,6 +852,10 @@ Release 2.6.0 - UNRELEASED YARN-2795. Fixed ResourceManager to not crash loading node-label data from HDFS in secure mode. (Wangda Tan via vinodkv) + YARN-1922. Fixed NodeManager to kill process-trees correctly in the presence + of races between the launch and the stop-container call and when root + processes crash. (Billie Rinaldi via vinodkv) + Release 2.5.1 - 2014-09-05 INCOMPATIBLE CHANGES diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java index 434cb4e4721..57e3bb9e84c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java @@ -461,9 +461,8 @@ public class ContainerLaunch implements Callable { final int sleepInterval = 100; // loop waiting for pid file to show up - // until either the completed flag is set which means something bad - // happened or our timer expires in which case we admit defeat - while (!completed.get()) { + // until our timer expires in which case we admit defeat + while (true) { processId = ProcessIdFileReader.getProcessId(pidFilePath); if (processId != null) { LOG.debug("Got pid " + processId + " for container " diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java index 186788d29a8..001643b434e 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java @@ -75,6 +75,7 @@ import org.apache.hadoop.yarn.event.Event; import org.apache.hadoop.yarn.event.EventHandler; import org.apache.hadoop.yarn.security.ContainerTokenIdentifier; import org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus; +import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.ExitCode; import org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor; import org.apache.hadoop.yarn.server.nodemanager.containermanager.BaseContainerManagerTest; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; @@ -969,4 +970,114 @@ public class TestContainerLaunch extends BaseContainerManagerTest { assertThat(e.getMessage(), containsString(expectedMessage)); } } + + @Test + public void testKillProcessGroup() throws Exception { + Assume.assumeTrue(Shell.isSetsidAvailable); + containerManager.start(); + + // Construct the Container-id + ApplicationId appId = ApplicationId.newInstance(2, 2); + ApplicationAttemptId appAttemptId = + ApplicationAttemptId.newInstance(appId, 1); + ContainerId cId = ContainerId.newInstance(appAttemptId, 0); + File processStartFile = + new File(tmpDir, "pid.txt").getAbsoluteFile(); + File childProcessStartFile = + new File(tmpDir, "child_pid.txt").getAbsoluteFile(); + + // setup a script that can handle sigterm gracefully + File scriptFile = Shell.appendScriptExtension(tmpDir, "testscript"); + PrintWriter writer = new PrintWriter(new FileOutputStream(scriptFile)); + writer.println("#!/bin/bash\n\n"); + writer.println("echo \"Running testscript for forked process\""); + writer.println("umask 0"); + writer.println("echo $$ >> " + processStartFile); + writer.println("while true;\ndo sleep 1s;\ndone > /dev/null 2>&1 &"); + writer.println("echo $! >> " + childProcessStartFile); + writer.println("while true;\ndo sleep 1s;\ndone"); + writer.close(); + FileUtil.setExecutable(scriptFile, true); + + ContainerLaunchContext containerLaunchContext = + recordFactory.newRecordInstance(ContainerLaunchContext.class); + + // upload the script file so that the container can run it + URL resource_alpha = + ConverterUtils.getYarnUrlFromPath(localFS + .makeQualified(new Path(scriptFile.getAbsolutePath()))); + LocalResource rsrc_alpha = + recordFactory.newRecordInstance(LocalResource.class); + rsrc_alpha.setResource(resource_alpha); + rsrc_alpha.setSize(-1); + rsrc_alpha.setVisibility(LocalResourceVisibility.APPLICATION); + rsrc_alpha.setType(LocalResourceType.FILE); + rsrc_alpha.setTimestamp(scriptFile.lastModified()); + String destinationFile = "dest_file.sh"; + Map localResources = + new HashMap(); + localResources.put(destinationFile, rsrc_alpha); + containerLaunchContext.setLocalResources(localResources); + + // set up the rest of the container + List commands = Arrays.asList(Shell.getRunScriptCommand(scriptFile)); + containerLaunchContext.setCommands(commands); + Priority priority = Priority.newInstance(10); + long createTime = 1234; + Token containerToken = createContainerToken(cId, priority, createTime); + + StartContainerRequest scRequest = + StartContainerRequest.newInstance(containerLaunchContext, + containerToken); + List list = new ArrayList(); + list.add(scRequest); + StartContainersRequest allRequests = + StartContainersRequest.newInstance(list); + containerManager.startContainers(allRequests); + + int timeoutSecs = 0; + while (!processStartFile.exists() && timeoutSecs++ < 20) { + Thread.sleep(1000); + LOG.info("Waiting for process start-file to be created"); + } + Assert.assertTrue("ProcessStartFile doesn't exist!", + processStartFile.exists()); + + BufferedReader reader = + new BufferedReader(new FileReader(processStartFile)); + // Get the pid of the process + String pid = reader.readLine().trim(); + // No more lines + Assert.assertEquals(null, reader.readLine()); + reader.close(); + + reader = + new BufferedReader(new FileReader(childProcessStartFile)); + // Get the pid of the child process + String child = reader.readLine().trim(); + // No more lines + Assert.assertEquals(null, reader.readLine()); + reader.close(); + + LOG.info("Manually killing pid " + pid + ", but not child pid " + child); + Shell.execCommand(new String[]{"kill", "-9", pid}); + + BaseContainerManagerTest.waitForContainerState(containerManager, cId, + ContainerState.COMPLETE); + + Assert.assertFalse("Process is still alive!", + DefaultContainerExecutor.containerIsAlive(pid)); + + List containerIds = new ArrayList(); + containerIds.add(cId); + + GetContainerStatusesRequest gcsRequest = + GetContainerStatusesRequest.newInstance(containerIds); + + ContainerStatus containerStatus = + containerManager.getContainerStatuses(gcsRequest) + .getContainerStatuses().get(0); + Assert.assertEquals(ExitCode.FORCE_KILLED.getExitCode(), + containerStatus.getExitStatus()); + } }