YARN-1922. Fixed NodeManager to kill process-trees correctly in the presence of races between the launch and the stop-container call and when root processes crash. Contributed by Billie Rinaldi.

(cherry picked from commit c5a46d4c8c)
This commit is contained in:
Vinod Kumar Vavilapalli 2014-11-03 16:38:55 -08:00
parent d2d2c84a10
commit 9c76dcadaf
3 changed files with 117 additions and 3 deletions

View File

@ -822,6 +822,10 @@ Release 2.6.0 - UNRELEASED
YARN-2795. Fixed ResourceManager to not crash loading node-label data from
HDFS in secure mode. (Wangda Tan via vinodkv)
YARN-1922. Fixed NodeManager to kill process-trees correctly in the presence
of races between the launch and the stop-container call and when root
processes crash. (Billie Rinaldi via vinodkv)
Release 2.5.1 - 2014-09-05
INCOMPATIBLE CHANGES

View File

@ -461,9 +461,8 @@ public class ContainerLaunch implements Callable<Integer> {
final int sleepInterval = 100;
// loop waiting for pid file to show up
// until either the completed flag is set which means something bad
// happened or our timer expires in which case we admit defeat
while (!completed.get()) {
// until our timer expires in which case we admit defeat
while (true) {
processId = ProcessIdFileReader.getProcessId(pidFilePath);
if (processId != null) {
LOG.debug("Got pid " + processId + " for container "

View File

@ -75,6 +75,7 @@ import org.apache.hadoop.yarn.event.Event;
import org.apache.hadoop.yarn.event.EventHandler;
import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
import org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus;
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.ExitCode;
import org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.BaseContainerManagerTest;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
@ -969,4 +970,114 @@ public class TestContainerLaunch extends BaseContainerManagerTest {
assertThat(e.getMessage(), containsString(expectedMessage));
}
}
@Test
public void testKillProcessGroup() throws Exception {
Assume.assumeTrue(Shell.isSetsidAvailable);
containerManager.start();
// Construct the Container-id
ApplicationId appId = ApplicationId.newInstance(2, 2);
ApplicationAttemptId appAttemptId =
ApplicationAttemptId.newInstance(appId, 1);
ContainerId cId = ContainerId.newInstance(appAttemptId, 0);
File processStartFile =
new File(tmpDir, "pid.txt").getAbsoluteFile();
File childProcessStartFile =
new File(tmpDir, "child_pid.txt").getAbsoluteFile();
// setup a script that can handle sigterm gracefully
File scriptFile = Shell.appendScriptExtension(tmpDir, "testscript");
PrintWriter writer = new PrintWriter(new FileOutputStream(scriptFile));
writer.println("#!/bin/bash\n\n");
writer.println("echo \"Running testscript for forked process\"");
writer.println("umask 0");
writer.println("echo $$ >> " + processStartFile);
writer.println("while true;\ndo sleep 1s;\ndone > /dev/null 2>&1 &");
writer.println("echo $! >> " + childProcessStartFile);
writer.println("while true;\ndo sleep 1s;\ndone");
writer.close();
FileUtil.setExecutable(scriptFile, true);
ContainerLaunchContext containerLaunchContext =
recordFactory.newRecordInstance(ContainerLaunchContext.class);
// upload the script file so that the container can run it
URL resource_alpha =
ConverterUtils.getYarnUrlFromPath(localFS
.makeQualified(new Path(scriptFile.getAbsolutePath())));
LocalResource rsrc_alpha =
recordFactory.newRecordInstance(LocalResource.class);
rsrc_alpha.setResource(resource_alpha);
rsrc_alpha.setSize(-1);
rsrc_alpha.setVisibility(LocalResourceVisibility.APPLICATION);
rsrc_alpha.setType(LocalResourceType.FILE);
rsrc_alpha.setTimestamp(scriptFile.lastModified());
String destinationFile = "dest_file.sh";
Map<String, LocalResource> localResources =
new HashMap<String, LocalResource>();
localResources.put(destinationFile, rsrc_alpha);
containerLaunchContext.setLocalResources(localResources);
// set up the rest of the container
List<String> commands = Arrays.asList(Shell.getRunScriptCommand(scriptFile));
containerLaunchContext.setCommands(commands);
Priority priority = Priority.newInstance(10);
long createTime = 1234;
Token containerToken = createContainerToken(cId, priority, createTime);
StartContainerRequest scRequest =
StartContainerRequest.newInstance(containerLaunchContext,
containerToken);
List<StartContainerRequest> list = new ArrayList<StartContainerRequest>();
list.add(scRequest);
StartContainersRequest allRequests =
StartContainersRequest.newInstance(list);
containerManager.startContainers(allRequests);
int timeoutSecs = 0;
while (!processStartFile.exists() && timeoutSecs++ < 20) {
Thread.sleep(1000);
LOG.info("Waiting for process start-file to be created");
}
Assert.assertTrue("ProcessStartFile doesn't exist!",
processStartFile.exists());
BufferedReader reader =
new BufferedReader(new FileReader(processStartFile));
// Get the pid of the process
String pid = reader.readLine().trim();
// No more lines
Assert.assertEquals(null, reader.readLine());
reader.close();
reader =
new BufferedReader(new FileReader(childProcessStartFile));
// Get the pid of the child process
String child = reader.readLine().trim();
// No more lines
Assert.assertEquals(null, reader.readLine());
reader.close();
LOG.info("Manually killing pid " + pid + ", but not child pid " + child);
Shell.execCommand(new String[]{"kill", "-9", pid});
BaseContainerManagerTest.waitForContainerState(containerManager, cId,
ContainerState.COMPLETE);
Assert.assertFalse("Process is still alive!",
DefaultContainerExecutor.containerIsAlive(pid));
List<ContainerId> containerIds = new ArrayList<ContainerId>();
containerIds.add(cId);
GetContainerStatusesRequest gcsRequest =
GetContainerStatusesRequest.newInstance(containerIds);
ContainerStatus containerStatus =
containerManager.getContainerStatuses(gcsRequest)
.getContainerStatuses().get(0);
Assert.assertEquals(ExitCode.FORCE_KILLED.getExitCode(),
containerStatus.getExitStatus());
}
}