MAPREDUCE-3240. Fixed NodeManager to be able to forcefully cleanup its containers (process-trees) irrespective of whether the container succeeded, or killed. Contributed by Hitesh Shah.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1189711 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Vinod Kumar Vavilapalli 2011-10-27 12:03:17 +00:00
parent f799e61269
commit 2fe343f963
23 changed files with 851 additions and 134 deletions

View File

@ -1804,6 +1804,10 @@ Release 0.23.0 - Unreleased
MAPREDUCE-3279. Fixed TestJobHistoryParsing which assumed user name to be
mapred all the time. (Siddharth Seth via acmurthy)
MAPREDUCE-3240. Fixed NodeManager to be able to forcefully cleanup its
containers (process-trees) irrespective of whether the container succeeded,
or killed. (Hitesh Shah via vinodkv)
Release 0.22.0 - Unreleased
INCOMPATIBLE CHANGES

View File

@ -166,8 +166,6 @@ public abstract class RMContainerRequestor extends RMCommunicator {
for (ResourceRequest req : reqMap.values()) {
if (!ask.remove(req)) {
foundAll = false;
}
else {
// if ask already sent to RM, we can try and overwrite it if possible.
// send a new ask to RM with numContainers
// specified for the blacklisted host to be 0.
@ -181,7 +179,7 @@ public abstract class RMContainerRequestor extends RMCommunicator {
// we can remove this request
if (foundAll) {
remoteRequests.remove(hostName);
}
}
}
// TODO handling of rack blacklisting
// Removing from rack should be dependent on no. of failures within the rack

View File

@ -411,6 +411,20 @@ public class YarnConfiguration extends Configuration {
YARN_SECURITY_SERVICE_AUTHORIZATION_RESOURCE_LOCALIZER =
"security.resourcelocalizer.protocol.acl";
/** No. of milliseconds to wait between sending a SIGTERM and SIGKILL
* to a running container */
public static final String NM_SLEEP_DELAY_BEFORE_SIGKILL_MS =
NM_PREFIX + "sleep-delay-before-sigkill.ms";
public static final long DEFAULT_NM_SLEEP_DELAY_BEFORE_SIGKILL_MS =
250;
/** Max time to wait for a process to come up when trying to cleanup
* container resources */
public static final String NM_PROCESS_KILL_WAIT_MS =
NM_PREFIX + "process-kill-wait.ms";
public static final long DEFAULT_NM_PROCESS_KILL_WAIT_MS =
2000;
public YarnConfiguration() {
super();
}

View File

@ -366,6 +366,18 @@
<!-- <value>mapreduce.shuffle</value> -->
</property>
<property>
<description>No. of ms to wait between sending a SIGTERM and SIGKILL to a container</description>
<name>yarn.nodemanager.sleep-delay-before-sigkill.ms</name>
<value>250</value>
</property>
<property>
<description>Max time to wait for a process to come up when trying to cleanup a container</description>
<name>yarn.nodemanager.process-kill-wait.ms</name>
<value>2000</value>
</property>
<!--Map Reduce configuration-->
<property>
<name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>

View File

@ -19,12 +19,13 @@
package org.apache.hadoop.yarn.server.nodemanager;
import java.io.IOException;
import java.lang.reflect.Field;
import java.net.InetSocketAddress;
import java.util.List;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock.ReadLock;
import java.util.concurrent.locks.ReentrantReadWriteLock.WriteLock;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@ -35,6 +36,7 @@ import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.util.Shell.ShellCommandExecutor;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
import org.apache.hadoop.yarn.server.nodemanager.util.ProcessIdFileReader;
public abstract class ContainerExecutor implements Configurable {
@ -43,8 +45,12 @@ public abstract class ContainerExecutor implements Configurable {
FsPermission.createImmutable((short) 0700);
private Configuration conf;
protected ConcurrentMap<ContainerId, ShellCommandExecutor> launchCommandObjs =
new ConcurrentHashMap<ContainerId, ShellCommandExecutor>();
private ConcurrentMap<ContainerId, Path> pidFiles =
new ConcurrentHashMap<ContainerId, Path>();
private ReentrantReadWriteLock lock = new ReentrantReadWriteLock();
private final ReadLock readLock = lock.readLock();
private final WriteLock writeLock = lock.writeLock();
@Override
public void setConf(Configuration conf) {
@ -102,7 +108,8 @@ public abstract class ContainerExecutor implements Configurable {
throws IOException, InterruptedException;
public enum ExitCode {
KILLED(137);
FORCE_KILLED(137),
TERMINATED(143);
private final int code;
private ExitCode(int exitCode) {
@ -149,6 +156,66 @@ public abstract class ContainerExecutor implements Configurable {
}
}
/**
* Get the pidFile of the container.
* @param containerId
* @return the path of the pid-file for the given containerId.
*/
protected Path getPidFilePath(ContainerId containerId) {
try {
readLock.lock();
return (this.pidFiles.get(containerId));
} finally {
readLock.unlock();
}
}
/**
* Is the container still active?
* @param containerId
* @return true if the container is active else false.
*/
protected boolean isContainerActive(ContainerId containerId) {
try {
readLock.lock();
return (this.pidFiles.containsKey(containerId));
} finally {
readLock.unlock();
}
}
/**
* Mark the container as active
*
* @param containerId
* the ContainerId
* @param pidFilePath
* Path where the executor should write the pid of the launched
* process
*/
public void activateContainer(ContainerId containerId, Path pidFilePath) {
try {
writeLock.lock();
this.pidFiles.put(containerId, pidFilePath);
} finally {
writeLock.unlock();
}
}
/**
* Mark the container as inactive.
* Done iff the container is still active. Else treat it as
* a no-op
*/
public void deactivateContainer(ContainerId containerId) {
try {
writeLock.lock();
this.pidFiles.remove(containerId);
} finally {
writeLock.unlock();
}
}
/**
* Get the process-identifier for the container
*
@ -158,28 +225,15 @@ public abstract class ContainerExecutor implements Configurable {
*/
public String getProcessId(ContainerId containerID) {
String pid = null;
ShellCommandExecutor shExec = launchCommandObjs.get(containerID);
if (shExec == null) {
Path pidFile = pidFiles.get(containerID);
if (pidFile == null) {
// This container isn't even launched yet.
return pid;
}
Process proc = shExec.getProcess();
if (proc == null) {
// This happens if the command is not yet started
return pid;
}
try {
Field pidField = proc.getClass().getDeclaredField("pid");
pidField.setAccessible(true);
pid = ((Integer) pidField.get(proc)).toString();
} catch (SecurityException e) {
// SecurityManager not expected with yarn. Ignore.
} catch (NoSuchFieldException e) {
// Yarn only on UNIX for now. Ignore.
} catch (IllegalArgumentException e) {
;
} catch (IllegalAccessException e) {
;
pid = ProcessIdFileReader.getProcessId(pidFile);
} catch (IOException e) {
LOG.error("Got exception reading pid from pid-file " + pidFile, e);
}
return pid;
}

View File

@ -18,10 +18,16 @@
package org.apache.hadoop.yarn.server.nodemanager;
import static org.apache.hadoop.fs.CreateFlag.CREATE;
import static org.apache.hadoop.fs.CreateFlag.OVERWRITE;
import java.io.DataOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.PrintStream;
import java.net.InetSocketAddress;
import java.util.Arrays;
import java.util.EnumSet;
import java.util.List;
import org.apache.commons.logging.Log;
@ -48,6 +54,9 @@ public class DefaultContainerExecutor extends ContainerExecutor {
private final FileContext lfs;
private static final String WRAPPER_LAUNCH_SCRIPT =
"default_container_executor.sh";
public DefaultContainerExecutor() {
try {
this.lfs = FileContext.getLocalFSFileContext();
@ -100,8 +109,9 @@ public class DefaultContainerExecutor extends ContainerExecutor {
ConverterUtils.toString(
container.getContainerID().getApplicationAttemptId().
getApplicationId());
String[] sLocalDirs =
getConf().getStrings(YarnConfiguration.NM_LOCAL_DIRS, YarnConfiguration.DEFAULT_NM_LOCAL_DIRS);
String[] sLocalDirs = getConf().getStrings(
YarnConfiguration.NM_LOCAL_DIRS,
YarnConfiguration.DEFAULT_NM_LOCAL_DIRS);
for (String sLocalDir : sLocalDirs) {
Path usersdir = new Path(sLocalDir, ContainerLocalizer.USERCACHE);
Path userdir = new Path(usersdir, userName);
@ -124,21 +134,47 @@ public class DefaultContainerExecutor extends ContainerExecutor {
new Path(containerWorkDir, ContainerLaunch.FINAL_CONTAINER_TOKENS_FILE);
lfs.util().copy(nmPrivateTokensPath, tokenDst);
// Create new local launch wrapper script
Path wrapperScriptDst = new Path(containerWorkDir, WRAPPER_LAUNCH_SCRIPT);
DataOutputStream wrapperScriptOutStream =
lfs.create(wrapperScriptDst,
EnumSet.of(CREATE, OVERWRITE));
Path pidFile = getPidFilePath(containerId);
if (pidFile != null) {
writeLocalWrapperScript(wrapperScriptOutStream, launchDst.toUri()
.getPath().toString(), pidFile.toString());
} else {
LOG.info("Container " + containerIdStr
+ " was marked as inactive. Returning terminated error");
return ExitCode.TERMINATED.getExitCode();
}
// create log dir under app
// fork script
ShellCommandExecutor shExec = null;
try {
lfs.setPermission(launchDst,
ContainerExecutor.TASK_LAUNCH_SCRIPT_PERMISSION);
String[] command =
new String[] { "bash", "-c", launchDst.toUri().getPath().toString() };
lfs.setPermission(wrapperScriptDst,
ContainerExecutor.TASK_LAUNCH_SCRIPT_PERMISSION);
// Setup command to run
String[] command = {"bash", "-c",
wrapperScriptDst.toUri().getPath().toString()};
LOG.info("launchContainer: " + Arrays.toString(command));
shExec = new ShellCommandExecutor(
command,
new File(containerWorkDir.toUri().getPath()),
new File(containerWorkDir.toUri().getPath()),
container.getLaunchContext().getEnvironment()); // sanitized env
launchCommandObjs.put(containerId, shExec);
shExec.execute();
if (isContainerActive(containerId)) {
shExec.execute();
}
else {
LOG.info("Container " + containerIdStr +
" was marked as inactive. Returning terminated error");
return ExitCode.TERMINATED.getExitCode();
}
} catch (IOException e) {
if (null == shExec) {
return -1;
@ -151,17 +187,44 @@ public class DefaultContainerExecutor extends ContainerExecutor {
message));
return exitCode;
} finally {
launchCommandObjs.remove(containerId);
; //
}
return 0;
}
private void writeLocalWrapperScript(DataOutputStream out,
String launchScriptDst, String pidFilePath) throws IOException {
// We need to do a move as writing to a file is not atomic
// Process reading a file being written to may get garbled data
// hence write pid to tmp file first followed by a mv
StringBuilder sb = new StringBuilder("#!/bin/bash\n\n");
sb.append("echo $$ > " + pidFilePath + ".tmp\n");
sb.append("/bin/mv -f " + pidFilePath + ".tmp " + pidFilePath + "\n");
sb.append(ContainerExecutor.isSetsidAvailable? "exec setsid" : "exec");
sb.append(" /bin/bash ");
sb.append("-c ");
sb.append("\"");
sb.append(launchScriptDst);
sb.append("\"\n");
PrintStream pout = null;
try {
pout = new PrintStream(out);
pout.append(sb);
} finally {
if (out != null) {
out.close();
}
}
}
@Override
public boolean signalContainer(String user, String pid, Signal signal)
throws IOException {
final String sigpid = ContainerExecutor.isSetsidAvailable
? "-" + pid
: pid;
LOG.debug("Sending signal " + signal.getValue() + " to pid " + sigpid
+ " as user " + user);
try {
sendSignal(sigpid, Signal.NULL);
} catch (ExitCodeException e) {
@ -189,8 +252,8 @@ public class DefaultContainerExecutor extends ContainerExecutor {
*/
protected void sendSignal(String pid, Signal signal) throws IOException {
ShellCommandExecutor shexec = null;
String[] arg = { "kill", "-" + signal.getValue(), pid };
shexec = new ShellCommandExecutor(arg);
String[] arg = { "kill", "-" + signal.getValue(), pid };
shexec = new ShellCommandExecutor(arg);
shexec.execute();
}

View File

@ -155,36 +155,45 @@ public class LinuxContainerExecutor extends ContainerExecutor {
ContainerId containerId = container.getContainerID();
String containerIdStr = ConverterUtils.toString(containerId);
List<String> command = new ArrayList<String>(
Arrays.asList(containerExecutorExe,
user,
Integer.toString(Commands.LAUNCH_CONTAINER.getValue()),
appId,
containerIdStr,
containerWorkDir.toString(),
nmPrivateCotainerScriptPath.toUri().getPath().toString(),
nmPrivateTokensPath.toUri().getPath().toString()));
String[] commandArray = command.toArray(new String[command.size()]);
ShellCommandExecutor shExec =
new ShellCommandExecutor(
commandArray,
null, // NM's cwd
container.getLaunchContext().getEnvironment()); // sanitized env
launchCommandObjs.put(containerId, shExec);
// DEBUG
LOG.info("launchContainer: " + Arrays.toString(commandArray));
ShellCommandExecutor shExec = null;
try {
shExec.execute();
if (LOG.isDebugEnabled()) {
logOutput(shExec.getOutput());
Path pidFilePath = getPidFilePath(containerId);
if (pidFilePath != null) {
List<String> command = new ArrayList<String>(Arrays.asList(
containerExecutorExe, user, Integer
.toString(Commands.LAUNCH_CONTAINER.getValue()), appId,
containerIdStr, containerWorkDir.toString(),
nmPrivateCotainerScriptPath.toUri().getPath().toString(),
nmPrivateTokensPath.toUri().getPath().toString(), pidFilePath
.toString()));
String[] commandArray = command.toArray(new String[command.size()]);
shExec = new ShellCommandExecutor(commandArray, null, // NM's cwd
container.getLaunchContext().getEnvironment()); // sanitized env
// DEBUG
LOG.info("launchContainer: " + Arrays.toString(commandArray));
shExec.execute();
if (LOG.isDebugEnabled()) {
logOutput(shExec.getOutput());
}
} else {
LOG.info("Container was marked as inactive. Returning terminated error");
return ExitCode.TERMINATED.getExitCode();
}
} catch (ExitCodeException e) {
if (null == shExec) {
return -1;
}
int exitCode = shExec.getExitCode();
LOG.warn("Exit code from container is : " + exitCode);
// 143 (SIGTERM) and 137 (SIGKILL) exit codes means the container was
// terminated/killed forcefully. In all other cases, log the
// container-executor's output
if (exitCode != 143 && exitCode != 137) {
if (exitCode != ExitCode.FORCE_KILLED.getExitCode()
&& exitCode != ExitCode.TERMINATED.getExitCode()) {
LOG.warn("Exception from container-launch : ", e);
logOutput(shExec.getOutput());
String diagnostics = "Exception from container-launch: \n"
@ -197,7 +206,7 @@ public class LinuxContainerExecutor extends ContainerExecutor {
}
return exitCode;
} finally {
launchCommandObjs.remove(containerId);
; //
}
if (LOG.isDebugEnabled()) {
LOG.debug("Output from LinuxContainerExecutor's launchContainer follows:");

View File

@ -168,8 +168,6 @@ public class ContainerImpl implements Container {
.addTransition(ContainerState.LOCALIZED, ContainerState.LOCALIZED,
ContainerEventType.UPDATE_DIAGNOSTICS_MSG,
UPDATE_DIAGNOSTICS_TRANSITION)
// TODO race: Can lead to a CONTAINER_LAUNCHED event at state KILLING,
// and a container which will never be killed by the NM.
.addTransition(ContainerState.LOCALIZED, ContainerState.KILLING,
ContainerEventType.KILL_CONTAINER, new KillTransition())
@ -239,6 +237,13 @@ public class ContainerImpl implements Container {
ContainerState.DONE,
ContainerEventType.CONTAINER_RESOURCES_CLEANEDUP,
CONTAINER_DONE_TRANSITION)
// Handle a launched container during killing stage is a no-op
// as cleanup container is always handled after launch container event
// in the container launcher
.addTransition(ContainerState.KILLING,
ContainerState.KILLING,
ContainerEventType.CONTAINER_LAUNCHED,
new ContainerTransition())
// From CONTAINER_CLEANEDUP_AFTER_KILL State.
.addTransition(ContainerState.CONTAINER_CLEANEDUP_AFTER_KILL,

View File

@ -31,6 +31,7 @@ import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.Callable;
import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@ -44,11 +45,14 @@ import org.apache.hadoop.util.Shell;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.yarn.api.ApplicationConstants;
import org.apache.hadoop.yarn.api.ApplicationConstants.Environment;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.event.Dispatcher;
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor;
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.DelayedProcessKiller;
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.ExitCode;
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.Signal;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEvent;
@ -56,8 +60,10 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Cont
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerExitEvent;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService;
import org.apache.hadoop.yarn.util.ConverterUtils;
import org.apache.hadoop.yarn.server.nodemanager.util.ProcessIdFileReader;
import org.apache.hadoop.yarn.util.Apps;
import org.apache.hadoop.yarn.util.ConverterUtils;
public class ContainerLaunch implements Callable<Integer> {
private static final Log LOG = LogFactory.getLog(ContainerLaunch.class);
@ -65,12 +71,22 @@ public class ContainerLaunch implements Callable<Integer> {
public static final String CONTAINER_SCRIPT = "launch_container.sh";
public static final String FINAL_CONTAINER_TOKENS_FILE = "container_tokens";
private static final String PID_FILE_NAME_FMT = "%s.pid";
private final Dispatcher dispatcher;
private final ContainerExecutor exec;
private final Application app;
private final Container container;
private final Configuration conf;
private final LocalDirAllocator logDirsSelector;
private volatile AtomicBoolean shouldLaunchContainer = new AtomicBoolean(false);
private volatile AtomicBoolean completed = new AtomicBoolean(false);
private long sleepDelayBeforeSigKill = 250;
private long maxKillWaitTime = 2000;
private Path pidFilePath = null;
public ContainerLaunch(Configuration configuration, Dispatcher dispatcher,
ContainerExecutor exec, Application app, Container container) {
@ -80,6 +96,12 @@ public class ContainerLaunch implements Callable<Integer> {
this.container = container;
this.dispatcher = dispatcher;
this.logDirsSelector = new LocalDirAllocator(YarnConfiguration.NM_LOG_DIRS);
this.sleepDelayBeforeSigKill =
conf.getLong(YarnConfiguration.NM_SLEEP_DELAY_BEFORE_SIGKILL_MS,
YarnConfiguration.DEFAULT_NM_SLEEP_DELAY_BEFORE_SIGKILL_MS);
this.maxKillWaitTime =
conf.getLong(YarnConfiguration.NM_PROCESS_KILL_WAIT_MS,
YarnConfiguration.DEFAULT_NM_PROCESS_KILL_WAIT_MS);
}
@Override
@ -87,7 +109,8 @@ public class ContainerLaunch implements Callable<Integer> {
public Integer call() {
final ContainerLaunchContext launchContext = container.getLaunchContext();
final Map<Path,String> localResources = container.getLocalizedResources();
String containerIdStr = ConverterUtils.toString(container.getContainerID());
ContainerId containerID = container.getContainerID();
String containerIdStr = ConverterUtils.toString(containerID);
final String user = launchContext.getUser();
final List<String> command = launchContext.getCommands();
int ret = -1;
@ -145,6 +168,17 @@ public class ContainerLaunch implements Callable<Integer> {
+ ContainerLocalizer.APPCACHE + Path.SEPARATOR + appIdStr
+ Path.SEPARATOR + containerIdStr,
LocalDirAllocator.SIZE_UNKNOWN, this.conf, false);
String pidFileSuffix = String.format(ContainerLaunch.PID_FILE_NAME_FMT,
containerIdStr);
// pid file should be in nm private dir so that it is not
// accessible by users
pidFilePath = lDirAllocator.getLocalPathForWrite(
ResourceLocalizationService.NM_PRIVATE_DIR + Path.SEPARATOR
+ pidFileSuffix,
this.conf);
try {
// /////////// Write out the container-script in the nmPrivate space.
String[] localDirs =
@ -189,21 +223,36 @@ public class ContainerLaunch implements Callable<Integer> {
// LaunchContainer is a blocking call. We are here almost means the
// container is launched, so send out the event.
dispatcher.getEventHandler().handle(new ContainerEvent(
container.getContainerID(),
containerID,
ContainerEventType.CONTAINER_LAUNCHED));
ret =
exec.launchContainer(container, nmPrivateContainerScriptPath,
nmPrivateTokensPath, user, appIdStr, containerWorkDir);
// Check if the container is signalled to be killed.
if (!shouldLaunchContainer.compareAndSet(false, true)) {
LOG.info("Container " + containerIdStr + " not launched as "
+ "cleanup already called");
ret = ExitCode.TERMINATED.getExitCode();
}
else {
exec.activateContainer(containerID, pidFilePath);
ret =
exec.launchContainer(container, nmPrivateContainerScriptPath,
nmPrivateTokensPath, user, appIdStr, containerWorkDir);
}
} catch (Throwable e) {
LOG.warn("Failed to launch container", e);
dispatcher.getEventHandler().handle(new ContainerExitEvent(
launchContext.getContainerId(),
ContainerEventType.CONTAINER_EXITED_WITH_FAILURE, ret));
return ret;
} finally {
completed.set(true);
exec.deactivateContainer(containerID);
}
if (ret == ExitCode.KILLED.getExitCode()) {
LOG.debug("Container " + containerIdStr + " completed with exit code "
+ ret);
if (ret == ExitCode.FORCE_KILLED.getExitCode()
|| ret == ExitCode.TERMINATED.getExitCode()) {
// If the process was killed, Send container_cleanedup_after_kill and
// just break out of this method.
dispatcher.getEventHandler().handle(
@ -226,6 +275,114 @@ public class ContainerLaunch implements Callable<Integer> {
ContainerEventType.CONTAINER_EXITED_WITH_SUCCESS));
return 0;
}
/**
* Cleanup the container.
* Cancels the launch if launch has not started yet or signals
* the executor to not execute the process if not already done so.
* Also, sends a SIGTERM followed by a SIGKILL to the process if
* the process id is available.
* @throws IOException
*/
public void cleanupContainer() throws IOException {
ContainerId containerId = container.getContainerID();
String containerIdStr = ConverterUtils.toString(containerId);
LOG.info("Cleaning up container " + containerIdStr);
// launch flag will be set to true if process already launched
boolean alreadyLaunched = !shouldLaunchContainer.compareAndSet(false, true);
if (!alreadyLaunched) {
LOG.info("Container " + containerIdStr + " not launched."
+ " No cleanup needed to be done");
return;
}
LOG.debug("Marking container " + containerIdStr + " as inactive");
// this should ensure that if the container process has not launched
// by this time, it will never be launched
exec.deactivateContainer(containerId);
LOG.debug("Getting pid for container " + containerIdStr + " to kill"
+ " from pid file "
+ (pidFilePath != null ? pidFilePath.toString() : "null"));
// however the container process may have already started
try {
// get process id from pid file if available
// else if shell is still active, get it from the shell
String processId = null;
if (pidFilePath != null) {
processId = getContainerPid(pidFilePath);
}
// kill process
if (processId != null) {
String user = container.getLaunchContext().getUser();
LOG.debug("Sending signal to pid " + processId
+ " as user " + user
+ " for container " + containerIdStr);
if (sleepDelayBeforeSigKill > 0) {
boolean result = exec.signalContainer(user,
processId, Signal.TERM);
LOG.debug("Sent signal to pid " + processId
+ " as user " + user
+ " for container " + containerIdStr
+ ", result=" + (result? "success" : "failed"));
new DelayedProcessKiller(user,
processId, sleepDelayBeforeSigKill, Signal.KILL, exec).start();
}
}
} catch (Exception e) {
LOG.warn("Got error when trying to cleanup container " + containerIdStr
+ ", error=" + e.getMessage());
} finally {
// cleanup pid file if present
if (pidFilePath != null) {
FileContext lfs = FileContext.getLocalFSFileContext();
lfs.delete(pidFilePath, false);
}
}
}
/**
* Loop through for a time-bounded interval waiting to
* read the process id from a file generated by a running process.
* @param pidFilePath File from which to read the process id
* @return Process ID
* @throws Exception
*/
private String getContainerPid(Path pidFilePath) throws Exception {
String containerIdStr =
ConverterUtils.toString(container.getContainerID());
String processId = null;
LOG.debug("Accessing pid for container " + containerIdStr
+ " from pid file " + pidFilePath);
int sleepCounter = 0;
final int sleepInterval = 100;
// loop waiting for pid file to show up
// until either the completed flag is set which means something bad
// happened or our timer expires in which case we admit defeat
while (!completed.get()) {
processId = ProcessIdFileReader.getProcessId(pidFilePath);
if (processId != null) {
LOG.debug("Got pid " + processId + " for container "
+ containerIdStr);
break;
}
else if ((sleepCounter*sleepInterval) > maxKillWaitTime) {
LOG.info("Could not get pid for " + containerIdStr
+ ". Waited for " + maxKillWaitTime + " ms.");
break;
}
else {
++sleepCounter;
Thread.sleep(sleepInterval);
}
}
return processId;
}
private String getContainerPrivateDir(String appIdStr, String containerIdStr) {
return getAppPrivateDir(appIdStr) + Path.SEPARATOR + containerIdStr
@ -287,7 +444,7 @@ public class ContainerLaunch implements Callable<Integer> {
public String toString() {
return sb.toString();
}
}
private static void putEnvIfNotNull(
@ -374,9 +531,9 @@ public class ContainerLaunch implements Callable<Integer> {
sb.symlink(link.getKey(), link.getValue());
}
}
ArrayList<String> cmd = new ArrayList<String>(2 * command.size() + 5);
cmd.add(ContainerExecutor.isSetsidAvailable ? "exec setsid " : "exec ");
cmd.add("/bin/bash ");
cmd.add("exec /bin/bash ");
cmd.add("-c ");
cmd.add("\"");
for (String cs : command) {

View File

@ -26,15 +26,17 @@ import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileContext;
import org.apache.hadoop.fs.UnsupportedFileSystemException;
import org.apache.hadoop.yarn.YarnException;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.event.Dispatcher;
import org.apache.hadoop.yarn.event.EventHandler;
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor;
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.Signal;
import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
@ -52,6 +54,8 @@ import com.google.common.util.concurrent.ThreadFactoryBuilder;
public class ContainersLauncher extends AbstractService
implements EventHandler<ContainersLauncherEvent> {
private static final Log LOG = LogFactory.getLog(ContainersLauncher.class);
private final Context context;
private final ContainerExecutor exec;
private final Dispatcher dispatcher;
@ -64,13 +68,14 @@ public class ContainersLauncher extends AbstractService
Collections.synchronizedMap(new HashMap<ContainerId,RunningContainer>());
private static final class RunningContainer {
public RunningContainer(String string, Future<Integer> submit) {
this.user = string;
public RunningContainer(Future<Integer> submit,
ContainerLaunch launcher) {
this.runningcontainer = submit;
this.launcher = launcher;
}
String user;
Future<Integer> runningcontainer;
ContainerLaunch launcher;
}
@ -104,7 +109,6 @@ public class ContainersLauncher extends AbstractService
// TODO: ContainersLauncher launches containers one by one!!
Container container = event.getContainer();
ContainerId containerId = container.getContainerID();
String userName = container.getUser();
switch (event.getType()) {
case LAUNCH_CONTAINER:
Application app =
@ -114,33 +118,26 @@ public class ContainersLauncher extends AbstractService
new ContainerLaunch(getConfig(), dispatcher, exec, app,
event.getContainer());
running.put(containerId,
new RunningContainer(userName,
containerLauncher.submit(launch)));
new RunningContainer(containerLauncher.submit(launch),
launch));
break;
case CLEANUP_CONTAINER:
RunningContainer rContainerDatum = running.remove(containerId);
Future<Integer> rContainer = rContainerDatum.runningcontainer;
if (rContainer != null) {
if (rContainer.isDone()) {
// The future is already done by this time.
break;
}
// Cancel the future so that it won't be launched if it isn't already.
if (rContainer != null
&& !rContainer.isDone()) {
// Cancel the future so that it won't be launched
// if it isn't already.
rContainer.cancel(false);
// Kill the container
String processId = exec.getProcessId(containerId);
if (processId != null) {
try {
exec.signalContainer(rContainerDatum.user,
processId, Signal.KILL);
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}
// Cleanup a container whether it is running/killed/completed, so that
// no sub-processes are alive.
try {
rContainerDatum.launcher.cleanupContainer();
} catch (IOException e) {
LOG.warn("Got exception while cleaning container " + containerId
+ ". Ignoring.");
}
break;
}

View File

@ -0,0 +1,74 @@
package org.apache.hadoop.yarn.server.nodemanager.util;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
/**
* Helper functionality to read the pid from a file.
*/
public class ProcessIdFileReader {
private static final Log LOG = LogFactory.getLog(ProcessIdFileReader.class);
/**
* Get the process id from specified file path.
* Parses each line to find a valid number
* and returns the first one found.
* @return Process Id if obtained from path specified else null
* @throws IOException
*/
public static String getProcessId(Path path) throws IOException {
if (path == null) {
throw new IOException("Trying to access process id from a null path");
}
LOG.debug("Accessing pid from pid file " + path);
String processId = null;
FileReader fileReader = null;
BufferedReader bufReader = null;
try {
File file = new File(path.toString());
if (file.exists()) {
fileReader = new FileReader(file);
bufReader = new BufferedReader(fileReader);
while (true) {
String line = bufReader.readLine();
if (line == null) {
break;
}
String temp = line.trim();
if (!temp.isEmpty()) {
try {
Long pid = Long.valueOf(temp);
if (pid > 0) {
processId = temp;
break;
}
} catch (Exception e) {
// do nothing
}
}
}
}
} finally {
if (fileReader != null) {
fileReader.close();
}
if (bufReader != null) {
bufReader.close();
}
}
LOG.debug("Got pid "
+ (processId != null? processId : "null")
+ " from path " + path);
return processId;
}
}

View File

@ -45,6 +45,9 @@ FILE* ERRORFILE = NULL;
static uid_t nm_uid = -1;
static gid_t nm_gid = -1;
char *concatenate(char *concat_pattern, char *return_path_name,
int numArgs, ...);
void set_nm_uid(uid_t user, gid_t group) {
nm_uid = user;
nm_gid = group;
@ -147,6 +150,60 @@ static int change_effective_user(uid_t user, gid_t group) {
return 0;
}
/**
* Write the pid of the current process into the pid file.
* pid_file: Path to pid file where pid needs to be written to
*/
static int write_pid_to_file_as_nm(const char* pid_file, pid_t pid) {
uid_t user = geteuid();
gid_t group = getegid();
if (change_effective_user(nm_uid, nm_gid) != 0) {
return -1;
}
char *temp_pid_file = concatenate("%s.tmp", "pid_file_path", 1, pid_file);
// create with 700
int pid_fd = open(temp_pid_file, O_WRONLY|O_CREAT|O_EXCL, S_IRWXU);
if (pid_fd == -1) {
fprintf(LOGFILE, "Can't open file %s as node manager - %s\n", temp_pid_file,
strerror(errno));
free(temp_pid_file);
return -1;
}
// write pid to temp file
char pid_buf[21];
snprintf(pid_buf, 21, "%d", pid);
ssize_t written = write(pid_fd, pid_buf, strlen(pid_buf));
close(pid_fd);
if (written == -1) {
fprintf(LOGFILE, "Failed to write pid to file %s as node manager - %s\n",
temp_pid_file, strerror(errno));
free(temp_pid_file);
return -1;
}
// rename temp file to actual pid file
// use rename as atomic
if (rename(temp_pid_file, pid_file)) {
fprintf(LOGFILE, "Can't move pid file from %s to %s as node manager - %s\n",
temp_pid_file, pid_file, strerror(errno));
unlink(temp_pid_file);
free(temp_pid_file);
return -1;
}
// Revert back to the calling user.
if (change_effective_user(user, group)) {
free(temp_pid_file);
return -1;
}
free(temp_pid_file);
return 0;
}
/**
* Change the real and effective user and group to abandon the super user
* priviledges.
@ -749,7 +806,8 @@ int initialize_app(const char *user, const char *app_id,
int launch_container_as_user(const char *user, const char *app_id,
const char *container_id, const char *work_dir,
const char *script_name, const char *cred_file) {
const char *script_name, const char *cred_file,
const char* pid_file) {
int exit_code = -1;
char *script_file_dest = NULL;
char *cred_file_dest = NULL;
@ -776,6 +834,20 @@ int launch_container_as_user(const char *user, const char *app_id,
goto cleanup;
}
// setsid
pid_t pid = setsid();
if (pid == -1) {
exit_code = SETSID_OPER_FAILED;
goto cleanup;
}
// write pid to pidfile
if (pid_file == NULL
|| write_pid_to_file_as_nm(pid_file, pid) != 0) {
exit_code = WRITE_PIDFILE_FAILED;
goto cleanup;
}
// give up root privs
if (change_user(user_detail->pw_uid, user_detail->pw_gid) != 0) {
exit_code = SETUID_OPER_FAILED;
@ -1031,3 +1103,5 @@ int delete_as_user(const char *user,
}
return ret;
}

View File

@ -51,7 +51,9 @@ enum errorcodes {
UNABLE_TO_BUILD_PATH, //21
INVALID_CONTAINER_EXEC_PERMISSIONS, //22
// PREPARE_JOB_LOGS_FAILED (NOT USED) 23
INVALID_CONFIG_FILE = 24
INVALID_CONFIG_FILE = 24,
SETSID_OPER_FAILED = 25,
WRITE_PIDFILE_FAILED = 26
};
#define NM_GROUP_KEY "yarn.nodemanager.linux-container-executor.group"
@ -106,11 +108,13 @@ int initialize_app(const char *user, const char *app_id,
* @param script_name the name of the script to be run to launch the container.
* @param cred_file the credentials file that needs to be compied to the
* working directory.
* @param pid_file file where pid of process should be written to
* @return -1 or errorcode enum value on error (should never return on success).
*/
int launch_container_as_user(const char * user, const char *app_id,
const char *container_id, const char *work_dir,
const char *script_name, const char *cred_file);
const char *script_name, const char *cred_file,
const char *pid_file);
/**
* Function used to signal a container launched by the user.

View File

@ -66,6 +66,7 @@ int main(int argc, char **argv) {
const char * cred_file = NULL;
const char * script_file = NULL;
const char * current_dir = NULL;
const char * pid_file = NULL;
int exit_code = 0;
@ -141,7 +142,7 @@ int main(int argc, char **argv) {
argv + optind);
break;
case LAUNCH_CONTAINER:
if (argc < 8) {
if (argc < 9) {
fprintf(ERRORFILE, "Too few arguments (%d vs 8) for launch container\n",
argc);
fflush(ERRORFILE);
@ -152,8 +153,9 @@ int main(int argc, char **argv) {
current_dir = argv[optind++];
script_file = argv[optind++];
cred_file = argv[optind++];
pid_file = argv[optind++];
exit_code = launch_container_as_user(user_detail->pw_name, app_id, container_id,
current_dir, script_file, cred_file);
current_dir, script_file, cred_file, pid_file);
break;
case SIGNAL_CONTAINER:
if (argc < 5) {

View File

@ -590,6 +590,7 @@ void test_run_container() {
fflush(stderr);
char* container_dir = get_container_work_directory(TEST_ROOT "/local-1",
username, "app_4", "container_1");
const char * pid_file = TEST_ROOT "/pid.txt";
pid_t child = fork();
if (child == -1) {
printf("FAIL: failed to fork process for init_app - %s\n",
@ -597,7 +598,7 @@ void test_run_container() {
exit(1);
} else if (child == 0) {
if (launch_container_as_user(username, "app_4", "container_1",
container_dir, script_name, TEST_ROOT "creds.txt") != 0) {
container_dir, script_name, TEST_ROOT "/creds.txt", pid_file) != 0) {
printf("FAIL: failed in child\n");
exit(42);
}
@ -631,6 +632,32 @@ void test_run_container() {
exit(1);
}
free(container_dir);
if(access(pid_file, R_OK) != 0) {
printf("FAIL: failed to create pid file %s\n", pid_file);
exit(1);
}
int pidfd = open(pid_file, O_RDONLY);
if (pidfd == -1) {
printf("FAIL: failed to open pid file %s - %s\n", pid_file, strerror(errno));
exit(1);
}
char pidBuf[100];
ssize_t bytes = read(pidfd, pidBuf, 100);
if (bytes == -1) {
printf("FAIL: failed to read from pid file %s - %s\n", pid_file, strerror(errno));
exit(1);
}
pid_t mypid = child;
char myPidBuf[33];
snprintf(myPidBuf, 33, "%d", mypid);
if (strncmp(pidBuf, myPidBuf, strlen(myPidBuf)) != 0) {
printf("FAIL: failed to find matching pid in pid file\n");
printf("FAIL: Expected pid %d : Got %.*s", mypid, (int)bytes, pidBuf);
exit(1);
}
}
int main(int argc, char **argv) {

View File

@ -185,7 +185,9 @@ public class TestLinuxContainerExecutor {
Path scriptPath = new Path(script);
Path tokensPath = new Path("/dev/null");
Path workDir = new Path(workSpace.getAbsolutePath());
Path pidFile = new Path(workDir, "pid.txt");
exec.activateContainer(cId, pidFile);
return exec.launchContainer(container, scriptPath, tokensPath,
appSubmitter, appId, workDir);
}

View File

@ -60,7 +60,8 @@ public class TestLinuxContainerExecutorWithMocks {
private List<String> readMockParams() throws IOException {
LinkedList<String> ret = new LinkedList<String>();
LineNumberReader reader = new LineNumberReader(new FileReader(mockParamFile));
LineNumberReader reader = new LineNumberReader(new FileReader(
mockParamFile));
String line;
while((line = reader.readLine()) != null) {
ret.add(line);
@ -70,7 +71,7 @@ public class TestLinuxContainerExecutorWithMocks {
}
@Before
public void setup() throws IOException {
public void setup() {
File f = new File("./src/test/resources/mock-container-executor");
if(!f.canExecute()) {
f.setExecutable(true);
@ -83,7 +84,7 @@ public class TestLinuxContainerExecutorWithMocks {
}
@After
public void tearDown() throws IOException {
public void tearDown() {
deleteMockParamFile();
}
@ -109,11 +110,14 @@ public class TestLinuxContainerExecutorWithMocks {
Path scriptPath = new Path("file:///bin/echo");
Path tokensPath = new Path("file:///dev/null");
Path workDir = new Path("/tmp");
int ret = mockExec.launchContainer(container, scriptPath, tokensPath,
Path pidFile = new Path(workDir, "pid.txt");
mockExec.activateContainer(cId, pidFile);
int ret = mockExec.launchContainer(container, scriptPath, tokensPath,
appSubmitter, appId, workDir);
assertEquals(0, ret);
assertEquals(Arrays.asList(appSubmitter, cmd, appId, containerId,
workDir.toString(), "/bin/echo", "/dev/null"),
assertEquals(Arrays.asList(appSubmitter, cmd, appId, containerId,
workDir.toString(), "/bin/echo", "/dev/null", pidFile),
readMockParams());
}
@ -141,4 +145,4 @@ public class TestLinuxContainerExecutorWithMocks {
assertEquals(Arrays.asList(appSubmitter, cmd, "/tmp/testdir"),
readMockParams());
}
}
}

View File

@ -280,7 +280,7 @@ public class TestContainerManager extends BaseContainerManagerTest {
gcsRequest.setContainerId(cId);
ContainerStatus containerStatus =
containerManager.getContainerStatus(gcsRequest).getStatus();
Assert.assertEquals(ExitCode.KILLED.getExitCode(),
Assert.assertEquals(ExitCode.TERMINATED.getExitCode(),
containerStatus.getExitStatus());
// Assert that the process is not alive anymore

View File

@ -168,7 +168,7 @@ public class TestContainer {
wc.localizeResources();
wc.launchContainer();
reset(wc.localizerBus);
wc.containerFailed(ExitCode.KILLED.getExitCode());
wc.containerFailed(ExitCode.FORCE_KILLED.getExitCode());
assertEquals(ContainerState.EXITED_WITH_FAILURE,
wc.c.getContainerState());
verifyCleanupCall(wc);
@ -268,6 +268,26 @@ public class TestContainer {
}
}
@Test
public void testLaunchAfterKillRequest() throws Exception {
WrappedContainer wc = null;
try {
wc = new WrappedContainer(14, 314159265358979L, 4344, "yak");
wc.initContainer();
wc.localizeResources();
wc.killContainer();
assertEquals(ContainerState.KILLING, wc.c.getContainerState());
wc.launchContainer();
assertEquals(ContainerState.KILLING, wc.c.getContainerState());
wc.containerKilledOnRequest();
verifyCleanupCall(wc);
} finally {
if (wc != null) {
wc.finished();
}
}
}
private void verifyCleanupCall(WrappedContainer wc) throws Exception {
ResourcesReleasedMatcher matchesReq =
new ResourcesReleasedMatcher(wc.localResources, EnumSet.of(
@ -511,7 +531,7 @@ public class TestContainer {
public void containerKilledOnRequest() {
c.handle(new ContainerExitEvent(cId,
ContainerEventType.CONTAINER_KILLED_ON_REQUEST, ExitCode.KILLED
ContainerEventType.CONTAINER_KILLED_ON_REQUEST, ExitCode.FORCE_KILLED
.getExitCode()));
drainDispatcherEvents();
}

View File

@ -32,7 +32,6 @@ import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.UnsupportedFileSystemException;
@ -59,7 +58,6 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.BaseContainerM
import org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch;
import org.apache.hadoop.yarn.util.ConverterUtils;
import org.apache.hadoop.yarn.util.LinuxResourceCalculatorPlugin;
import org.apache.hadoop.yarn.util.ProcfsBasedProcessTree;
import org.apache.hadoop.yarn.util.ResourceCalculatorPlugin;
import org.junit.Before;
import org.junit.Test;
@ -76,22 +74,21 @@ public class TestContainerLaunch extends BaseContainerManagerTest {
conf.setClass(
YarnConfiguration.NM_CONTAINER_MON_RESOURCE_CALCULATOR,
LinuxResourceCalculatorPlugin.class, ResourceCalculatorPlugin.class);
conf.setLong(YarnConfiguration.NM_SLEEP_DELAY_BEFORE_SIGKILL_MS, 1000);
super.setup();
}
@Test
public void testSpecialCharSymlinks() throws IOException {
String rootDir = new File(System.getProperty(
"test.build.data", "/tmp")).getAbsolutePath();
File shellFile = null;
File tempFile = null;
String badSymlink = "foo@zz%_#*&!-+= bar()";
File symLinkFile = null;
try {
shellFile = new File(rootDir, "hello.sh");
tempFile = new File(rootDir, "temp.sh");
shellFile = new File(tmpDir, "hello.sh");
tempFile = new File(tmpDir, "temp.sh");
String timeoutCommand = "echo \"hello\"";
PrintWriter writer = new PrintWriter(new FileOutputStream(shellFile));
shellFile.setExecutable(true);
@ -113,15 +110,14 @@ public class TestContainerLaunch extends BaseContainerManagerTest {
fos.close();
tempFile.setExecutable(true);
File rootDirFile = new File(rootDir);
Shell.ShellCommandExecutor shexc
= new Shell.ShellCommandExecutor(new String[]{tempFile.getAbsolutePath()}, rootDirFile);
= new Shell.ShellCommandExecutor(new String[]{tempFile.getAbsolutePath()}, tmpDir);
shexc.execute();
assertEquals(shexc.getExitCode(), 0);
assert(shexc.getOutput().contains("hello"));
symLinkFile = new File(rootDir, badSymlink);
symLinkFile = new File(tmpDir, badSymlink);
}
finally {
// cleanup
@ -141,6 +137,7 @@ public class TestContainerLaunch extends BaseContainerManagerTest {
}
// this is a dirty hack - but should be ok for a unittest.
@SuppressWarnings({ "rawtypes", "unchecked" })
public static void setNewEnvironmentHack(Map<String, String> newenv) throws Exception {
Class[] classes = Collections.class.getDeclaredClasses();
Map<String, String> env = System.getenv();
@ -162,7 +159,6 @@ public class TestContainerLaunch extends BaseContainerManagerTest {
*/
@Test
public void testContainerEnvVariables() throws Exception {
int exitCode = 0;
containerManager.start();
Map<String, String> envWithDummy = new HashMap<String, String>();
@ -217,7 +213,7 @@ public class TestContainerLaunch extends BaseContainerManagerTest {
new HashMap<String, LocalResource>();
localResources.put(destinationFile, rsrc_alpha);
containerLaunchContext.setLocalResources(localResources);
// set up the rest of the container
containerLaunchContext.setUser(containerLaunchContext.getUser());
List<String> commands = new ArrayList<String>();
@ -226,11 +222,11 @@ public class TestContainerLaunch extends BaseContainerManagerTest {
containerLaunchContext.setCommands(commands);
containerLaunchContext.setResource(recordFactory
.newRecordInstance(Resource.class));
containerLaunchContext.getResource().setMemory(100 * 1024 * 1024);
containerLaunchContext.getResource().setMemory(1024);
StartContainerRequest startRequest = recordFactory.newRecordInstance(StartContainerRequest.class);
startRequest.setContainerLaunchContext(containerLaunchContext);
containerManager.startContainer(startRequest);
int timeoutSecs = 0;
while (!processStartFile.exists() && timeoutSecs++ < 20) {
Thread.sleep(1000);
@ -238,7 +234,7 @@ public class TestContainerLaunch extends BaseContainerManagerTest {
}
Assert.assertTrue("ProcessStartFile doesn't exist!",
processStartFile.exists());
// Now verify the contents of the file
BufferedReader reader =
new BufferedReader(new FileReader(processStartFile));
@ -265,13 +261,13 @@ public class TestContainerLaunch extends BaseContainerManagerTest {
BaseContainerManagerTest.waitForContainerState(containerManager, cId,
ContainerState.COMPLETE);
GetContainerStatusRequest gcsRequest =
recordFactory.newRecordInstance(GetContainerStatusRequest.class);
gcsRequest.setContainerId(cId);
ContainerStatus containerStatus =
containerManager.getContainerStatus(gcsRequest).getStatus();
Assert.assertEquals(ExitCode.KILLED.getExitCode(),
Assert.assertEquals(ExitCode.TERMINATED.getExitCode(),
containerStatus.getExitStatus());
// Assert that the process is not alive anymore
@ -279,4 +275,119 @@ public class TestContainerLaunch extends BaseContainerManagerTest {
exec.signalContainer(user,
pid, Signal.NULL));
}
@Test
public void testDelayedKill() throws Exception {
containerManager.start();
File processStartFile =
new File(tmpDir, "pid.txt").getAbsoluteFile();
// setup a script that can handle sigterm gracefully
File scriptFile = new File(tmpDir, "testscript.sh");
PrintWriter writer = new PrintWriter(new FileOutputStream(scriptFile));
writer.println("#!/bin/bash\n\n");
writer.println("echo \"Running testscript for delayed kill\"");
writer.println("hello=\"Got SIGTERM\"");
writer.println("umask 0");
writer.println("trap \"echo $hello >> " + processStartFile + "\" SIGTERM");
writer.println("echo \"Writing pid to start file\"");
writer.println("echo $$ >> " + processStartFile);
writer.println("while true; do\nsleep 1s;\ndone");
writer.close();
scriptFile.setExecutable(true);
ContainerLaunchContext containerLaunchContext =
recordFactory.newRecordInstance(ContainerLaunchContext.class);
// ////// Construct the Container-id
ApplicationId appId = recordFactory.newRecordInstance(ApplicationId.class);
appId.setClusterTimestamp(1);
appId.setId(1);
ApplicationAttemptId appAttemptId =
recordFactory.newRecordInstance(ApplicationAttemptId.class);
appAttemptId.setApplicationId(appId);
appAttemptId.setAttemptId(1);
ContainerId cId =
recordFactory.newRecordInstance(ContainerId.class);
cId.setApplicationAttemptId(appAttemptId);
containerLaunchContext.setContainerId(cId);
containerLaunchContext.setUser(user);
// upload the script file so that the container can run it
URL resource_alpha =
ConverterUtils.getYarnUrlFromPath(localFS
.makeQualified(new Path(scriptFile.getAbsolutePath())));
LocalResource rsrc_alpha =
recordFactory.newRecordInstance(LocalResource.class);
rsrc_alpha.setResource(resource_alpha);
rsrc_alpha.setSize(-1);
rsrc_alpha.setVisibility(LocalResourceVisibility.APPLICATION);
rsrc_alpha.setType(LocalResourceType.FILE);
rsrc_alpha.setTimestamp(scriptFile.lastModified());
String destinationFile = "dest_file.sh";
Map<String, LocalResource> localResources =
new HashMap<String, LocalResource>();
localResources.put(destinationFile, rsrc_alpha);
containerLaunchContext.setLocalResources(localResources);
// set up the rest of the container
containerLaunchContext.setUser(containerLaunchContext.getUser());
List<String> commands = new ArrayList<String>();
commands.add(scriptFile.getAbsolutePath());
containerLaunchContext.setCommands(commands);
containerLaunchContext.setResource(recordFactory
.newRecordInstance(Resource.class));
containerLaunchContext.getResource().setMemory(1024);
StartContainerRequest startRequest = recordFactory.newRecordInstance(StartContainerRequest.class);
startRequest.setContainerLaunchContext(containerLaunchContext);
containerManager.startContainer(startRequest);
int timeoutSecs = 0;
while (!processStartFile.exists() && timeoutSecs++ < 20) {
Thread.sleep(1000);
LOG.info("Waiting for process start-file to be created");
}
Assert.assertTrue("ProcessStartFile doesn't exist!",
processStartFile.exists());
// Now test the stop functionality.
StopContainerRequest stopRequest = recordFactory.newRecordInstance(StopContainerRequest.class);
stopRequest.setContainerId(cId);
containerManager.stopContainer(stopRequest);
BaseContainerManagerTest.waitForContainerState(containerManager, cId,
ContainerState.COMPLETE);
// container stop sends a sigterm followed by a sigkill
GetContainerStatusRequest gcsRequest =
recordFactory.newRecordInstance(GetContainerStatusRequest.class);
gcsRequest.setContainerId(cId);
ContainerStatus containerStatus =
containerManager.getContainerStatus(gcsRequest).getStatus();
Assert.assertEquals(ExitCode.FORCE_KILLED.getExitCode(),
containerStatus.getExitStatus());
// Now verify the contents of the file
// Script generates a message when it receives a sigterm
// so we look for that
BufferedReader reader =
new BufferedReader(new FileReader(processStartFile));
boolean foundSigTermMessage = false;
while (true) {
String line = reader.readLine();
if (line == null) {
break;
}
if (line.contains("SIGTERM")) {
foundSigTermMessage = true;
break;
}
}
Assert.assertTrue("Did not find sigterm message", foundSigTermMessage);
reader.close();
}
}

View File

@ -262,7 +262,7 @@ public class TestContainersMonitor extends BaseContainerManagerTest {
gcsRequest.setContainerId(cId);
ContainerStatus containerStatus =
containerManager.getContainerStatus(gcsRequest).getStatus();
Assert.assertEquals(ExitCode.KILLED.getExitCode(),
Assert.assertEquals(ExitCode.TERMINATED.getExitCode(),
containerStatus.getExitStatus());
String expectedMsgPattern =
"Container \\[pid=" + pid + ",containerID=" + cId

View File

@ -0,0 +1,86 @@
package org.apache.hadoop.yarn.server.nodemanager.util;
import static org.junit.Assert.*;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import junit.framework.Assert;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.yarn.server.nodemanager.util.ProcessIdFileReader;
import org.junit.Test;
public class TestProcessIdFileReader {
@Test
public void testNullPath() {
String pid = null;
try {
pid = ProcessIdFileReader.getProcessId(null);
fail("Expected an error to be thrown for null path");
} catch (Exception e) {
// expected
}
assert(pid == null);
}
@Test
public void testSimpleGet() throws IOException {
String rootDir = new File(System.getProperty(
"test.build.data", "/tmp")).getAbsolutePath();
File testFile = null;
try {
testFile = new File(rootDir, "temp.txt");
PrintWriter fileWriter = new PrintWriter(testFile);
fileWriter.println("56789");
fileWriter.close();
String processId = null;
processId = ProcessIdFileReader.getProcessId(
new Path(rootDir + Path.SEPARATOR + "temp.txt"));
Assert.assertEquals("56789", processId);
} finally {
if (testFile != null
&& testFile.exists()) {
testFile.delete();
}
}
}
@Test
public void testComplexGet() throws IOException {
String rootDir = new File(System.getProperty(
"test.build.data", "/tmp")).getAbsolutePath();
File testFile = null;
try {
testFile = new File(rootDir, "temp.txt");
PrintWriter fileWriter = new PrintWriter(testFile);
fileWriter.println(" ");
fileWriter.println("");
fileWriter.println("abc");
fileWriter.println("-123");
fileWriter.println("-123 ");
fileWriter.println(" 23 ");
fileWriter.println("6236");
fileWriter.close();
String processId = null;
processId = ProcessIdFileReader.getProcessId(
new Path(rootDir + Path.SEPARATOR + "temp.txt"));
Assert.assertEquals("23", processId);
} finally {
if (testFile != null
&& testFile.exists()) {
testFile.delete();
}
}
}
}

View File

@ -5,6 +5,6 @@ do
done > params.txt
if [[ "$2" == "1" ]];
then
cd $5;
exec $6;
cd $6;
exec $7;
fi;