YARN-814. Improving diagnostics when containers fail during launch due to various reasons like invalid env etc. Contributed by Jian He.
svn merge --ignore-ancestry -c 1504732 ../../trunk/ git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-2@1504733 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
62f557775b
commit
384c86f6e1
|
@ -57,6 +57,9 @@ Release 2.1.1-beta - UNRELEASED
|
|||
YARN-62. Modified NodeManagers to avoid AMs from abusing container tokens for
|
||||
repetitive container launches. (Omkar Vinit Joshi via vinodkv)
|
||||
|
||||
YARN-814. Improving diagnostics when containers fail during launch due to
|
||||
various reasons like invalid env etc. (Jian He via vinodkv)
|
||||
|
||||
Release 2.1.0-beta - 2013-07-02
|
||||
|
||||
INCOMPATIBLE CHANGES
|
||||
|
|
|
@ -39,8 +39,10 @@ import org.apache.hadoop.util.Shell.ShellCommandExecutor;
|
|||
import org.apache.hadoop.yarn.api.records.ContainerId;
|
||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerDiagnosticsUpdateEvent;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.util.ProcessIdFileReader;
|
||||
import org.apache.hadoop.util.Shell;
|
||||
import org.apache.hadoop.util.StringUtils;
|
||||
|
||||
public abstract class ContainerExecutor implements Configurable {
|
||||
|
||||
|
@ -298,15 +300,16 @@ public abstract class ContainerExecutor implements Configurable {
|
|||
}
|
||||
|
||||
public static class DelayedProcessKiller extends Thread {
|
||||
private Container container;
|
||||
private final String user;
|
||||
private final String pid;
|
||||
private final long delay;
|
||||
private final Signal signal;
|
||||
private final ContainerExecutor containerExecutor;
|
||||
|
||||
public DelayedProcessKiller(String user, String pid, long delay,
|
||||
Signal signal,
|
||||
ContainerExecutor containerExecutor) {
|
||||
public DelayedProcessKiller(Container container, String user, String pid,
|
||||
long delay, Signal signal, ContainerExecutor containerExecutor) {
|
||||
this.container = container;
|
||||
this.user = user;
|
||||
this.pid = pid;
|
||||
this.delay = delay;
|
||||
|
@ -323,7 +326,11 @@ public abstract class ContainerExecutor implements Configurable {
|
|||
} catch (InterruptedException e) {
|
||||
return;
|
||||
} catch (IOException e) {
|
||||
LOG.warn("Exception when killing task " + pid, e);
|
||||
String message = "Exception when user " + user + " killing task " + pid
|
||||
+ " in DelayedProcessKiller: " + StringUtils.stringifyException(e);
|
||||
LOG.warn(message);
|
||||
container.handle(new ContainerDiagnosticsUpdateEvent(container
|
||||
.getContainerId(), message));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -41,6 +41,7 @@ import org.apache.hadoop.io.IOUtils;
|
|||
import org.apache.hadoop.util.Shell;
|
||||
import org.apache.hadoop.util.Shell.ExitCodeException;
|
||||
import org.apache.hadoop.util.Shell.ShellCommandExecutor;
|
||||
import org.apache.hadoop.util.StringUtils;
|
||||
import org.apache.hadoop.yarn.api.records.ContainerId;
|
||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||
import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
|
||||
|
@ -203,11 +204,23 @@ public class DefaultContainerExecutor extends ContainerExecutor {
|
|||
return -1;
|
||||
}
|
||||
int exitCode = shExec.getExitCode();
|
||||
LOG.warn("Exit code from task is : " + exitCode);
|
||||
String message = shExec.getOutput();
|
||||
logOutput(message);
|
||||
container.handle(new ContainerDiagnosticsUpdateEvent(containerId,
|
||||
message));
|
||||
LOG.warn("Exit code from container " + containerId + " is : " + exitCode);
|
||||
// 143 (SIGTERM) and 137 (SIGKILL) exit codes means the container was
|
||||
// terminated/killed forcefully. In all other cases, log the
|
||||
// container-executor's output
|
||||
if (exitCode != ExitCode.FORCE_KILLED.getExitCode()
|
||||
&& exitCode != ExitCode.TERMINATED.getExitCode()) {
|
||||
LOG.warn("Exception from container-launch with container ID: "
|
||||
+ containerId + " and exit code: " + exitCode , e);
|
||||
logOutput(shExec.getOutput());
|
||||
String diagnostics = "Exception from container-launch: \n"
|
||||
+ StringUtils.stringifyException(e) + "\n" + shExec.getOutput();
|
||||
container.handle(new ContainerDiagnosticsUpdateEvent(containerId,
|
||||
diagnostics));
|
||||
} else {
|
||||
container.handle(new ContainerDiagnosticsUpdateEvent(containerId,
|
||||
"Container killed on request. Exit code is " + exitCode));
|
||||
}
|
||||
return exitCode;
|
||||
} finally {
|
||||
; //
|
||||
|
|
|
@ -147,7 +147,8 @@ public class LinuxContainerExecutor extends ContainerExecutor {
|
|||
shExec.execute();
|
||||
} catch (ExitCodeException e) {
|
||||
int exitCode = shExec.getExitCode();
|
||||
LOG.warn("Exit code from container is : " + exitCode);
|
||||
LOG.warn("Exit code from container executor initialization is : "
|
||||
+ exitCode, e);
|
||||
logOutput(shExec.getOutput());
|
||||
throw new IOException("Linux container executor not configured properly"
|
||||
+ " (error=" + exitCode + ")", e);
|
||||
|
@ -204,10 +205,11 @@ public class LinuxContainerExecutor extends ContainerExecutor {
|
|||
}
|
||||
} catch (ExitCodeException e) {
|
||||
int exitCode = shExec.getExitCode();
|
||||
LOG.warn("Exit code from container is : " + exitCode);
|
||||
LOG.warn("Exit code from container " + locId + " startLocalizer is : "
|
||||
+ exitCode, e);
|
||||
logOutput(shExec.getOutput());
|
||||
throw new IOException("App initialization failed (" + exitCode +
|
||||
") with output: " + shExec.getOutput(), e);
|
||||
throw new IOException("Application " + appId + " initialization failed" +
|
||||
" (exitCode=" + exitCode + ") with output: " + shExec.getOutput(), e);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -256,19 +258,18 @@ public class LinuxContainerExecutor extends ContainerExecutor {
|
|||
return ExitCode.TERMINATED.getExitCode();
|
||||
}
|
||||
} catch (ExitCodeException e) {
|
||||
|
||||
if (null == shExec) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
int exitCode = shExec.getExitCode();
|
||||
LOG.warn("Exit code from container is : " + exitCode);
|
||||
LOG.warn("Exit code from container " + containerId + " is : " + exitCode);
|
||||
// 143 (SIGTERM) and 137 (SIGKILL) exit codes means the container was
|
||||
// terminated/killed forcefully. In all other cases, log the
|
||||
// container-executor's output
|
||||
if (exitCode != ExitCode.FORCE_KILLED.getExitCode()
|
||||
&& exitCode != ExitCode.TERMINATED.getExitCode()) {
|
||||
LOG.warn("Exception from container-launch : ", e);
|
||||
LOG.warn("Exception from container-launch with container ID: "
|
||||
+ containerId + " and exit code: " + exitCode , e);
|
||||
logOutput(shExec.getOutput());
|
||||
String diagnostics = "Exception from container-launch: \n"
|
||||
+ StringUtils.stringifyException(e) + "\n" + shExec.getOutput();
|
||||
|
@ -310,9 +311,12 @@ public class LinuxContainerExecutor extends ContainerExecutor {
|
|||
if (ret_code == ResultCode.INVALID_CONTAINER_PID.getValue()) {
|
||||
return false;
|
||||
}
|
||||
LOG.warn("Error in signalling container " + pid + " with " + signal
|
||||
+ "; exit = " + ret_code, e);
|
||||
logOutput(shExec.getOutput());
|
||||
throw new IOException("Problem signalling container " + pid + " with " +
|
||||
signal + "; exit = " + ret_code);
|
||||
throw new IOException("Problem signalling container " + pid + " with "
|
||||
+ signal + "; output: " + shExec.getOutput() + " and exitCode: "
|
||||
+ ret_code, e);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
@ -346,13 +350,10 @@ public class LinuxContainerExecutor extends ContainerExecutor {
|
|||
}
|
||||
} catch (IOException e) {
|
||||
int exitCode = shExec.getExitCode();
|
||||
LOG.warn("Exit code from container is : " + exitCode);
|
||||
if (exitCode != 0) {
|
||||
LOG.error("DeleteAsUser for " + dir.toUri().getPath()
|
||||
+ " returned with non-zero exit code" + exitCode);
|
||||
LOG.error("Output from LinuxContainerExecutor's deleteAsUser follows:");
|
||||
logOutput(shExec.getOutput());
|
||||
}
|
||||
LOG.error("DeleteAsUser for " + dir.toUri().getPath()
|
||||
+ " returned with exit code: " + exitCode, e);
|
||||
LOG.error("Output from LinuxContainerExecutor's deleteAsUser follows:");
|
||||
logOutput(shExec.getOutput());
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -373,9 +374,10 @@ public class LinuxContainerExecutor extends ContainerExecutor {
|
|||
shExec.execute();
|
||||
} catch (IOException e) {
|
||||
int ret_code = shExec.getExitCode();
|
||||
LOG.warn("Exception in LinuxContainerExecutor mountCgroups ", e);
|
||||
logOutput(shExec.getOutput());
|
||||
throw new IOException("Problem mounting cgroups " + cgroupKVs +
|
||||
"; exit code = " + ret_code, e);
|
||||
"; exit code = " + ret_code + " and output: " + shExec.getOutput(), e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -61,6 +61,7 @@ import org.apache.hadoop.yarn.server.nodemanager.Context;
|
|||
import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerDiagnosticsUpdateEvent;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEvent;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEventType;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerExitEvent;
|
||||
|
@ -308,6 +309,7 @@ public class ContainerLaunch implements Callable<Integer> {
|
|||
* the process id is available.
|
||||
* @throws IOException
|
||||
*/
|
||||
@SuppressWarnings("unchecked") // dispatcher not typed
|
||||
public void cleanupContainer() throws IOException {
|
||||
ContainerId containerId = container.getContainerId();
|
||||
String containerIdStr = ConverterUtils.toString(containerId);
|
||||
|
@ -355,13 +357,17 @@ public class ContainerLaunch implements Callable<Integer> {
|
|||
+ " as user " + user
|
||||
+ " for container " + containerIdStr
|
||||
+ ", result=" + (result? "success" : "failed"));
|
||||
new DelayedProcessKiller(user,
|
||||
new DelayedProcessKiller(container, user,
|
||||
processId, sleepDelayBeforeSigKill, Signal.KILL, exec).start();
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
LOG.warn("Got error when trying to cleanup container " + containerIdStr
|
||||
+ ", error=" + e.getMessage());
|
||||
String message =
|
||||
"Exception when trying to cleanup container " + containerIdStr
|
||||
+ ": " + StringUtils.stringifyException(e);
|
||||
LOG.warn(message);
|
||||
dispatcher.getEventHandler().handle(
|
||||
new ContainerDiagnosticsUpdateEvent(containerId, message));
|
||||
} finally {
|
||||
// cleanup pid file if present
|
||||
if (pidFilePath != null) {
|
||||
|
|
|
@ -39,6 +39,7 @@ import org.apache.hadoop.fs.Path;
|
|||
import org.apache.hadoop.fs.UnsupportedFileSystemException;
|
||||
import org.apache.hadoop.security.token.SecretManager.InvalidToken;
|
||||
import org.apache.hadoop.util.Shell;
|
||||
import org.apache.hadoop.util.Shell.ExitCodeException;
|
||||
import org.apache.hadoop.util.StringUtils;
|
||||
import org.apache.hadoop.yarn.api.ApplicationConstants.Environment;
|
||||
import org.apache.hadoop.yarn.api.protocolrecords.GetContainerStatusRequest;
|
||||
|
@ -98,7 +99,7 @@ public class TestContainerLaunch extends BaseContainerManagerTest {
|
|||
tempFile = Shell.appendScriptExtension(tmpDir, "temp");
|
||||
String timeoutCommand = Shell.WINDOWS ? "@echo \"hello\"" :
|
||||
"echo \"hello\"";
|
||||
PrintWriter writer = new PrintWriter(new FileOutputStream(shellFile));
|
||||
PrintWriter writer = new PrintWriter(new FileOutputStream(shellFile));
|
||||
FileUtil.setExecutable(shellFile, true);
|
||||
writer.println(timeoutCommand);
|
||||
writer.close();
|
||||
|
@ -132,7 +133,7 @@ public class TestContainerLaunch extends BaseContainerManagerTest {
|
|||
assertEquals(shexc.getExitCode(), 0);
|
||||
assert(shexc.getOutput().contains("hello"));
|
||||
|
||||
symLinkFile = new File(tmpDir, badSymlink);
|
||||
symLinkFile = new File(tmpDir, badSymlink);
|
||||
}
|
||||
finally {
|
||||
// cleanup
|
||||
|
@ -151,6 +152,173 @@ public class TestContainerLaunch extends BaseContainerManagerTest {
|
|||
}
|
||||
}
|
||||
|
||||
// test the diagnostics are generated
|
||||
@Test (timeout = 20000)
|
||||
public void testInvalidSymlinkDiagnostics() throws IOException {
|
||||
|
||||
File shellFile = null;
|
||||
File tempFile = null;
|
||||
String symLink = Shell.WINDOWS ? "test.cmd" :
|
||||
"test";
|
||||
File symLinkFile = null;
|
||||
|
||||
try {
|
||||
shellFile = Shell.appendScriptExtension(tmpDir, "hello");
|
||||
tempFile = Shell.appendScriptExtension(tmpDir, "temp");
|
||||
String timeoutCommand = Shell.WINDOWS ? "@echo \"hello\"" :
|
||||
"echo \"hello\"";
|
||||
PrintWriter writer = new PrintWriter(new FileOutputStream(shellFile));
|
||||
FileUtil.setExecutable(shellFile, true);
|
||||
writer.println(timeoutCommand);
|
||||
writer.close();
|
||||
|
||||
Map<Path, List<String>> resources =
|
||||
new HashMap<Path, List<String>>();
|
||||
//This is an invalid path and should throw exception because of No such file.
|
||||
Path invalidPath = new Path(shellFile.getAbsolutePath()+"randomPath");
|
||||
resources.put(invalidPath, Arrays.asList(symLink));
|
||||
FileOutputStream fos = new FileOutputStream(tempFile);
|
||||
|
||||
Map<String, String> env = new HashMap<String, String>();
|
||||
List<String> commands = new ArrayList<String>();
|
||||
if (Shell.WINDOWS) {
|
||||
commands.add("cmd");
|
||||
commands.add("/c");
|
||||
commands.add("\"" + symLink + "\"");
|
||||
} else {
|
||||
commands.add("/bin/sh ./\\\"" + symLink + "\\\"");
|
||||
}
|
||||
ContainerLaunch.writeLaunchEnv(fos, env, resources, commands);
|
||||
fos.flush();
|
||||
fos.close();
|
||||
FileUtil.setExecutable(tempFile, true);
|
||||
|
||||
Shell.ShellCommandExecutor shexc
|
||||
= new Shell.ShellCommandExecutor(new String[]{tempFile.getAbsolutePath()}, tmpDir);
|
||||
String diagnostics = null;
|
||||
try {
|
||||
shexc.execute();
|
||||
Assert.fail("Should catch exception");
|
||||
} catch(ExitCodeException e){
|
||||
diagnostics = e.getMessage();
|
||||
}
|
||||
Assert.assertNotNull(diagnostics);
|
||||
Assert.assertTrue(shexc.getExitCode() != 0);
|
||||
symLinkFile = new File(tmpDir, symLink);
|
||||
}
|
||||
finally {
|
||||
// cleanup
|
||||
if (shellFile != null
|
||||
&& shellFile.exists()) {
|
||||
shellFile.delete();
|
||||
}
|
||||
if (tempFile != null
|
||||
&& tempFile.exists()) {
|
||||
tempFile.delete();
|
||||
}
|
||||
if (symLinkFile != null
|
||||
&& symLinkFile.exists()) {
|
||||
symLinkFile.delete();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Test (timeout = 20000)
|
||||
public void testInvalidEnvSyntaxDiagnostics() throws IOException {
|
||||
|
||||
File shellFile = null;
|
||||
try {
|
||||
shellFile = Shell.appendScriptExtension(tmpDir, "hello");
|
||||
String timeoutCommand = Shell.WINDOWS ? "@echo \"hello\"" :
|
||||
"echo \"hello\"";
|
||||
PrintWriter writer = new PrintWriter(new FileOutputStream(shellFile));
|
||||
FileUtil.setExecutable(shellFile, true);
|
||||
writer.println(timeoutCommand);
|
||||
writer.close();
|
||||
Map<Path, List<String>> resources =
|
||||
new HashMap<Path, List<String>>();
|
||||
FileOutputStream fos = new FileOutputStream(shellFile);
|
||||
|
||||
Map<String, String> env = new HashMap<String, String>();
|
||||
// invalid env
|
||||
env.put(
|
||||
"APPLICATION_WORKFLOW_CONTEXT", "{\"workflowId\":\"609f91c5cd83\"," +
|
||||
"\"workflowName\":\"\n\ninsert table " +
|
||||
"\npartition (cd_education_status)\nselect cd_demo_sk, cd_gender, " );
|
||||
List<String> commands = new ArrayList<String>();
|
||||
ContainerLaunch.writeLaunchEnv(fos, env, resources, commands);
|
||||
fos.flush();
|
||||
fos.close();
|
||||
|
||||
Shell.ShellCommandExecutor shexc
|
||||
= new Shell.ShellCommandExecutor(new String[]{shellFile.getAbsolutePath()}, tmpDir);
|
||||
String diagnostics = null;
|
||||
try {
|
||||
shexc.execute();
|
||||
Assert.fail("Should catch exception");
|
||||
} catch(ExitCodeException e){
|
||||
diagnostics = e.getMessage();
|
||||
}
|
||||
Assert.assertTrue(diagnostics.contains("command not found"));
|
||||
Assert.assertTrue(shexc.getExitCode() != 0);
|
||||
}
|
||||
finally {
|
||||
// cleanup
|
||||
if (shellFile != null
|
||||
&& shellFile.exists()) {
|
||||
shellFile.delete();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Test (timeout = 20000)
|
||||
public void testContainerLaunchStdoutAndStderrDiagnostics() throws IOException {
|
||||
|
||||
File shellFile = null;
|
||||
try {
|
||||
shellFile = Shell.appendScriptExtension(tmpDir, "hello");
|
||||
// echo "hello" to stdout and "error" to stderr and exit code with 2;
|
||||
String command = Shell.WINDOWS ? "@echo \"hello\"; @echo \"error\" 1>&2; exit 2;" :
|
||||
"echo \"hello\"; echo \"error\" 1>&2; exit 2;";
|
||||
PrintWriter writer = new PrintWriter(new FileOutputStream(shellFile));
|
||||
FileUtil.setExecutable(shellFile, true);
|
||||
writer.println(command);
|
||||
writer.close();
|
||||
Map<Path, List<String>> resources =
|
||||
new HashMap<Path, List<String>>();
|
||||
FileOutputStream fos = new FileOutputStream(shellFile);
|
||||
|
||||
Map<String, String> env = new HashMap<String, String>();
|
||||
List<String> commands = new ArrayList<String>();
|
||||
commands.add(command);
|
||||
ContainerLaunch.writeLaunchEnv(fos, env, resources, commands);
|
||||
fos.flush();
|
||||
fos.close();
|
||||
|
||||
Shell.ShellCommandExecutor shexc
|
||||
= new Shell.ShellCommandExecutor(new String[]{shellFile.getAbsolutePath()}, tmpDir);
|
||||
String diagnostics = null;
|
||||
try {
|
||||
shexc.execute();
|
||||
Assert.fail("Should catch exception");
|
||||
} catch(ExitCodeException e){
|
||||
diagnostics = e.getMessage();
|
||||
}
|
||||
// test stderr
|
||||
Assert.assertTrue(diagnostics.contains("error"));
|
||||
// test stdout
|
||||
Assert.assertTrue(shexc.getOutput().contains("hello"));
|
||||
Assert.assertTrue(shexc.getExitCode() == 2);
|
||||
}
|
||||
finally {
|
||||
// cleanup
|
||||
if (shellFile != null
|
||||
&& shellFile.exists()) {
|
||||
shellFile.delete();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* See if environment variable is forwarded using sanitizeEnv.
|
||||
* @throws Exception
|
||||
|
|
Loading…
Reference in New Issue