From 9f979107c22f365633811a38363d0dbd34573374 Mon Sep 17 00:00:00 2001 From: Wangda Tan Date: Mon, 14 Dec 2015 11:13:22 -0800 Subject: [PATCH] YARN-4309. Add container launch related debug information to container logs when a container fails. (Varun Vasudev via wangda) (cherry picked from commit dfcbbddb0963c89c0455d41223427165b9f9e537) --- hadoop-yarn-project/CHANGES.txt | 3 + .../hadoop/yarn/conf/YarnConfiguration.java | 5 + .../src/main/resources/yarn-default.xml | 12 +++ .../server/nodemanager/ContainerExecutor.java | 24 ++++- .../nodemanager/DockerContainerExecutor.java | 11 +- .../launcher/ContainerLaunch.java | 74 ++++++++++++- .../launcher/TestContainerLaunch.java | 102 ++++++++++++++++-- 7 files changed, 217 insertions(+), 14 deletions(-) diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index afdd93f2c03..bde35e4e9ea 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -561,6 +561,9 @@ Release 2.8.0 - UNRELEASED YARN-3946. Update exact reason as to why a submitted app is in ACCEPTED state to app's diagnostic message. (Naganarasimha G R via wangda) + YARN-4309. Add container launch related debug information to container logs + when a container fails. (Varun Vasudev via wangda) + OPTIMIZATIONS YARN-3339. TestDockerContainerExecutor should pull a single image and not diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index 521bb3d915e..35ad12f340d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -92,6 +92,11 @@ private static void addDeprecatedKeys() { /** Delay before deleting resource to ease debugging of NM issues */ public static final String DEBUG_NM_DELETE_DELAY_SEC = YarnConfiguration.NM_PREFIX + "delete.debug-delay-sec"; + + public static final String NM_LOG_CONTAINER_DEBUG_INFO = + YarnConfiguration.NM_PREFIX + "log-container-debug-info.enabled"; + + public static final boolean DEFAULT_NM_LOG_CONTAINER_DEBUG_INFO = false; //////////////////////////////// // IPC Configs diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index 5b71b1ff6ad..13a7b1b2f7d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -1127,6 +1127,18 @@ logs + + Generate additional logs about container launches. + Currently, this creates a copy of the launch script and lists the + directory contents of the container work dir. When listing directory + contents, we follow symlinks to a max-depth of 5(including symlinks + which point to outside the container work dir) which may lead to a + slowness in launching containers. + + yarn.nodemanager.log-container-debug-info.enabled + false + + Amount of physical memory, in MB, that can be allocated for containers. If set to -1 and diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java index 6d75a1cfc64..8c74bf5831f 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java @@ -34,6 +34,7 @@ import java.util.concurrent.locks.ReentrantReadWriteLock.ReadLock; import java.util.concurrent.locks.ReentrantReadWriteLock.WriteLock; +import com.google.common.annotations.VisibleForTesting; import org.apache.commons.io.FileUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -65,6 +66,8 @@ public abstract class ContainerExecutor implements Configurable { final public static FsPermission TASK_LAUNCH_SCRIPT_PERMISSION = FsPermission.createImmutable((short) 0700); + public static final String DIRECTORY_CONTENTS = "directory.info"; + private Configuration conf; private ConcurrentMap pidFiles = @@ -241,11 +244,22 @@ public int reacquireContainer(ContainerReacquisitionContext ctx) * @param resources The resources which have been localized for this container * Symlinks will be created to these localized resources * @param command The command that will be run. + * @param logDir The log dir to copy debugging information to * @throws IOException if any errors happened writing to the OutputStream, * while creating symlinks */ public void writeLaunchEnv(OutputStream out, Map environment, - Map> resources, List command) throws IOException{ + Map> resources, List command, Path logDir) + throws IOException { + this.writeLaunchEnv(out, environment, resources, command, logDir, + ContainerLaunch.CONTAINER_SCRIPT); + } + + @VisibleForTesting + public void writeLaunchEnv(OutputStream out, + Map environment, Map> resources, + List command, Path logDir, String outFilename) + throws IOException { ContainerLaunch.ShellScriptBuilder sb = ContainerLaunch.ShellScriptBuilder.create(); Set whitelist = new HashSet(); @@ -272,6 +286,14 @@ public void writeLaunchEnv(OutputStream out, Map environment, } } + // dump debugging information if configured + if (getConf() != null && getConf().getBoolean( + YarnConfiguration.NM_LOG_CONTAINER_DEBUG_INFO, + YarnConfiguration.DEFAULT_NM_LOG_CONTAINER_DEBUG_INFO)) { + sb.copyDebugInformation(new Path(outFilename), new Path(logDir, outFilename)); + sb.listDebugInformation(new Path(logDir, DIRECTORY_CONTENTS)); + } + sb.command(command); PrintStream pout = null; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DockerContainerExecutor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DockerContainerExecutor.java index 96f8b5110b5..b089947dfa2 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DockerContainerExecutor.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DockerContainerExecutor.java @@ -329,7 +329,7 @@ public int launchContainer(ContainerStartContext ctx) throws IOException { * the docker image and write them out to an OutputStream. */ public void writeLaunchEnv(OutputStream out, Map environment, - Map> resources, List command) + Map> resources, List command, Path logDir) throws IOException { ContainerLaunch.ShellScriptBuilder sb = ContainerLaunch.ShellScriptBuilder.create(); @@ -358,6 +358,15 @@ public void writeLaunchEnv(OutputStream out, Map environment, } } + // dump debugging information if configured + if (getConf() != null && getConf().getBoolean( + YarnConfiguration.NM_LOG_CONTAINER_DEBUG_INFO, + YarnConfiguration.DEFAULT_NM_LOG_CONTAINER_DEBUG_INFO)) { + sb.copyDebugInformation(new Path(ContainerLaunch.CONTAINER_SCRIPT), + new Path(logDir, ContainerLaunch.CONTAINER_SCRIPT)); + sb.listDebugInformation(new Path(logDir, DIRECTORY_CONTENTS)); + } + sb.command(command); PrintStream pout = null; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java index fc8615bfbb8..b7a97d6b1c4 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java @@ -272,7 +272,8 @@ public Integer call() { // Write out the environment exec.writeLaunchEnv(containerScriptOutStream, environment, - localResources, launchContext.getCommands()); + localResources, launchContext.getCommands(), + new Path(containerLogDirs.get(0))); // /////////// End of writing out container-script @@ -641,6 +642,28 @@ public final void symlink(Path src, Path dst) throws IOException { link(src, dst); } + /** + * Method to copy files that are useful for debugging container failures. + * This method will be called by ContainerExecutor when setting up the + * container launch script. The method should take care to make sure files + * are read-able by the yarn user if the files are to undergo + * log-aggregation. + * @param src path to the source file + * @param dst path to the destination file - should be absolute + * @throws IOException + */ + public abstract void copyDebugInformation(Path src, Path dst) + throws IOException; + + /** + * Method to dump debug information to the a target file. This method will + * be called by ContainerExecutor when setting up the container launch + * script. + * @param output the file to which debug information is to be written + * @throws IOException + */ + public abstract void listDebugInformation(Path output) throws IOException; + @Override public String toString() { return sb.toString(); @@ -704,6 +727,36 @@ protected void mkdir(Path path) { line("mkdir -p ", path.toString()); errorCheck(); } + + @Override + public void copyDebugInformation(Path src, Path dest) throws IOException { + line("# Creating copy of launch script"); + line("cp \"", src.toUri().getPath(), "\" \"", dest.toUri().getPath(), + "\""); + // set permissions to 640 because we need to be able to run + // log aggregation in secure mode as well + if(dest.isAbsolute()) { + line("chmod 640 \"", dest.toUri().getPath(), "\""); + } + } + + @Override + public void listDebugInformation(Path output) throws IOException { + line("# Determining directory contents"); + line("echo \"ls -l:\" 1>\"", output.toString(), "\""); + line("ls -l 1>>\"", output.toString(), "\""); + + // don't run error check because if there are loops + // find will exit with an error causing container launch to fail + // find will follow symlinks outside the work dir if such sylimks exist + // (like public/app local resources) + line("echo \"find -L . -maxdepth 5 -ls:\" 1>>\"", output.toString(), + "\""); + line("find -L . -maxdepth 5 -ls 1>>\"", output.toString(), "\""); + line("echo \"broken symlinks(find -L . -maxdepth 5 -type l -ls):\" 1>>\"", + output.toString(), "\""); + line("find -L . -maxdepth 5 -type l -ls 1>>\"", output.toString(), "\""); + } } private static final class WindowsShellScriptBuilder @@ -757,6 +810,25 @@ protected void mkdir(Path path) throws IOException { path.toString(), path.toString())); errorCheck(); } + + @Override + public void copyDebugInformation(Path src, Path dest) + throws IOException { + // no need to worry about permissions - in secure mode + // WindowsSecureContainerExecutor will set permissions + // to allow NM to read the file + line("rem Creating copy of launch script"); + lineWithLenCheck(String.format("copy \"%s\" \"%s\"", src.toString(), + dest.toString())); + } + + @Override + public void listDebugInformation(Path output) throws IOException { + line("rem Determining directory contents"); + lineWithLenCheck( + String.format("@echo \"dir:\" > \"%s\"", output.toString())); + lineWithLenCheck(String.format("dir >> \"%s\"", output.toString())); + } } private static void putEnvIfNotNull( diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java index 0abae2b09d1..502ecdec666 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java @@ -100,6 +100,7 @@ import org.apache.hadoop.yarn.util.ConverterUtils; import org.apache.hadoop.yarn.util.LinuxResourceCalculatorPlugin; import org.apache.hadoop.yarn.util.ResourceCalculatorPlugin; +import org.hamcrest.CoreMatchers; import org.junit.Assert; import org.junit.Assume; import org.junit.Before; @@ -166,7 +167,9 @@ public void testSpecialCharSymlinks() throws IOException { commands.add("/bin/sh ./\\\"" + badSymlink + "\\\""); } - new DefaultContainerExecutor().writeLaunchEnv(fos, env, resources, commands); + new DefaultContainerExecutor() + .writeLaunchEnv(fos, env, resources, commands, + new Path(localLogDir.getAbsolutePath()), tempFile.getName()); fos.flush(); fos.close(); FileUtil.setExecutable(tempFile, true); @@ -233,7 +236,9 @@ public void testInvalidSymlinkDiagnostics() throws IOException { } else { commands.add("/bin/sh ./\\\"" + symLink + "\\\""); } - new DefaultContainerExecutor().writeLaunchEnv(fos, env, resources, commands); + new DefaultContainerExecutor() + .writeLaunchEnv(fos, env, resources, commands, + new Path(localLogDir.getAbsolutePath())); fos.flush(); fos.close(); FileUtil.setExecutable(tempFile, true); @@ -286,7 +291,9 @@ public void testInvalidEnvSyntaxDiagnostics() throws IOException { "\"workflowName\":\"\n\ninsert table " + "\npartition (cd_education_status)\nselect cd_demo_sk, cd_gender, " ); List commands = new ArrayList(); - new DefaultContainerExecutor().writeLaunchEnv(fos, env, resources, commands); + new DefaultContainerExecutor() + .writeLaunchEnv(fos, env, resources, commands, + new Path(localLogDir.getAbsolutePath())); fos.flush(); fos.close(); @@ -364,7 +371,8 @@ public void testContainerLaunchStdoutAndStderrDiagnostics() throws IOException { List commands = new ArrayList(); commands.add(command); ContainerExecutor exec = new DefaultContainerExecutor(); - exec.writeLaunchEnv(fos, env, resources, commands); + exec.writeLaunchEnv(fos, env, resources, commands, + new Path(localLogDir.getAbsolutePath())); fos.flush(); fos.close(); @@ -994,7 +1002,7 @@ public void testWindowsShellScriptBuilderCommand() throws IOException { "X", Shell.WINDOWS_MAX_SHELL_LENGTH -callCmd.length() + 1))); fail("longCommand was expected to throw"); } catch(IOException e) { - assertThat(e.getMessage(), containsString(expectedMessage)); + assertThat(e.getMessage(), CoreMatchers.containsString(expectedMessage)); } // Composite tests, from parts: less, exact and + @@ -1016,7 +1024,7 @@ public void testWindowsShellScriptBuilderCommand() throws IOException { org.apache.commons.lang.StringUtils.repeat("X", 2048 - callCmd.length()))); fail("long commands was expected to throw"); } catch(IOException e) { - assertThat(e.getMessage(), containsString(expectedMessage)); + assertThat(e.getMessage(), CoreMatchers.containsString(expectedMessage)); } } @@ -1039,7 +1047,7 @@ public void testWindowsShellScriptBuilderEnv() throws IOException { "A", Shell.WINDOWS_MAX_SHELL_LENGTH - ("@set somekey=").length()) + 1); fail("long env was expected to throw"); } catch(IOException e) { - assertThat(e.getMessage(), containsString(expectedMessage)); + assertThat(e.getMessage(), CoreMatchers.containsString(expectedMessage)); } } @@ -1057,14 +1065,14 @@ public void testWindowsShellScriptBuilderMkdir() throws IOException { // test mkdir builder.mkdir(new Path(org.apache.commons.lang.StringUtils.repeat("A", 1024))); - builder.mkdir(new Path(org.apache.commons.lang.StringUtils.repeat( - "E", (Shell.WINDOWS_MAX_SHELL_LENGTH - mkDirCmd.length())/2))); + builder.mkdir(new Path(org.apache.commons.lang.StringUtils.repeat("E", + (Shell.WINDOWS_MAX_SHELL_LENGTH - mkDirCmd.length()) / 2))); try { builder.mkdir(new Path(org.apache.commons.lang.StringUtils.repeat( "X", (Shell.WINDOWS_MAX_SHELL_LENGTH - mkDirCmd.length())/2 +1))); fail("long mkdir was expected to throw"); } catch(IOException e) { - assertThat(e.getMessage(), containsString(expectedMessage)); + assertThat(e.getMessage(), CoreMatchers.containsString(expectedMessage)); } } @@ -1095,7 +1103,7 @@ public void testWindowsShellScriptBuilderLink() throws IOException { "Y", (Shell.WINDOWS_MAX_SHELL_LENGTH - linkCmd.length())/2) + 1)); fail("long link was expected to throw"); } catch(IOException e) { - assertThat(e.getMessage(), containsString(expectedMessage)); + assertThat(e.getMessage(), CoreMatchers.containsString(expectedMessage)); } } @@ -1208,4 +1216,76 @@ public void testKillProcessGroup() throws Exception { Assert.assertEquals(ExitCode.FORCE_KILLED.getExitCode(), containerStatus.getExitStatus()); } + + @Test + public void testDebuggingInformation() throws IOException { + + File shellFile = null; + File tempFile = null; + Configuration conf = new YarnConfiguration(); + try { + shellFile = Shell.appendScriptExtension(tmpDir, "hello"); + tempFile = Shell.appendScriptExtension(tmpDir, "temp"); + String testCommand = Shell.WINDOWS ? "@echo \"hello\"" : + "echo \"hello\""; + PrintWriter writer = new PrintWriter(new FileOutputStream(shellFile)); + FileUtil.setExecutable(shellFile, true); + writer.println(testCommand); + writer.close(); + + Map> resources = new HashMap>(); + Map env = new HashMap(); + List commands = new ArrayList(); + if (Shell.WINDOWS) { + commands.add("cmd"); + commands.add("/c"); + commands.add("\"" + shellFile.getAbsolutePath() + "\""); + } else { + commands.add("/bin/sh \\\"" + shellFile.getAbsolutePath() + "\\\""); + } + + boolean[] debugLogsExistArray = { false, true }; + for (boolean debugLogsExist : debugLogsExistArray) { + + conf.setBoolean(YarnConfiguration.NM_LOG_CONTAINER_DEBUG_INFO, + debugLogsExist); + FileOutputStream fos = new FileOutputStream(tempFile); + ContainerExecutor exec = new DefaultContainerExecutor(); + exec.setConf(conf); + exec.writeLaunchEnv(fos, env, resources, commands, + new Path(localLogDir.getAbsolutePath()), tempFile.getName()); + fos.flush(); + fos.close(); + FileUtil.setExecutable(tempFile, true); + + Shell.ShellCommandExecutor shexc = new Shell.ShellCommandExecutor( + new String[] { tempFile.getAbsolutePath() }, tmpDir); + + shexc.execute(); + assertEquals(shexc.getExitCode(), 0); + File directorInfo = + new File(localLogDir, ContainerExecutor.DIRECTORY_CONTENTS); + File scriptCopy = new File(localLogDir, tempFile.getName()); + + Assert.assertEquals("Directory info file missing", debugLogsExist, + directorInfo.exists()); + Assert.assertEquals("Copy of launch script missing", debugLogsExist, + scriptCopy.exists()); + if (debugLogsExist) { + Assert.assertTrue("Directory info file size is 0", + directorInfo.length() > 0); + Assert.assertTrue("Size of copy of launch script is 0", + scriptCopy.length() > 0); + } + } + } finally { + // cleanup + if (shellFile != null && shellFile.exists()) { + shellFile.delete(); + } + if (tempFile != null && tempFile.exists()) { + tempFile.delete(); + } + } + } }