From 8baf9369bbd2362182df5153a01760193ac424ec Mon Sep 17 00:00:00 2001 From: Gera Shegalov Date: Thu, 24 Dec 2015 22:19:30 -0800 Subject: [PATCH] YARN-2934. Improve handling of container's stderr. (Naganarasimha G R via gera) (cherry picked from commit 2c17b81569c05b5f375d0ddad99d43cb4e3b23b1) --- hadoop-yarn-project/CHANGES.txt | 3 + .../hadoop/yarn/conf/YarnConfiguration.java | 13 +- .../src/main/resources/yarn-default.xml | 23 +++ .../launcher/ContainerLaunch.java | 95 ++++++++++- .../launcher/TestContainerLaunch.java | 150 +++++++++++++++++- 5 files changed, 272 insertions(+), 12 deletions(-) diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index 8b0a92047a3..4cb1f47e7d7 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -28,6 +28,9 @@ Release 2.9.0 - UNRELEASED YARN-4156. TestAMRestart#testAMBlacklistPreventsRestartOnSameNode assumes CapacityScheduler. (Anubhav Dhoot via kasha) + YARN-2934. Improve handling of container's stderr. 
+ (Naganarasimha G R via gera) + Release 2.8.0 - UNRELEASED INCOMPATIBLE CHANGES diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index 66a8110ff41..6a7857ac395 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -1332,7 +1332,18 @@ public class YarnConfiguration extends Configuration { public static final String NM_USER_HOME_DIR = NM_PREFIX + "user-home-dir"; - + + public static final String NM_CONTAINER_STDERR_PATTERN = + NM_PREFIX + "container.stderr.pattern"; + + public static final String DEFAULT_NM_CONTAINER_STDERR_PATTERN = + "{*stderr*,*STDERR*}"; + + public static final String NM_CONTAINER_STDERR_BYTES = + NM_PREFIX + "container.stderr.tail.bytes"; + + public static final long DEFAULT_NM_CONTAINER_STDERR_BYTES = 4 * 1024; + /**The kerberos principal to be used for spnego filter for NM.*/ public static final String NM_WEBAPP_SPNEGO_USER_NAME_KEY = NM_PREFIX + "webapp.spnego-principal"; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index 13a7b1b2f7d..49cced6d807 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -2459,6 +2459,29 @@ org.apache.hadoop.yarn.server.nodemanager.amrmproxy.DefaultRequestInterceptor + + + Error filename pattern, to identify the file in the container's + Log directory which contain the container's error log. 
As error file + redirection is done by the client/AM, YARN is not aware of the error + file name, so it uses this pattern to identify the error file and tail + the error log as diagnostics when the container returns a non-zero exit + code. Filename patterns are case sensitive and should match the + specifications of FileSystem.globStatus(Path) api. If multiple filenames + match the pattern, the most recently modified file will be picked. + + yarn.nodemanager.container.stderr.pattern + {*stderr*,*STDERR*} + + + + + Size of the container error file which needs to be tailed, in bytes. + + yarn.nodemanager.container.stderr.tail.bytes + 4096 + + Enable/disable blacklisting of hosts for AM based on AM failures on those diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java index b7a97d6b1c4..6371b218938 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java @@ -26,6 +26,7 @@ import java.io.File; import java.io.IOException; import java.io.PrintStream; import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.EnumSet; import java.util.HashMap; @@ -38,7 +39,10 @@ import java.util.concurrent.atomic.AtomicBoolean; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; +import
org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileContext; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.fs.LocalDirAllocator; import org.apache.hadoop.fs.Path; @@ -61,6 +65,7 @@ import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.ExitCode; import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.Signal; import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; +import org.apache.hadoop.yarn.server.nodemanager.WindowsSecureContainerExecutor; import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; @@ -71,7 +76,6 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Cont import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState; import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer; import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService; -import org.apache.hadoop.yarn.server.nodemanager.WindowsSecureContainerExecutor; import org.apache.hadoop.yarn.server.nodemanager.executor.ContainerSignalContext; import org.apache.hadoop.yarn.server.nodemanager.executor.ContainerStartContext; import org.apache.hadoop.yarn.server.nodemanager.util.ProcessIdFileReader; @@ -171,6 +175,7 @@ public class ContainerLaunch implements Callable { return 0; } + Path containerLogDir; try { localResources = container.getLocalizedResources(); if (localResources == null) { @@ -186,7 +191,7 @@ public class ContainerLaunch implements Callable { String appIdStr = app.getAppId().toString(); String relativeContainerLogDir = 
ContainerLaunch .getRelativeContainerLogDir(appIdStr, containerIdStr); - Path containerLogDir = + containerLogDir = dirsHandler.getLogPathForWrite(relativeContainerLogDir, false); for (String str : command) { // TODO: Should we instead work via symlinks without this grammar? @@ -334,6 +339,11 @@ public class ContainerLaunch implements Callable { LOG.debug("Container " + containerIdStr + " completed with exit code " + ret); } + + StringBuilder diagnosticInfo = + new StringBuilder("Container exited with a non-zero exit code "); + diagnosticInfo.append(ret); + diagnosticInfo.append(". "); if (ret == ExitCode.FORCE_KILLED.getExitCode() || ret == ExitCode.TERMINATED.getExitCode()) { // If the process was killed, Send container_cleanedup_after_kill and @@ -341,16 +351,13 @@ public class ContainerLaunch implements Callable { dispatcher.getEventHandler().handle( new ContainerExitEvent(containerID, ContainerEventType.CONTAINER_KILLED_ON_REQUEST, ret, - "Container exited with a non-zero exit code " + ret)); + diagnosticInfo.toString())); return ret; } if (ret != 0) { - LOG.warn("Container exited with a non-zero exit code " + ret); - this.dispatcher.getEventHandler().handle(new ContainerExitEvent( - containerID, - ContainerEventType.CONTAINER_EXITED_WITH_FAILURE, ret, - "Container exited with a non-zero exit code " + ret)); + handleContainerExitWithFailure(containerID, ret, containerLogDir, + diagnosticInfo); return ret; } @@ -361,6 +368,78 @@ public class ContainerLaunch implements Callable { return 0; } + /** + * Tries to tail and fetch TAIL_SIZE_IN_BYTES of data from the error log. + * ErrorLog filename is not fixed and depends upon app, hence file name + * pattern is used. 
+   * @param containerID id of the container that exited
+   * @param ret non-zero exit code returned by the container
+   * @param containerLogDir log directory searched for the error file
+   * @param diagnosticInfo message that the error log tail is appended to
+   */
+  @SuppressWarnings("unchecked")
+  private void handleContainerExitWithFailure(ContainerId containerID, int ret,
+      Path containerLogDir, StringBuilder diagnosticInfo) {
+    LOG.warn(diagnosticInfo);
+
+    String errorFileNamePattern =
+        conf.get(YarnConfiguration.NM_CONTAINER_STDERR_PATTERN,
+            YarnConfiguration.DEFAULT_NM_CONTAINER_STDERR_PATTERN);
+    FSDataInputStream errorFileIS = null;
+    try {
+      FileSystem fileSystem = FileSystem.getLocal(conf).getRaw();
+      FileStatus[] errorFileStatuses = fileSystem
+          .globStatus(new Path(containerLogDir, errorFileNamePattern));
+      if (errorFileStatuses != null && errorFileStatuses.length != 0) {
+        long tailSizeInBytes =
+            conf.getLong(YarnConfiguration.NM_CONTAINER_STDERR_BYTES,
+                YarnConfiguration.DEFAULT_NM_CONTAINER_STDERR_BYTES);
+        Path errorFile = errorFileStatuses[0].getPath();
+        long fileSize = errorFileStatuses[0].getLen();
+
+        // if more than one file matches the stderr pattern, take the latest
+        // modified file, and also append the file names in the diagnosticInfo
+        if (errorFileStatuses.length > 1) {
+          String[] errorFileNames = new String[errorFileStatuses.length];
+          long latestModifiedTime = errorFileStatuses[0].getModificationTime();
+          errorFileNames[0] = errorFileStatuses[0].getPath().getName();
+          for (int i = 1; i < errorFileStatuses.length; i++) {
+            errorFileNames[i] = errorFileStatuses[i].getPath().getName();
+            if (errorFileStatuses[i]
+                .getModificationTime() > latestModifiedTime) {
+              latestModifiedTime = errorFileStatuses[i].getModificationTime();
+              errorFile = errorFileStatuses[i].getPath();
+              fileSize = errorFileStatuses[i].getLen();
+            }
+          }
+          diagnosticInfo.append("Error files: ")
+              .append(StringUtils.join(", ", errorFileNames)).append(".\n");
+        }
+
+        // Clamp so a negative/huge configured size cannot break allocation.
+        int bufferSize = (int) Math.max(0L,
+            Math.min(Math.min(fileSize, tailSizeInBytes), Integer.MAX_VALUE));
+        long startPosition = fileSize - bufferSize;
+        byte[] tailBuffer = new byte[bufferSize];
+        errorFileIS = fileSystem.open(errorFile);
+        errorFileIS.readFully(startPosition, tailBuffer);
+
+        diagnosticInfo.append("Last ").append(tailSizeInBytes)
+            .append(" bytes of ").append(errorFile.getName()).append(" :\n")
+            .append(new String(tailBuffer, StandardCharsets.UTF_8));
+      }
+    } catch (IOException e) {
+      LOG.error("Failed to get tail of the container's error log file", e);
+    } finally {
+      IOUtils.cleanup(LOG, errorFileIS);
+    }
+
+    this.dispatcher.getEventHandler()
+        .handle(new ContainerExitEvent(containerID,
+            ContainerEventType.CONTAINER_EXITED_WITH_FAILURE, ret,
+            diagnosticInfo.toString()));
+  }
+
   protected String getPidFileSubpath(String appIdStr, String containerIdStr) {
     return getContainerPrivateDir(appIdStr, containerIdStr) + Path.SEPARATOR
         + String.format(ContainerLaunch.PID_FILE_NAME_FMT, containerIdStr);
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java
index 502ecdec666..aa41de70a17 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java
@@ -21,7 +21,6 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertThat;
 import static org.junit.Assert.fail;
-import static
org.junit.matchers.JUnitMatchers.containsString; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @@ -48,6 +47,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.UnsupportedFileSystemException; +import org.apache.hadoop.security.Credentials; import org.apache.hadoop.security.token.SecretManager.InvalidToken; import org.apache.hadoop.util.Shell; import org.apache.hadoop.util.Shell.ExitCodeException; @@ -81,19 +81,21 @@ import org.apache.hadoop.yarn.security.ContainerTokenIdentifier; import org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus; import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor; import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.ExitCode; -import org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor; import org.apache.hadoop.yarn.server.nodemanager.Context; +import org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor; import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext; import org.apache.hadoop.yarn.server.nodemanager.containermanager.BaseContainerManagerTest; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEvent; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEventType; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerExitEvent; import org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch.ShellScriptBuilder; import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer; +import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService; import 
org.apache.hadoop.yarn.server.nodemanager.security.NMContainerTokenSecretManager; import org.apache.hadoop.yarn.server.nodemanager.security.NMTokenSecretManagerInNM; import org.apache.hadoop.yarn.server.security.ApplicationACLsManager; -import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService; import org.apache.hadoop.yarn.server.utils.BuilderUtils; import org.apache.hadoop.yarn.util.Apps; import org.apache.hadoop.yarn.util.AuxiliaryServiceHelper; @@ -108,6 +110,7 @@ import org.junit.Test; public class TestContainerLaunch extends BaseContainerManagerTest { + private static final String INVALID_JAVA_HOME = "/no/jvm/here"; protected Context distContext = new NMContext(new NMContainerTokenSecretManager( conf), new NMTokenSecretManagerInNM(), null, new ApplicationACLsManager(conf), new NMNullStateStoreService()) { @@ -492,6 +495,147 @@ public class TestContainerLaunch extends BaseContainerManagerTest { } + @Test + public void testErrorLogOnContainerExit() throws Exception { + verifyTailErrorLogOnContainerExit(new Configuration(), "/stderr", false); + } + + @Test + public void testErrorLogOnContainerExitForCase() throws Exception { + verifyTailErrorLogOnContainerExit(new Configuration(), "/STDERR.log", + false); + } + + @Test + public void testErrorLogOnContainerExitForExt() throws Exception { + verifyTailErrorLogOnContainerExit(new Configuration(), "/AppMaster.stderr", + false); + } + + @Test + public void testErrorLogOnContainerExitWithCustomPattern() throws Exception { + Configuration conf = new Configuration(); + conf.setStrings(YarnConfiguration.NM_CONTAINER_STDERR_PATTERN, + "{*stderr*,*log*}"); + verifyTailErrorLogOnContainerExit(conf, "/error.log", false); + } + + @Test + public void testErrorLogOnContainerExitWithMultipleFiles() throws Exception { + Configuration conf = new Configuration(); + conf.setStrings(YarnConfiguration.NM_CONTAINER_STDERR_PATTERN, + "{*stderr*,*stdout*}"); + verifyTailErrorLogOnContainerExit(conf, "/stderr.log", 
true); + } + + private void verifyTailErrorLogOnContainerExit(Configuration conf, + String errorFileName, boolean testForMultipleErrFiles) throws Exception { + Container container = mock(Container.class); + ApplicationId appId = + ApplicationId.newInstance(System.currentTimeMillis(), 1); + ContainerId containerId = ContainerId + .newContainerId(ApplicationAttemptId.newInstance(appId, 1), 1); + when(container.getContainerId()).thenReturn(containerId); + when(container.getUser()).thenReturn("test"); + String relativeContainerLogDir = ContainerLaunch.getRelativeContainerLogDir( + appId.toString(), ConverterUtils.toString(containerId)); + Path containerLogDir = + dirsHandler.getLogPathForWrite(relativeContainerLogDir, false); + + ContainerLaunchContext clc = mock(ContainerLaunchContext.class); + List invalidCommand = new ArrayList(); + invalidCommand.add("$JAVA_HOME/bin/java"); + invalidCommand.add("-Djava.io.tmpdir=$PWD/tmp"); + invalidCommand.add("-Dlog4j.configuration=container-log4j.properties"); + invalidCommand.add("-Dyarn.app.container.log.dir=" + containerLogDir); + invalidCommand.add("-Dyarn.app.container.log.filesize=0"); + invalidCommand.add("-Dhadoop.root.logger=INFO,CLA"); + invalidCommand.add("-Dhadoop.root.logfile=syslog"); + invalidCommand.add("-Xmx1024m"); + invalidCommand.add("org.apache.hadoop.mapreduce.v2.app.MRAppMaster"); + invalidCommand.add("1>" + containerLogDir + "/stdout"); + invalidCommand.add("2>" + containerLogDir + errorFileName); + when(clc.getCommands()).thenReturn(invalidCommand); + + Map userSetEnv = new HashMap(); + userSetEnv.put(Environment.CONTAINER_ID.name(), "user_set_container_id"); + userSetEnv.put("JAVA_HOME", INVALID_JAVA_HOME); + userSetEnv.put(Environment.NM_HOST.name(), "user_set_NM_HOST"); + userSetEnv.put(Environment.NM_PORT.name(), "user_set_NM_PORT"); + userSetEnv.put(Environment.NM_HTTP_PORT.name(), "user_set_NM_HTTP_PORT"); + userSetEnv.put(Environment.LOCAL_DIRS.name(), "user_set_LOCAL_DIR"); + 
userSetEnv.put(Environment.USER.key(), + "user_set_" + Environment.USER.key()); + userSetEnv.put(Environment.LOGNAME.name(), "user_set_LOGNAME"); + userSetEnv.put(Environment.PWD.name(), "user_set_PWD"); + userSetEnv.put(Environment.HOME.name(), "user_set_HOME"); + userSetEnv.put(Environment.CLASSPATH.name(), "APATH"); + when(clc.getEnvironment()).thenReturn(userSetEnv); + when(container.getLaunchContext()).thenReturn(clc); + + when(container.getLocalizedResources()) + .thenReturn(Collections.> emptyMap()); + Dispatcher dispatcher = mock(Dispatcher.class); + + @SuppressWarnings("rawtypes") + ContainerExitHandler eventHandler = + new ContainerExitHandler(testForMultipleErrFiles); + when(dispatcher.getEventHandler()).thenReturn(eventHandler); + + Application app = mock(Application.class); + when(app.getAppId()).thenReturn(appId); + when(app.getUser()).thenReturn("test"); + + Credentials creds = mock(Credentials.class); + when(container.getCredentials()).thenReturn(creds); + + ((NMContext) context).setNodeId(NodeId.newInstance("127.0.0.1", HTTP_PORT)); + + ContainerLaunch launch = new ContainerLaunch(context, conf, dispatcher, + exec, app, container, dirsHandler, containerManager); + launch.call(); + Assert.assertTrue("ContainerExitEvent should have occured", + eventHandler.isContainerExitEventOccured()); + } + + private static class ContainerExitHandler + implements EventHandler { + private boolean testForMultiFile; + + ContainerExitHandler(boolean testForMultiFile) { + this.testForMultiFile = testForMultiFile; + } + + boolean containerExitEventOccured = false; + + public boolean isContainerExitEventOccured() { + return containerExitEventOccured; + } + + public void handle(ContainerEvent event) { + if (event instanceof ContainerExitEvent) { + containerExitEventOccured = true; + ContainerExitEvent exitEvent = (ContainerExitEvent) event; + Assert.assertEquals(ContainerEventType.CONTAINER_EXITED_WITH_FAILURE, + exitEvent.getType()); + LOG.info("Diagnostic Info : " + 
exitEvent.getDiagnosticInfo()); + if (testForMultiFile) { + Assert.assertTrue("Should contain the Multi file information", + exitEvent.getDiagnosticInfo().contains("Error files: ")); + } + Assert.assertTrue( + "Should contain the error Log message with tail size info", + exitEvent.getDiagnosticInfo() + .contains("Last " + + YarnConfiguration.DEFAULT_NM_CONTAINER_STDERR_BYTES + + " bytes of")); + Assert.assertTrue("Should contain contents of error Log", + exitEvent.getDiagnosticInfo().contains( + INVALID_JAVA_HOME + "/bin/java: No such file or directory")); + } + } + } + private static List getJarManifestClasspath(String path) throws Exception { List classpath = new ArrayList();