diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt
index 8b0a92047a3..4cb1f47e7d7 100644
--- a/hadoop-yarn-project/CHANGES.txt
+++ b/hadoop-yarn-project/CHANGES.txt
@@ -28,6 +28,9 @@ Release 2.9.0 - UNRELEASED
YARN-4156. TestAMRestart#testAMBlacklistPreventsRestartOnSameNode
assumes CapacityScheduler. (Anubhav Dhoot via kasha)
+ YARN-2934. Improve handling of container's stderr.
+ (Naganarasimha G R via gera)
+
Release 2.8.0 - UNRELEASED
INCOMPATIBLE CHANGES
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
index 66a8110ff41..6a7857ac395 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
@@ -1332,7 +1332,18 @@ private static void addDeprecatedKeys() {
public static final String NM_USER_HOME_DIR =
NM_PREFIX + "user-home-dir";
-
+ /**
+ * Configuration key for the glob pattern used to locate a container's
+ * error log file inside the container's log directory.
+ */
+ public static final String NM_CONTAINER_STDERR_PATTERN =
+ NM_PREFIX + "container.stderr.pattern";
+ /** Default pattern: file names containing "stderr" or "STDERR". */
+ public static final String DEFAULT_NM_CONTAINER_STDERR_PATTERN =
+ "{*stderr*,*STDERR*}";
+ /**
+ * Configuration key for the number of bytes to tail from the container's
+ * error log file.
+ */
+ public static final String NM_CONTAINER_STDERR_BYTES =
+ NM_PREFIX + "container.stderr.tail.bytes";
+ /** Default tail size in bytes (4096). */
+ public static final long DEFAULT_NM_CONTAINER_STDERR_BYTES = 4 * 1024;
+
/**The kerberos principal to be used for spnego filter for NM.*/
public static final String NM_WEBAPP_SPNEGO_USER_NAME_KEY =
NM_PREFIX + "webapp.spnego-principal";
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
index 13a7b1b2f7d..49cced6d807 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
@@ -2459,6 +2459,29 @@
org.apache.hadoop.yarn.server.nodemanager.amrmproxy.DefaultRequestInterceptor
+  <property>
+    <description>
+    Error filename pattern, used to identify the file in the container's
+    log directory which contains the container's error log. Error file
+    redirection is done by the client/AM, so YARN is not aware of the error
+    file name. YARN uses this pattern to identify the error file and tail
+    the error log as diagnostics when the container execution returns a
+    non-zero value. Filename patterns are case sensitive and should match the
+    specifications of the FileSystem.globStatus(Path) API. If multiple files
+    match the pattern, the most recently modified file will be picked.
+    </description>
+    <name>yarn.nodemanager.container.stderr.pattern</name>
+    <value>{*stderr*,*STDERR*}</value>
+  </property>
+
+  <property>
+    <description>
+    Number of bytes to tail from the end of the container's error file.
+    </description>
+    <name>yarn.nodemanager.container.stderr.tail.bytes</name>
+    <value>4096</value>
+  </property>
+
Enable/disable blacklisting of hosts for AM based on AM failures on those
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java
index b7a97d6b1c4..6371b218938 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java
@@ -26,6 +26,7 @@
import java.io.IOException;
import java.io.PrintStream;
import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.EnumSet;
import java.util.HashMap;
@@ -38,7 +39,10 @@
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileContext;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.LocalDirAllocator;
import org.apache.hadoop.fs.Path;
@@ -61,6 +65,7 @@
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.Signal;
import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;
+import org.apache.hadoop.yarn.server.nodemanager.WindowsSecureContainerExecutor;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
@@ -71,7 +76,6 @@
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService;
-import org.apache.hadoop.yarn.server.nodemanager.WindowsSecureContainerExecutor;
import org.apache.hadoop.yarn.server.nodemanager.executor.ContainerSignalContext;
import org.apache.hadoop.yarn.server.nodemanager.executor.ContainerStartContext;
import org.apache.hadoop.yarn.server.nodemanager.util.ProcessIdFileReader;
@@ -171,6 +175,7 @@ public Integer call() {
return 0;
}
+ Path containerLogDir;
try {
localResources = container.getLocalizedResources();
if (localResources == null) {
@@ -186,7 +191,7 @@ public Integer call() {
String appIdStr = app.getAppId().toString();
String relativeContainerLogDir = ContainerLaunch
.getRelativeContainerLogDir(appIdStr, containerIdStr);
- Path containerLogDir =
+ containerLogDir =
dirsHandler.getLogPathForWrite(relativeContainerLogDir, false);
for (String str : command) {
// TODO: Should we instead work via symlinks without this grammar?
@@ -334,6 +339,11 @@ public Integer call() {
LOG.debug("Container " + containerIdStr + " completed with exit code "
+ ret);
}
+
+ StringBuilder diagnosticInfo =
+ new StringBuilder("Container exited with a non-zero exit code ");
+ diagnosticInfo.append(ret);
+ diagnosticInfo.append(". ");
if (ret == ExitCode.FORCE_KILLED.getExitCode()
|| ret == ExitCode.TERMINATED.getExitCode()) {
// If the process was killed, Send container_cleanedup_after_kill and
@@ -341,16 +351,13 @@ public Integer call() {
dispatcher.getEventHandler().handle(
new ContainerExitEvent(containerID,
ContainerEventType.CONTAINER_KILLED_ON_REQUEST, ret,
- "Container exited with a non-zero exit code " + ret));
+ diagnosticInfo.toString()));
return ret;
}
if (ret != 0) {
- LOG.warn("Container exited with a non-zero exit code " + ret);
- this.dispatcher.getEventHandler().handle(new ContainerExitEvent(
- containerID,
- ContainerEventType.CONTAINER_EXITED_WITH_FAILURE, ret,
- "Container exited with a non-zero exit code " + ret));
+ handleContainerExitWithFailure(containerID, ret, containerLogDir,
+ diagnosticInfo);
return ret;
}
@@ -361,6 +368,78 @@ public Integer call() {
return 0;
}
+  /**
+   * Reports a container that exited with a non-zero exit code, appending the
+   * tail of its error log to the diagnostics when such a log can be found.
+   * The error log file name is chosen by the application, so it is located
+   * with the glob pattern configured under
+   * {@link YarnConfiguration#NM_CONTAINER_STDERR_PATTERN}; at most
+   * {@link YarnConfiguration#NM_CONTAINER_STDERR_BYTES} bytes are read from
+   * its end. When several files match, the most recently modified one is
+   * tailed and all matching names are listed in the diagnostics.
+   * Failure to read the log is logged and never prevents the
+   * {@code CONTAINER_EXITED_WITH_FAILURE} event from being dispatched.
+   *
+   * @param containerID id of the container that exited
+   * @param ret the non-zero exit code of the container
+   * @param containerLogDir directory searched for the error log file
+   * @param diagnosticInfo diagnostics gathered so far; the error log tail is
+   *          appended to it before it is sent with the exit event
+   */
+  @SuppressWarnings("unchecked")
+  private void handleContainerExitWithFailure(ContainerId containerID, int ret,
+      Path containerLogDir, StringBuilder diagnosticInfo) {
+    LOG.warn(diagnosticInfo);
+
+    String errorFileNamePattern =
+        conf.get(YarnConfiguration.NM_CONTAINER_STDERR_PATTERN,
+            YarnConfiguration.DEFAULT_NM_CONTAINER_STDERR_PATTERN);
+    FSDataInputStream errorFileIS = null;
+    try {
+      FileSystem fileSystem = FileSystem.getLocal(conf).getRaw();
+      FileStatus[] errorFileStatuses = fileSystem
+          .globStatus(new Path(containerLogDir, errorFileNamePattern));
+      if (errorFileStatuses != null && errorFileStatuses.length != 0) {
+        // A negative configured size would otherwise produce a negative array
+        // length; the resulting unchecked exception would skip the exit event.
+        long tailSizeInBytes = Math.max(0L,
+            conf.getLong(YarnConfiguration.NM_CONTAINER_STDERR_BYTES,
+                YarnConfiguration.DEFAULT_NM_CONTAINER_STDERR_BYTES));
+        FileStatus errorFileStatus = errorFileStatuses[0];
+
+        // if more than one file matches the stderr pattern, take the latest
+        // modified file, and also append the file names in the diagnosticInfo
+        if (errorFileStatuses.length > 1) {
+          String[] errorFileNames = new String[errorFileStatuses.length];
+          for (int i = 0; i < errorFileStatuses.length; i++) {
+            FileStatus candidate = errorFileStatuses[i];
+            errorFileNames[i] = candidate.getPath().getName();
+            // strictly newer only, so the first match wins on ties
+            if (candidate.getModificationTime() > errorFileStatus
+                .getModificationTime()) {
+              errorFileStatus = candidate;
+            }
+          }
+          diagnosticInfo.append("Error files: ")
+              .append(StringUtils.join(", ", errorFileNames)).append(".\n");
+        }
+
+        Path errorFile = errorFileStatus.getPath();
+        long fileSize = errorFileStatus.getLen();
+        // Never read more than the file holds, nor more than fits in an array.
+        long bytesToRead = Math.min(Math.min(fileSize, tailSizeInBytes),
+            Integer.MAX_VALUE);
+        long startPosition = fileSize - bytesToRead;
+        byte[] tailBuffer = new byte[(int) bytesToRead];
+        errorFileIS = fileSystem.open(errorFile);
+        errorFileIS.readFully(startPosition, tailBuffer);
+
+        diagnosticInfo.append("Last ").append(tailSizeInBytes)
+            .append(" bytes of ").append(errorFile.getName()).append(" :\n")
+            .append(new String(tailBuffer, StandardCharsets.UTF_8));
+      }
+    } catch (IOException e) {
+      LOG.error("Failed to get tail of the container's error log file", e);
+    } finally {
+      IOUtils.cleanup(LOG, errorFileIS);
+    }
+
+    this.dispatcher.getEventHandler()
+        .handle(new ContainerExitEvent(containerID,
+            ContainerEventType.CONTAINER_EXITED_WITH_FAILURE, ret,
+            diagnosticInfo.toString()));
+  }
+
+
protected String getPidFileSubpath(String appIdStr, String containerIdStr) {
return getContainerPrivateDir(appIdStr, containerIdStr) + Path.SEPARATOR
+ String.format(ContainerLaunch.PID_FILE_NAME_FMT, containerIdStr);
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java
index 502ecdec666..aa41de70a17 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java
@@ -21,7 +21,6 @@
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertThat;
import static org.junit.Assert.fail;
-import static org.junit.matchers.JUnitMatchers.containsString;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
@@ -48,6 +47,7 @@
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.UnsupportedFileSystemException;
+import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.security.token.SecretManager.InvalidToken;
import org.apache.hadoop.util.Shell;
import org.apache.hadoop.util.Shell.ExitCodeException;
@@ -81,19 +81,21 @@
import org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus;
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor;
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.ExitCode;
-import org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor;
import org.apache.hadoop.yarn.server.nodemanager.Context;
+import org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor;
import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.BaseContainerManagerTest;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEvent;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEventType;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerExitEvent;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch.ShellScriptBuilder;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer;
+import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService;
import org.apache.hadoop.yarn.server.nodemanager.security.NMContainerTokenSecretManager;
import org.apache.hadoop.yarn.server.nodemanager.security.NMTokenSecretManagerInNM;
import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
-import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService;
import org.apache.hadoop.yarn.server.utils.BuilderUtils;
import org.apache.hadoop.yarn.util.Apps;
import org.apache.hadoop.yarn.util.AuxiliaryServiceHelper;
@@ -108,6 +110,7 @@
public class TestContainerLaunch extends BaseContainerManagerTest {
+ private static final String INVALID_JAVA_HOME = "/no/jvm/here"; // used as JAVA_HOME so the launch command fails
protected Context distContext = new NMContext(new NMContainerTokenSecretManager(
conf), new NMTokenSecretManagerInNM(), null,
new ApplicationACLsManager(conf), new NMNullStateStoreService()) {
@@ -492,6 +495,147 @@ public void handle(Event event) {
}
+ @Test
+ public void testErrorLogOnContainerExit() throws Exception {
+ verifyTailErrorLogOnContainerExit(new Configuration(), "/stderr", false);
+ }
+
+ @Test
+ public void testErrorLogOnContainerExitForCase() throws Exception {
+ verifyTailErrorLogOnContainerExit(new Configuration(), "/STDERR.log",
+ false);
+ }
+
+ @Test
+ public void testErrorLogOnContainerExitForExt() throws Exception {
+ verifyTailErrorLogOnContainerExit(new Configuration(), "/AppMaster.stderr",
+ false);
+ }
+
+ @Test
+ public void testErrorLogOnContainerExitWithCustomPattern() throws Exception {
+ Configuration conf = new Configuration();
+ conf.setStrings(YarnConfiguration.NM_CONTAINER_STDERR_PATTERN,
+ "{*stderr*,*log*}");
+ verifyTailErrorLogOnContainerExit(conf, "/error.log", false);
+ }
+
+  /**
+   * Pattern that matches both the stdout and the stderr file, so the
+   * diagnostics are expected to list the matching error files.
+   */
+  @Test
+  public void testErrorLogOnContainerExitWithMultipleFiles() throws Exception {
+    // Named customConf to avoid shadowing the class-level conf.
+    Configuration customConf = new Configuration();
+    // The glob is one string value (its comma belongs to the glob syntax),
+    // so the plain setter is used rather than the list-valued setStrings.
+    customConf.set(YarnConfiguration.NM_CONTAINER_STDERR_PATTERN,
+        "{*stderr*,*stdout*}");
+    verifyTailErrorLogOnContainerExit(customConf, "/stderr.log", true);
+  }
+
+ private void verifyTailErrorLogOnContainerExit(Configuration conf,
+ String errorFileName, boolean testForMultipleErrFiles) throws Exception {
+ Container container = mock(Container.class);
+ ApplicationId appId =
+ ApplicationId.newInstance(System.currentTimeMillis(), 1);
+ ContainerId containerId = ContainerId
+ .newContainerId(ApplicationAttemptId.newInstance(appId, 1), 1);
+ when(container.getContainerId()).thenReturn(containerId);
+ when(container.getUser()).thenReturn("test");
+ String relativeContainerLogDir = ContainerLaunch.getRelativeContainerLogDir(
+ appId.toString(), ConverterUtils.toString(containerId));
+ Path containerLogDir =
+ dirsHandler.getLogPathForWrite(relativeContainerLogDir, false);
+
+ ContainerLaunchContext clc = mock(ContainerLaunchContext.class);
+ List invalidCommand = new ArrayList();
+ invalidCommand.add("$JAVA_HOME/bin/java");
+ invalidCommand.add("-Djava.io.tmpdir=$PWD/tmp");
+ invalidCommand.add("-Dlog4j.configuration=container-log4j.properties");
+ invalidCommand.add("-Dyarn.app.container.log.dir=" + containerLogDir);
+ invalidCommand.add("-Dyarn.app.container.log.filesize=0");
+ invalidCommand.add("-Dhadoop.root.logger=INFO,CLA");
+ invalidCommand.add("-Dhadoop.root.logfile=syslog");
+ invalidCommand.add("-Xmx1024m");
+ invalidCommand.add("org.apache.hadoop.mapreduce.v2.app.MRAppMaster");
+ invalidCommand.add("1>" + containerLogDir + "/stdout");
+ invalidCommand.add("2>" + containerLogDir + errorFileName);
+ when(clc.getCommands()).thenReturn(invalidCommand);
+
+ Map userSetEnv = new HashMap();
+ userSetEnv.put(Environment.CONTAINER_ID.name(), "user_set_container_id");
+ userSetEnv.put("JAVA_HOME", INVALID_JAVA_HOME);
+ userSetEnv.put(Environment.NM_HOST.name(), "user_set_NM_HOST");
+ userSetEnv.put(Environment.NM_PORT.name(), "user_set_NM_PORT");
+ userSetEnv.put(Environment.NM_HTTP_PORT.name(), "user_set_NM_HTTP_PORT");
+ userSetEnv.put(Environment.LOCAL_DIRS.name(), "user_set_LOCAL_DIR");
+ userSetEnv.put(Environment.USER.key(),
+ "user_set_" + Environment.USER.key());
+ userSetEnv.put(Environment.LOGNAME.name(), "user_set_LOGNAME");
+ userSetEnv.put(Environment.PWD.name(), "user_set_PWD");
+ userSetEnv.put(Environment.HOME.name(), "user_set_HOME");
+ userSetEnv.put(Environment.CLASSPATH.name(), "APATH");
+ when(clc.getEnvironment()).thenReturn(userSetEnv);
+ when(container.getLaunchContext()).thenReturn(clc);
+
+ when(container.getLocalizedResources())
+ .thenReturn(Collections.> emptyMap());
+ Dispatcher dispatcher = mock(Dispatcher.class);
+
+ @SuppressWarnings("rawtypes")
+ ContainerExitHandler eventHandler =
+ new ContainerExitHandler(testForMultipleErrFiles);
+ when(dispatcher.getEventHandler()).thenReturn(eventHandler);
+
+ Application app = mock(Application.class);
+ when(app.getAppId()).thenReturn(appId);
+ when(app.getUser()).thenReturn("test");
+
+ Credentials creds = mock(Credentials.class);
+ when(container.getCredentials()).thenReturn(creds);
+
+ ((NMContext) context).setNodeId(NodeId.newInstance("127.0.0.1", HTTP_PORT));
+
+ ContainerLaunch launch = new ContainerLaunch(context, conf, dispatcher,
+ exec, app, container, dirsHandler, containerManager);
+ launch.call();
+ Assert.assertTrue("ContainerExitEvent should have occured",
+ eventHandler.isContainerExitEventOccured());
+ }
+
+ private static class ContainerExitHandler
+ implements EventHandler {
+ private boolean testForMultiFile;
+
+ ContainerExitHandler(boolean testForMultiFile) {
+ this.testForMultiFile = testForMultiFile;
+ }
+
+ boolean containerExitEventOccured = false;
+
+ public boolean isContainerExitEventOccured() {
+ return containerExitEventOccured;
+ }
+
+ public void handle(ContainerEvent event) {
+ if (event instanceof ContainerExitEvent) {
+ containerExitEventOccured = true;
+ ContainerExitEvent exitEvent = (ContainerExitEvent) event;
+ Assert.assertEquals(ContainerEventType.CONTAINER_EXITED_WITH_FAILURE,
+ exitEvent.getType());
+ LOG.info("Diagnostic Info : " + exitEvent.getDiagnosticInfo());
+ if (testForMultiFile) {
+ Assert.assertTrue("Should contain the Multi file information",
+ exitEvent.getDiagnosticInfo().contains("Error files: "));
+ }
+ Assert.assertTrue(
+ "Should contain the error Log message with tail size info",
+ exitEvent.getDiagnosticInfo()
+ .contains("Last "
+ + YarnConfiguration.DEFAULT_NM_CONTAINER_STDERR_BYTES
+ + " bytes of"));
+ Assert.assertTrue("Should contain contents of error Log",
+ exitEvent.getDiagnosticInfo().contains(
+ INVALID_JAVA_HOME + "/bin/java: No such file or directory"));
+ }
+ }
+ }
+
private static List getJarManifestClasspath(String path)
throws Exception {
List classpath = new ArrayList();