YARN-2934. Improve handling of container's stderr. (Naganarasimha G R via gera)

(cherry picked from commit 2c17b81569)
This commit is contained in:
Gera Shegalov 2015-12-24 22:19:30 -08:00
parent c6e0f64582
commit 8baf9369bb
5 changed files with 272 additions and 12 deletions

View File

@ -28,6 +28,9 @@ Release 2.9.0 - UNRELEASED
YARN-4156. TestAMRestart#testAMBlacklistPreventsRestartOnSameNode
assumes CapacityScheduler. (Anubhav Dhoot via kasha)
YARN-2934. Improve handling of container's stderr.
(Naganarasimha G R via gera)
Release 2.8.0 - UNRELEASED
INCOMPATIBLE CHANGES

View File

@ -1332,7 +1332,18 @@ public class YarnConfiguration extends Configuration {
public static final String NM_USER_HOME_DIR =
NM_PREFIX + "user-home-dir";
public static final String NM_CONTAINER_STDERR_PATTERN =
NM_PREFIX + "container.stderr.pattern";
public static final String DEFAULT_NM_CONTAINER_STDERR_PATTERN =
"{*stderr*,*STDERR*}";
public static final String NM_CONTAINER_STDERR_BYTES =
NM_PREFIX + "container.stderr.tail.bytes";
public static final long DEFAULT_NM_CONTAINER_STDERR_BYTES = 4 * 1024;
/**The kerberos principal to be used for spnego filter for NM.*/
public static final String NM_WEBAPP_SPNEGO_USER_NAME_KEY =
NM_PREFIX + "webapp.spnego-principal";

View File

@ -2459,6 +2459,29 @@
<value>org.apache.hadoop.yarn.server.nodemanager.amrmproxy.DefaultRequestInterceptor</value>
</property>
<property>
<description>
Error filename pattern, used to identify the file in the container's
log directory which contains the container's error log. Because error file
redirection is done by the client/AM, YARN is not aware of the error
file name. YARN uses this pattern to identify the error file and tail
the error log as diagnostics when the container execution returns a non-zero
value. Filename patterns are case sensitive and should match the
specifications of the FileSystem.globStatus(Path) API. If multiple filenames
match the pattern, the most recently modified matching file will be picked.
</description>
<name>yarn.nodemanager.container.stderr.pattern</name>
<value>{*stderr*,*STDERR*}</value>
</property>
<property>
<description>
Size of the container error file which needs to be tailed, in bytes.
</description>
<name>yarn.nodemanager.container.stderr.tail.bytes</name>
<value>4096</value>
</property>
<property>
<description>
Enable/disable blacklisting of hosts for AM based on AM failures on those

View File

@ -26,6 +26,7 @@ import java.io.File;
import java.io.IOException;
import java.io.PrintStream;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.EnumSet;
import java.util.HashMap;
@ -38,7 +39,10 @@ import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileContext;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.LocalDirAllocator;
import org.apache.hadoop.fs.Path;
@ -61,6 +65,7 @@ import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.ExitCode;
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.Signal;
import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;
import org.apache.hadoop.yarn.server.nodemanager.WindowsSecureContainerExecutor;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
@ -71,7 +76,6 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Cont
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService;
import org.apache.hadoop.yarn.server.nodemanager.WindowsSecureContainerExecutor;
import org.apache.hadoop.yarn.server.nodemanager.executor.ContainerSignalContext;
import org.apache.hadoop.yarn.server.nodemanager.executor.ContainerStartContext;
import org.apache.hadoop.yarn.server.nodemanager.util.ProcessIdFileReader;
@ -171,6 +175,7 @@ public class ContainerLaunch implements Callable<Integer> {
return 0;
}
Path containerLogDir;
try {
localResources = container.getLocalizedResources();
if (localResources == null) {
@ -186,7 +191,7 @@ public class ContainerLaunch implements Callable<Integer> {
String appIdStr = app.getAppId().toString();
String relativeContainerLogDir = ContainerLaunch
.getRelativeContainerLogDir(appIdStr, containerIdStr);
Path containerLogDir =
containerLogDir =
dirsHandler.getLogPathForWrite(relativeContainerLogDir, false);
for (String str : command) {
// TODO: Should we instead work via symlinks without this grammar?
@ -334,6 +339,11 @@ public class ContainerLaunch implements Callable<Integer> {
LOG.debug("Container " + containerIdStr + " completed with exit code "
+ ret);
}
StringBuilder diagnosticInfo =
new StringBuilder("Container exited with a non-zero exit code ");
diagnosticInfo.append(ret);
diagnosticInfo.append(". ");
if (ret == ExitCode.FORCE_KILLED.getExitCode()
|| ret == ExitCode.TERMINATED.getExitCode()) {
// If the process was killed, Send container_cleanedup_after_kill and
@ -341,16 +351,13 @@ public class ContainerLaunch implements Callable<Integer> {
dispatcher.getEventHandler().handle(
new ContainerExitEvent(containerID,
ContainerEventType.CONTAINER_KILLED_ON_REQUEST, ret,
"Container exited with a non-zero exit code " + ret));
diagnosticInfo.toString()));
return ret;
}
if (ret != 0) {
LOG.warn("Container exited with a non-zero exit code " + ret);
this.dispatcher.getEventHandler().handle(new ContainerExitEvent(
containerID,
ContainerEventType.CONTAINER_EXITED_WITH_FAILURE, ret,
"Container exited with a non-zero exit code " + ret));
handleContainerExitWithFailure(containerID, ret, containerLogDir,
diagnosticInfo);
return ret;
}
@ -361,6 +368,78 @@ public class ContainerLaunch implements Callable<Integer> {
return 0;
}
/**
 * Handles a container that exited with a non-zero exit code.
 *
 * Logs the diagnostics gathered so far, tries to append the tail of the
 * container's error log to them, and finally dispatches a
 * {@link ContainerExitEvent} of type
 * {@code CONTAINER_EXITED_WITH_FAILURE} carrying the diagnostics.
 *
 * The error log file name is not fixed and depends upon the app, hence the
 * glob pattern configured under
 * {@link YarnConfiguration#NM_CONTAINER_STDERR_PATTERN} is used to find it
 * in the container log directory. When several files match, the most
 * recently modified one is tailed and the names of all matches are added to
 * the diagnostics. The number of bytes tailed is configured under
 * {@link YarnConfiguration#NM_CONTAINER_STDERR_BYTES}.
 *
 * An {@link IOException} while locating or reading the error log is logged
 * and does not prevent the exit event from being dispatched.
 *
 * @param containerID id of the container that exited
 * @param ret exit code returned by the container
 * @param containerLogDir log directory of the container, searched for the
 *          error log file
 * @param diagnosticInfo diagnostics collected so far; the error log tail is
 *          appended to it in place
 */
@SuppressWarnings("unchecked")
private void handleContainerExitWithFailure(ContainerId containerID, int ret,
    Path containerLogDir, StringBuilder diagnosticInfo) {
  LOG.warn(diagnosticInfo);

  String errorFileNamePattern =
      conf.get(YarnConfiguration.NM_CONTAINER_STDERR_PATTERN,
          YarnConfiguration.DEFAULT_NM_CONTAINER_STDERR_PATTERN);
  FSDataInputStream errorFileIS = null;
  try {
    FileSystem fileSystem = FileSystem.getLocal(conf).getRaw();
    FileStatus[] errorFileStatuses = fileSystem
        .globStatus(new Path(containerLogDir, errorFileNamePattern));
    if (errorFileStatuses != null && errorFileStatuses.length != 0) {
      long tailSizeInBytes =
          conf.getLong(YarnConfiguration.NM_CONTAINER_STDERR_BYTES,
              YarnConfiguration.DEFAULT_NM_CONTAINER_STDERR_BYTES);

      // If more than one file matches the stderr pattern, take the latest
      // modified file, and also append the file names in the diagnosticInfo.
      // On equal modification times the earlier match is kept.
      FileStatus latest = errorFileStatuses[0];
      if (errorFileStatuses.length > 1) {
        String[] errorFileNames = new String[errorFileStatuses.length];
        for (int i = 0; i < errorFileStatuses.length; i++) {
          FileStatus candidate = errorFileStatuses[i];
          errorFileNames[i] = candidate.getPath().getName();
          if (candidate.getModificationTime() > latest
              .getModificationTime()) {
            latest = candidate;
          }
        }
        diagnosticInfo.append("Error files: ")
            .append(StringUtils.join(", ", errorFileNames)).append(".\n");
      }
      Path errorFile = latest.getPath();
      long fileSize = latest.getLen();

      // Never read more than the file holds, and keep the length within
      // what a byte[] can hold: a negative or oversized configured value
      // must not turn into a negative array size after the int cast.
      long bytesToRead = Math.min(fileSize, tailSizeInBytes);
      bytesToRead = Math.max(0L, Math.min(bytesToRead, Integer.MAX_VALUE));
      long startPosition = fileSize - bytesToRead;
      byte[] tailBuffer = new byte[(int) bytesToRead];

      errorFileIS = fileSystem.open(errorFile);
      errorFileIS.readFully(startPosition, tailBuffer);

      // The message reports the configured tail size, not the number of
      // bytes actually read.
      diagnosticInfo.append("Last ").append(tailSizeInBytes)
          .append(" bytes of ").append(errorFile.getName()).append(" :\n")
          .append(new String(tailBuffer, StandardCharsets.UTF_8));
    }
  } catch (IOException e) {
    LOG.error("Failed to get tail of the container's error log file", e);
  } finally {
    IOUtils.cleanup(LOG, errorFileIS);
  }

  this.dispatcher.getEventHandler()
      .handle(new ContainerExitEvent(containerID,
          ContainerEventType.CONTAINER_EXITED_WITH_FAILURE, ret,
          diagnosticInfo.toString()));
}
protected String getPidFileSubpath(String appIdStr, String containerIdStr) {
return getContainerPrivateDir(appIdStr, containerIdStr) + Path.SEPARATOR
+ String.format(ContainerLaunch.PID_FILE_NAME_FMT, containerIdStr);

View File

@ -21,7 +21,6 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertThat;
import static org.junit.Assert.fail;
import static org.junit.matchers.JUnitMatchers.containsString;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
@ -48,6 +47,7 @@ import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.UnsupportedFileSystemException;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.security.token.SecretManager.InvalidToken;
import org.apache.hadoop.util.Shell;
import org.apache.hadoop.util.Shell.ExitCodeException;
@ -81,19 +81,21 @@ import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
import org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus;
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor;
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.ExitCode;
import org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor;
import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor;
import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.BaseContainerManagerTest;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEvent;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEventType;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerExitEvent;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch.ShellScriptBuilder;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService;
import org.apache.hadoop.yarn.server.nodemanager.security.NMContainerTokenSecretManager;
import org.apache.hadoop.yarn.server.nodemanager.security.NMTokenSecretManagerInNM;
import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService;
import org.apache.hadoop.yarn.server.utils.BuilderUtils;
import org.apache.hadoop.yarn.util.Apps;
import org.apache.hadoop.yarn.util.AuxiliaryServiceHelper;
@ -108,6 +110,7 @@ import org.junit.Test;
public class TestContainerLaunch extends BaseContainerManagerTest {
private static final String INVALID_JAVA_HOME = "/no/jvm/here";
protected Context distContext = new NMContext(new NMContainerTokenSecretManager(
conf), new NMTokenSecretManagerInNM(), null,
new ApplicationACLsManager(conf), new NMNullStateStoreService()) {
@ -492,6 +495,147 @@ public class TestContainerLaunch extends BaseContainerManagerTest {
}
/** Error log written to a file named exactly {@code stderr}. */
@Test
public void testErrorLogOnContainerExit() throws Exception {
  Configuration defaultConf = new Configuration();
  verifyTailErrorLogOnContainerExit(defaultConf, "/stderr", false);
}
/** Error log written to an upper-case {@code STDERR.log} file. */
@Test
public void testErrorLogOnContainerExitForCase() throws Exception {
  Configuration defaultConf = new Configuration();
  verifyTailErrorLogOnContainerExit(defaultConf, "/STDERR.log", false);
}
/** Error log whose name carries {@code stderr} as the file extension. */
@Test
public void testErrorLogOnContainerExitForExt() throws Exception {
  Configuration defaultConf = new Configuration();
  verifyTailErrorLogOnContainerExit(defaultConf, "/AppMaster.stderr", false);
}
/**
 * Error log named {@code error.log}, located through a custom stderr
 * pattern set in the configuration.
 */
@Test
public void testErrorLogOnContainerExitWithCustomPattern() throws Exception {
  Configuration customConf = new Configuration();
  String stderrPattern = "{*stderr*,*log*}";
  customConf.setStrings(YarnConfiguration.NM_CONTAINER_STDERR_PATTERN,
      stderrPattern);
  verifyTailErrorLogOnContainerExit(customConf, "/error.log", false);
}
/**
 * Stderr pattern that also matches the stdout file, so the diagnostics are
 * expected to contain the list of matching error files.
 */
@Test
public void testErrorLogOnContainerExitWithMultipleFiles() throws Exception {
  Configuration multiFileConf = new Configuration();
  String stderrPattern = "{*stderr*,*stdout*}";
  multiFileConf.setStrings(YarnConfiguration.NM_CONTAINER_STDERR_PATTERN,
      stderrPattern);
  verifyTailErrorLogOnContainerExit(multiFileConf, "/stderr.log", true);
}
/**
 * Launches a container whose command points at a non-existent JAVA_HOME,
 * with stderr redirected to {@code errorFileName} inside the container log
 * directory, and checks that a {@link ContainerExitEvent} reached the
 * registered {@link ContainerExitHandler}. The handler itself asserts on the
 * diagnostics carried by the event.
 *
 * @param conf configuration handed to the {@link ContainerLaunch}
 * @param errorFileName name of the stderr file, appended as is to the
 *          container log directory path
 * @param testForMultipleErrFiles whether the handler should also expect the
 *          list of matching error files in the diagnostics
 */
private void verifyTailErrorLogOnContainerExit(Configuration conf,
    String errorFileName, boolean testForMultipleErrFiles) throws Exception {
  Container mockContainer = mock(Container.class);

  // Identity of the application and container being launched.
  ApplicationId applicationId =
      ApplicationId.newInstance(System.currentTimeMillis(), 1);
  ApplicationAttemptId attemptId =
      ApplicationAttemptId.newInstance(applicationId, 1);
  ContainerId cId = ContainerId.newContainerId(attemptId, 1);
  when(mockContainer.getContainerId()).thenReturn(cId);
  when(mockContainer.getUser()).thenReturn("test");

  String relativeLogDir = ContainerLaunch.getRelativeContainerLogDir(
      applicationId.toString(), ConverterUtils.toString(cId));
  Path logDir = dirsHandler.getLogPathForWrite(relativeLogDir, false);

  // Launch command; stdout and stderr are redirected into the log dir.
  List<String> launchCommand = new ArrayList<String>();
  launchCommand.add("$JAVA_HOME/bin/java");
  launchCommand.add("-Djava.io.tmpdir=$PWD/tmp");
  launchCommand.add("-Dlog4j.configuration=container-log4j.properties");
  launchCommand.add("-Dyarn.app.container.log.dir=" + logDir);
  launchCommand.add("-Dyarn.app.container.log.filesize=0");
  launchCommand.add("-Dhadoop.root.logger=INFO,CLA");
  launchCommand.add("-Dhadoop.root.logfile=syslog");
  launchCommand.add("-Xmx1024m");
  launchCommand.add("org.apache.hadoop.mapreduce.v2.app.MRAppMaster");
  launchCommand.add("1>" + logDir + "/stdout");
  launchCommand.add("2>" + logDir + errorFileName);

  // User supplied environment; JAVA_HOME is deliberately invalid.
  Map<String, String> env = new HashMap<String, String>();
  env.put(Environment.CONTAINER_ID.name(), "user_set_container_id");
  env.put("JAVA_HOME", INVALID_JAVA_HOME);
  env.put(Environment.NM_HOST.name(), "user_set_NM_HOST");
  env.put(Environment.NM_PORT.name(), "user_set_NM_PORT");
  env.put(Environment.NM_HTTP_PORT.name(), "user_set_NM_HTTP_PORT");
  env.put(Environment.LOCAL_DIRS.name(), "user_set_LOCAL_DIR");
  env.put(Environment.USER.key(), "user_set_" + Environment.USER.key());
  env.put(Environment.LOGNAME.name(), "user_set_LOGNAME");
  env.put(Environment.PWD.name(), "user_set_PWD");
  env.put(Environment.HOME.name(), "user_set_HOME");
  env.put(Environment.CLASSPATH.name(), "APATH");

  ContainerLaunchContext launchContext = mock(ContainerLaunchContext.class);
  when(launchContext.getCommands()).thenReturn(launchCommand);
  when(launchContext.getEnvironment()).thenReturn(env);
  when(mockContainer.getLaunchContext()).thenReturn(launchContext);
  when(mockContainer.getLocalizedResources())
      .thenReturn(Collections.<Path, List<String>> emptyMap());

  // Dispatcher that routes every event to the asserting handler.
  Dispatcher mockDispatcher = mock(Dispatcher.class);
  @SuppressWarnings("rawtypes")
  ContainerExitHandler exitHandler =
      new ContainerExitHandler(testForMultipleErrFiles);
  when(mockDispatcher.getEventHandler()).thenReturn(exitHandler);

  Application mockApp = mock(Application.class);
  when(mockApp.getAppId()).thenReturn(applicationId);
  when(mockApp.getUser()).thenReturn("test");

  Credentials credentials = mock(Credentials.class);
  when(mockContainer.getCredentials()).thenReturn(credentials);

  ((NMContext) context).setNodeId(NodeId.newInstance("127.0.0.1", HTTP_PORT));

  ContainerLaunch containerLaunch = new ContainerLaunch(context, conf,
      mockDispatcher, exec, mockApp, mockContainer, dirsHandler,
      containerManager);
  containerLaunch.call();

  Assert.assertTrue("ContainerExitEvent should have occured",
      exitHandler.isContainerExitEventOccured());
}
private static class ContainerExitHandler
implements EventHandler<ContainerEvent> {
private boolean testForMultiFile;
ContainerExitHandler(boolean testForMultiFile) {
this.testForMultiFile = testForMultiFile;
}
boolean containerExitEventOccured = false;
public boolean isContainerExitEventOccured() {
return containerExitEventOccured;
}
public void handle(ContainerEvent event) {
if (event instanceof ContainerExitEvent) {
containerExitEventOccured = true;
ContainerExitEvent exitEvent = (ContainerExitEvent) event;
Assert.assertEquals(ContainerEventType.CONTAINER_EXITED_WITH_FAILURE,
exitEvent.getType());
LOG.info("Diagnostic Info : " + exitEvent.getDiagnosticInfo());
if (testForMultiFile) {
Assert.assertTrue("Should contain the Multi file information",
exitEvent.getDiagnosticInfo().contains("Error files: "));
}
Assert.assertTrue(
"Should contain the error Log message with tail size info",
exitEvent.getDiagnosticInfo()
.contains("Last "
+ YarnConfiguration.DEFAULT_NM_CONTAINER_STDERR_BYTES
+ " bytes of"));
Assert.assertTrue("Should contain contents of error Log",
exitEvent.getDiagnosticInfo().contains(
INVALID_JAVA_HOME + "/bin/java: No such file or directory"));
}
}
}
private static List<String> getJarManifestClasspath(String path)
throws Exception {
List<String> classpath = new ArrayList<String>();