MAPREDUCE-4300. OOM in AM can turn it into a zombie. (Robert Evans via tgraves)

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1359399 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Thomas Graves 2012-07-09 21:08:40 +00:00
parent 820be7cbef
commit 11782dd3a5
10 changed files with 88 additions and 0 deletions

View File

@ -664,6 +664,9 @@ Release 0.23.3 - UNRELEASED
MAPREDUCE-4402. TestFileInputFormat fails intermittently (Jason Lowe via
bobby)
MAPREDUCE-4300. OOM in AM can turn it into a zombie. (Robert Evans via
tgraves)
Release 0.23.2 - UNRELEASED
INCOMPATIBLE CHANGES

View File

@ -58,6 +58,7 @@
import org.apache.hadoop.security.token.TokenIdentifier;
import org.apache.hadoop.util.DiskChecker.DiskErrorException;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.yarn.YarnUncaughtExceptionHandler;
import org.apache.hadoop.yarn.api.ApplicationConstants;
import org.apache.log4j.LogManager;
@ -71,6 +72,7 @@ class YarnChild {
static volatile TaskAttemptID taskid = null;
public static void main(String[] args) throws Throwable {
Thread.setDefaultUncaughtExceptionHandler(new YarnUncaughtExceptionHandler());
LOG.debug("Child starting");
final JobConf defaultConf = new JobConf();

View File

@ -95,6 +95,7 @@
import org.apache.hadoop.yarn.ClusterInfo;
import org.apache.hadoop.yarn.SystemClock;
import org.apache.hadoop.yarn.YarnException;
import org.apache.hadoop.yarn.YarnUncaughtExceptionHandler;
import org.apache.hadoop.yarn.api.ApplicationConstants;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId;
@ -969,6 +970,7 @@ private static void validateInputParam(String value, String param)
public static void main(String[] args) {
try {
Thread.setDefaultUncaughtExceptionHandler(new YarnUncaughtExceptionHandler());
String containerIdStr =
System.getenv(ApplicationConstants.AM_CONTAINER_ID_ENV);
String nodeHostString = System.getenv(ApplicationConstants.NM_HOST_ENV);

View File

@ -31,6 +31,7 @@
import org.apache.hadoop.util.ShutdownHookManager;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.yarn.YarnException;
import org.apache.hadoop.yarn.YarnUncaughtExceptionHandler;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.event.Dispatcher;
import org.apache.hadoop.yarn.service.CompositeService;
@ -122,6 +123,7 @@ public HistoryClientService getClientService() {
}
public static void main(String[] args) {
Thread.setDefaultUncaughtExceptionHandler(new YarnUncaughtExceptionHandler());
StringUtils.startupShutdownMessage(JobHistoryServer.class, args, LOG);
try {
JobHistoryServer jobHistoryServer = new JobHistoryServer();

View File

@ -187,6 +187,11 @@
<Class name="org.apache.hadoop.yarn.server.resourcemanager.security.DelegationTokenRenewer$DelegationTokenCancelThread" />
<Bug pattern="DM_EXIT" />
</Match>
<Match>
<Class name="org.apache.hadoop.yarn.YarnUncaughtExceptionHandler"/>
<Bug pattern="DM_EXIT"/>
</Match>
<!-- AsyncDispatcher will kill the process if there is an error dispatching -->
<Match>
<Class name="org.apache.hadoop.yarn.event.AsyncDispatcher" />

View File

@ -0,0 +1,66 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn;
import java.lang.Thread.UncaughtExceptionHandler;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.util.ShutdownHookManager;
/**
* This class is intended to be installed by calling
* {@link Thread#setDefaultUncaughtExceptionHandler(UncaughtExceptionHandler)}
* In the main entry point. It is intended to try and cleanly shut down
* programs using the Yarn Event framework.
*
* Note: Right now it only will shut down the program if a Error is caught, but
* not any other exception. Anything else is just logged.
*/
public class YarnUncaughtExceptionHandler implements UncaughtExceptionHandler {
private static final Log LOG = LogFactory.getLog(YarnUncaughtExceptionHandler.class);
@Override
public void uncaughtException(Thread t, Throwable e) {
if(ShutdownHookManager.get().isShutdownInProgress()) {
LOG.error("Thread " + t + " threw an Throwable, but we are shutting " +
"down, so ignoring this", e);
} else if(e instanceof Error) {
try {
LOG.fatal("Thread " + t + " threw an Error. Shutting down now...", e);
} catch (Throwable err) {
//We don't want to not exit because of an issue with logging
}
if(e instanceof OutOfMemoryError) {
//After catching an OOM java says it is undefined behavior, so don't
//even try to clean up or we can get stuck on shutdown.
try {
System.err.println("Halting due to Out Of Memory Error...");
} catch (Throwable err) {
//Again we done want to exit because of logging issues.
}
Runtime.getRuntime().halt(-1);
} else {
System.exit(-1);
}
} else {
LOG.error("Thread " + t + " threw an Exception.", e);
}
}
}

View File

@ -33,6 +33,7 @@
import org.apache.hadoop.util.ShutdownHookManager;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.yarn.YarnException;
import org.apache.hadoop.yarn.YarnUncaughtExceptionHandler;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.NodeHealthStatus;
@ -279,6 +280,7 @@ NodeManager createNewNodeManager() {
}
public static void main(String[] args) {
Thread.setDefaultUncaughtExceptionHandler(new YarnUncaughtExceptionHandler());
StringUtils.startupShutdownMessage(NodeManager.class, args, LOG);
NodeManager nodeManager = new NodeManager();
nodeManager.initAndStartNodeManager(false);

View File

@ -50,6 +50,7 @@
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.security.token.TokenIdentifier;
import org.apache.hadoop.yarn.YarnUncaughtExceptionHandler;
import org.apache.hadoop.yarn.api.records.LocalResource;
import org.apache.hadoop.yarn.exceptions.YarnRemoteException;
import org.apache.hadoop.yarn.factories.RecordFactory;
@ -315,6 +316,7 @@ private LocalizerStatus createStatus() throws InterruptedException {
}
public static void main(String[] argv) throws Throwable {
Thread.setDefaultUncaughtExceptionHandler(new YarnUncaughtExceptionHandler());
// usage: $0 user appId locId host port app_log_dir user_dir [user_dir]*
// let $x = $x/usercache for $local.dir
// MKDIR $x/$user/appcache/$appid

View File

@ -34,6 +34,7 @@
import org.apache.hadoop.util.ShutdownHookManager;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.yarn.YarnException;
import org.apache.hadoop.yarn.YarnUncaughtExceptionHandler;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.NodeId;
@ -622,6 +623,7 @@ public void recover(RMState state) throws Exception {
}
public static void main(String argv[]) {
Thread.setDefaultUncaughtExceptionHandler(new YarnUncaughtExceptionHandler());
StringUtils.startupShutdownMessage(ResourceManager.class, argv, LOG);
try {
Configuration conf = new YarnConfiguration();

View File

@ -27,6 +27,7 @@
import org.apache.hadoop.util.ShutdownHookManager;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.yarn.YarnException;
import org.apache.hadoop.yarn.YarnUncaughtExceptionHandler;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.service.CompositeService;
@ -73,6 +74,7 @@ protected void doSecureLogin(Configuration conf) throws IOException {
}
public static void main(String[] args) {
Thread.setDefaultUncaughtExceptionHandler(new YarnUncaughtExceptionHandler());
StringUtils.startupShutdownMessage(WebAppProxyServer.class, args, LOG);
try {
WebAppProxyServer proxy = new WebAppProxyServer();