MAPREDUCE-4300. OOM in AM can turn it into a zombie. (Robert Evans via tgraves)
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1359399 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
820be7cbef
commit
11782dd3a5
|
@ -664,6 +664,9 @@ Release 0.23.3 - UNRELEASED
|
|||
MAPREDUCE-4402. TestFileInputFormat fails intermittently (Jason Lowe via
|
||||
bobby)
|
||||
|
||||
MAPREDUCE-4300. OOM in AM can turn it into a zombie. (Robert Evans via
|
||||
tgraves)
|
||||
|
||||
Release 0.23.2 - UNRELEASED
|
||||
|
||||
INCOMPATIBLE CHANGES
|
||||
|
|
|
@ -58,6 +58,7 @@ import org.apache.hadoop.security.token.Token;
|
|||
import org.apache.hadoop.security.token.TokenIdentifier;
|
||||
import org.apache.hadoop.util.DiskChecker.DiskErrorException;
|
||||
import org.apache.hadoop.util.StringUtils;
|
||||
import org.apache.hadoop.yarn.YarnUncaughtExceptionHandler;
|
||||
import org.apache.hadoop.yarn.api.ApplicationConstants;
|
||||
import org.apache.log4j.LogManager;
|
||||
|
||||
|
@ -71,6 +72,7 @@ class YarnChild {
|
|||
static volatile TaskAttemptID taskid = null;
|
||||
|
||||
public static void main(String[] args) throws Throwable {
|
||||
Thread.setDefaultUncaughtExceptionHandler(new YarnUncaughtExceptionHandler());
|
||||
LOG.debug("Child starting");
|
||||
|
||||
final JobConf defaultConf = new JobConf();
|
||||
|
|
|
@ -95,6 +95,7 @@ import org.apache.hadoop.yarn.Clock;
|
|||
import org.apache.hadoop.yarn.ClusterInfo;
|
||||
import org.apache.hadoop.yarn.SystemClock;
|
||||
import org.apache.hadoop.yarn.YarnException;
|
||||
import org.apache.hadoop.yarn.YarnUncaughtExceptionHandler;
|
||||
import org.apache.hadoop.yarn.api.ApplicationConstants;
|
||||
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
|
||||
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
||||
|
@ -969,6 +970,7 @@ public class MRAppMaster extends CompositeService {
|
|||
|
||||
public static void main(String[] args) {
|
||||
try {
|
||||
Thread.setDefaultUncaughtExceptionHandler(new YarnUncaughtExceptionHandler());
|
||||
String containerIdStr =
|
||||
System.getenv(ApplicationConstants.AM_CONTAINER_ID_ENV);
|
||||
String nodeHostString = System.getenv(ApplicationConstants.NM_HOST_ENV);
|
||||
|
|
|
@ -31,6 +31,7 @@ import org.apache.hadoop.security.SecurityUtil;
|
|||
import org.apache.hadoop.util.ShutdownHookManager;
|
||||
import org.apache.hadoop.util.StringUtils;
|
||||
import org.apache.hadoop.yarn.YarnException;
|
||||
import org.apache.hadoop.yarn.YarnUncaughtExceptionHandler;
|
||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||
import org.apache.hadoop.yarn.event.Dispatcher;
|
||||
import org.apache.hadoop.yarn.service.CompositeService;
|
||||
|
@ -122,6 +123,7 @@ public class JobHistoryServer extends CompositeService {
|
|||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
Thread.setDefaultUncaughtExceptionHandler(new YarnUncaughtExceptionHandler());
|
||||
StringUtils.startupShutdownMessage(JobHistoryServer.class, args, LOG);
|
||||
try {
|
||||
JobHistoryServer jobHistoryServer = new JobHistoryServer();
|
||||
|
|
|
@ -187,6 +187,11 @@
|
|||
<Class name="org.apache.hadoop.yarn.server.resourcemanager.security.DelegationTokenRenewer$DelegationTokenCancelThread" />
|
||||
<Bug pattern="DM_EXIT" />
|
||||
</Match>
|
||||
<!-- YarnUncaughtExceptionHandler deliberately calls System.exit when a thread dies with an Error -->
<Match>
|
||||
<Class name="org.apache.hadoop.yarn.YarnUncaughtExceptionHandler"/>
|
||||
<Bug pattern="DM_EXIT"/>
|
||||
</Match>
|
||||
|
||||
<!-- AsyncDispatcher will kill the process if there is an error dispatching -->
|
||||
<Match>
|
||||
<Class name="org.apache.hadoop.yarn.event.AsyncDispatcher" />
|
||||
|
|
|
@ -0,0 +1,66 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn;
|
||||
|
||||
import java.lang.Thread.UncaughtExceptionHandler;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.util.ShutdownHookManager;
|
||||
|
||||
/**
|
||||
* This class is intended to be installed by calling
|
||||
* {@link Thread#setDefaultUncaughtExceptionHandler(UncaughtExceptionHandler)}
|
||||
* In the main entry point. It is intended to try and cleanly shut down
|
||||
* programs using the Yarn Event framework.
|
||||
*
|
||||
* Note: Right now it only will shut down the program if a Error is caught, but
|
||||
* not any other exception. Anything else is just logged.
|
||||
*/
|
||||
public class YarnUncaughtExceptionHandler implements UncaughtExceptionHandler {
|
||||
private static final Log LOG = LogFactory.getLog(YarnUncaughtExceptionHandler.class);
|
||||
|
||||
@Override
|
||||
public void uncaughtException(Thread t, Throwable e) {
|
||||
if(ShutdownHookManager.get().isShutdownInProgress()) {
|
||||
LOG.error("Thread " + t + " threw an Throwable, but we are shutting " +
|
||||
"down, so ignoring this", e);
|
||||
} else if(e instanceof Error) {
|
||||
try {
|
||||
LOG.fatal("Thread " + t + " threw an Error. Shutting down now...", e);
|
||||
} catch (Throwable err) {
|
||||
//We don't want to not exit because of an issue with logging
|
||||
}
|
||||
if(e instanceof OutOfMemoryError) {
|
||||
//After catching an OOM java says it is undefined behavior, so don't
|
||||
//even try to clean up or we can get stuck on shutdown.
|
||||
try {
|
||||
System.err.println("Halting due to Out Of Memory Error...");
|
||||
} catch (Throwable err) {
|
||||
//Again we done want to exit because of logging issues.
|
||||
}
|
||||
Runtime.getRuntime().halt(-1);
|
||||
} else {
|
||||
System.exit(-1);
|
||||
}
|
||||
} else {
|
||||
LOG.error("Thread " + t + " threw an Exception.", e);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -33,6 +33,7 @@ import org.apache.hadoop.util.ReflectionUtils;
|
|||
import org.apache.hadoop.util.ShutdownHookManager;
|
||||
import org.apache.hadoop.util.StringUtils;
|
||||
import org.apache.hadoop.yarn.YarnException;
|
||||
import org.apache.hadoop.yarn.YarnUncaughtExceptionHandler;
|
||||
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
||||
import org.apache.hadoop.yarn.api.records.ContainerId;
|
||||
import org.apache.hadoop.yarn.api.records.NodeHealthStatus;
|
||||
|
@ -279,6 +280,7 @@ public class NodeManager extends CompositeService implements
|
|||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
Thread.setDefaultUncaughtExceptionHandler(new YarnUncaughtExceptionHandler());
|
||||
StringUtils.startupShutdownMessage(NodeManager.class, args, LOG);
|
||||
NodeManager nodeManager = new NodeManager();
|
||||
nodeManager.initAndStartNodeManager(false);
|
||||
|
|
|
@ -50,6 +50,7 @@ import org.apache.hadoop.security.Credentials;
|
|||
import org.apache.hadoop.security.UserGroupInformation;
|
||||
import org.apache.hadoop.security.token.Token;
|
||||
import org.apache.hadoop.security.token.TokenIdentifier;
|
||||
import org.apache.hadoop.yarn.YarnUncaughtExceptionHandler;
|
||||
import org.apache.hadoop.yarn.api.records.LocalResource;
|
||||
import org.apache.hadoop.yarn.exceptions.YarnRemoteException;
|
||||
import org.apache.hadoop.yarn.factories.RecordFactory;
|
||||
|
@ -315,6 +316,7 @@ public class ContainerLocalizer {
|
|||
}
|
||||
|
||||
public static void main(String[] argv) throws Throwable {
|
||||
Thread.setDefaultUncaughtExceptionHandler(new YarnUncaughtExceptionHandler());
|
||||
// usage: $0 user appId locId host port app_log_dir user_dir [user_dir]*
|
||||
// let $x = $x/usercache for $local.dir
|
||||
// MKDIR $x/$user/appcache/$appid
|
||||
|
|
|
@ -34,6 +34,7 @@ import org.apache.hadoop.util.ReflectionUtils;
|
|||
import org.apache.hadoop.util.ShutdownHookManager;
|
||||
import org.apache.hadoop.util.StringUtils;
|
||||
import org.apache.hadoop.yarn.YarnException;
|
||||
import org.apache.hadoop.yarn.YarnUncaughtExceptionHandler;
|
||||
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
|
||||
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
||||
import org.apache.hadoop.yarn.api.records.NodeId;
|
||||
|
@ -622,6 +623,7 @@ public class ResourceManager extends CompositeService implements Recoverable {
|
|||
}
|
||||
|
||||
public static void main(String argv[]) {
|
||||
Thread.setDefaultUncaughtExceptionHandler(new YarnUncaughtExceptionHandler());
|
||||
StringUtils.startupShutdownMessage(ResourceManager.class, argv, LOG);
|
||||
try {
|
||||
Configuration conf = new YarnConfiguration();
|
||||
|
|
|
@ -27,6 +27,7 @@ import org.apache.hadoop.security.SecurityUtil;
|
|||
import org.apache.hadoop.util.ShutdownHookManager;
|
||||
import org.apache.hadoop.util.StringUtils;
|
||||
import org.apache.hadoop.yarn.YarnException;
|
||||
import org.apache.hadoop.yarn.YarnUncaughtExceptionHandler;
|
||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||
import org.apache.hadoop.yarn.service.CompositeService;
|
||||
|
||||
|
@ -73,6 +74,7 @@ public class WebAppProxyServer extends CompositeService {
|
|||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
Thread.setDefaultUncaughtExceptionHandler(new YarnUncaughtExceptionHandler());
|
||||
StringUtils.startupShutdownMessage(WebAppProxyServer.class, args, LOG);
|
||||
try {
|
||||
WebAppProxyServer proxy = new WebAppProxyServer();
|
||||
|
|
Loading…
Reference in New Issue