YARN-3809. Failed to launch new attempts because ApplicationMasterLauncher's threads all hang. Contributed by Jun Gong
(cherry picked from commit 2a20dd9b61)
This commit is contained in:
parent
2762affbb2
commit
14afa5d53d
|
@ -655,6 +655,9 @@ Release 2.7.1 - UNRELEASED
|
||||||
YARN-3842. NMProxy should retry on NMNotYetReadyException.
|
YARN-3842. NMProxy should retry on NMNotYetReadyException.
|
||||||
(Robert Kanter via kasha)
|
(Robert Kanter via kasha)
|
||||||
|
|
||||||
|
YARN-3809. Failed to launch new attempts because
|
||||||
|
ApplicationMasterLauncher's threads all hang (Jun Gong via jlowe)
|
||||||
|
|
||||||
Release 2.7.0 - 2015-04-20
|
Release 2.7.0 - 2015-04-20
|
||||||
|
|
||||||
INCOMPATIBLE CHANGES
|
INCOMPATIBLE CHANGES
|
||||||
|
|
|
@ -143,6 +143,16 @@ public class YarnConfiguration extends Configuration {
|
||||||
RM_PREFIX + "client.thread-count";
|
RM_PREFIX + "client.thread-count";
|
||||||
public static final int DEFAULT_RM_CLIENT_THREAD_COUNT = 50;
|
public static final int DEFAULT_RM_CLIENT_THREAD_COUNT = 50;
|
||||||
|
|
||||||
|
/** Number of threads used to launch/cleanup AM.*/
|
||||||
|
public static final String RM_AMLAUNCHER_THREAD_COUNT =
|
||||||
|
RM_PREFIX + "amlauncher.thread-count";
|
||||||
|
public static final int DEFAULT_RM_AMLAUNCHER_THREAD_COUNT = 50;
|
||||||
|
|
||||||
|
/** Retry times to connect with NM.*/
|
||||||
|
public static final String RM_NODEMANAGER_CONNECT_RETIRES =
|
||||||
|
RM_PREFIX + "nodemanager-connect-retries";
|
||||||
|
public static final int DEFAULT_RM_NODEMANAGER_CONNECT_RETIRES = 10;
|
||||||
|
|
||||||
/** The Kerberos principal for the resource manager.*/
|
/** The Kerberos principal for the resource manager.*/
|
||||||
public static final String RM_PRINCIPAL =
|
public static final String RM_PRINCIPAL =
|
||||||
RM_PREFIX + "principal";
|
RM_PREFIX + "principal";
|
||||||
|
|
|
@ -76,6 +76,18 @@
|
||||||
<value>50</value>
|
<value>50</value>
|
||||||
</property>
|
</property>
|
||||||
|
|
||||||
|
<property>
|
||||||
|
<description>Number of threads used to launch/cleanup AM.</description>
|
||||||
|
<name>yarn.resourcemanager.amlauncher.thread-count</name>
|
||||||
|
<value>50</value>
|
||||||
|
</property>
|
||||||
|
|
||||||
|
<property>
|
||||||
|
<description>Retry times to connect with NM.</description>
|
||||||
|
<name>yarn.resourcemanager.nodemanager-connect-retries</name>
|
||||||
|
<value>10</value>
|
||||||
|
</property>
|
||||||
|
|
||||||
<property>
|
<property>
|
||||||
<description>The expiry interval for application master reporting.</description>
|
<description>The expiry interval for application master reporting.</description>
|
||||||
<name>yarn.am.liveness-monitor.expiry-interval-ms</name>
|
<name>yarn.am.liveness-monitor.expiry-interval-ms</name>
|
||||||
|
|
|
@ -19,12 +19,17 @@
|
||||||
package org.apache.hadoop.yarn.server.resourcemanager.amlauncher;
|
package org.apache.hadoop.yarn.server.resourcemanager.amlauncher;
|
||||||
import java.util.concurrent.BlockingQueue;
|
import java.util.concurrent.BlockingQueue;
|
||||||
import java.util.concurrent.LinkedBlockingQueue;
|
import java.util.concurrent.LinkedBlockingQueue;
|
||||||
|
import java.util.concurrent.ThreadFactory;
|
||||||
import java.util.concurrent.ThreadPoolExecutor;
|
import java.util.concurrent.ThreadPoolExecutor;
|
||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
|
import com.google.common.util.concurrent.ThreadFactoryBuilder;
|
||||||
import org.apache.commons.logging.Log;
|
import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
|
||||||
import org.apache.hadoop.service.AbstractService;
|
import org.apache.hadoop.service.AbstractService;
|
||||||
|
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||||
import org.apache.hadoop.yarn.event.EventHandler;
|
import org.apache.hadoop.yarn.event.EventHandler;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
|
import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt;
|
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt;
|
||||||
|
@ -34,7 +39,7 @@ public class ApplicationMasterLauncher extends AbstractService implements
|
||||||
EventHandler<AMLauncherEvent> {
|
EventHandler<AMLauncherEvent> {
|
||||||
private static final Log LOG = LogFactory.getLog(
|
private static final Log LOG = LogFactory.getLog(
|
||||||
ApplicationMasterLauncher.class);
|
ApplicationMasterLauncher.class);
|
||||||
private final ThreadPoolExecutor launcherPool;
|
private ThreadPoolExecutor launcherPool;
|
||||||
private LauncherThread launcherHandlingThread;
|
private LauncherThread launcherHandlingThread;
|
||||||
|
|
||||||
private final BlockingQueue<Runnable> masterEvents
|
private final BlockingQueue<Runnable> masterEvents
|
||||||
|
@ -45,11 +50,30 @@ public class ApplicationMasterLauncher extends AbstractService implements
|
||||||
public ApplicationMasterLauncher(RMContext context) {
|
public ApplicationMasterLauncher(RMContext context) {
|
||||||
super(ApplicationMasterLauncher.class.getName());
|
super(ApplicationMasterLauncher.class.getName());
|
||||||
this.context = context;
|
this.context = context;
|
||||||
this.launcherPool = new ThreadPoolExecutor(10, 10, 1,
|
|
||||||
TimeUnit.HOURS, new LinkedBlockingQueue<Runnable>());
|
|
||||||
this.launcherHandlingThread = new LauncherThread();
|
this.launcherHandlingThread = new LauncherThread();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void serviceInit(Configuration conf) throws Exception {
|
||||||
|
int threadCount = conf.getInt(
|
||||||
|
YarnConfiguration.RM_AMLAUNCHER_THREAD_COUNT,
|
||||||
|
YarnConfiguration.DEFAULT_RM_AMLAUNCHER_THREAD_COUNT);
|
||||||
|
ThreadFactory tf = new ThreadFactoryBuilder()
|
||||||
|
.setNameFormat("ApplicationMasterLauncher #%d")
|
||||||
|
.build();
|
||||||
|
launcherPool = new ThreadPoolExecutor(threadCount, threadCount, 1,
|
||||||
|
TimeUnit.HOURS, new LinkedBlockingQueue<Runnable>());
|
||||||
|
launcherPool.setThreadFactory(tf);
|
||||||
|
|
||||||
|
Configuration newConf = new YarnConfiguration(conf);
|
||||||
|
newConf.setInt(CommonConfigurationKeysPublic.
|
||||||
|
IPC_CLIENT_CONNECT_MAX_RETRIES_ON_SOCKET_TIMEOUTS_KEY,
|
||||||
|
conf.getInt(YarnConfiguration.RM_NODEMANAGER_CONNECT_RETIRES,
|
||||||
|
YarnConfiguration.DEFAULT_RM_NODEMANAGER_CONNECT_RETIRES));
|
||||||
|
setConfig(newConf);
|
||||||
|
super.serviceInit(newConf);
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected void serviceStart() throws Exception {
|
protected void serviceStart() throws Exception {
|
||||||
launcherHandlingThread.start();
|
launcherHandlingThread.start();
|
||||||
|
|
Loading…
Reference in New Issue