diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index f23533825c5..7ecdee3171f 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -699,6 +699,9 @@ Release 2.7.1 - UNRELEASED YARN-3842. NMProxy should retry on NMNotYetReadyException. (Robert Kanter via kasha) + YARN-3809. Failed to launch new attempts because + ApplicationMasterLauncher's threads all hang (Jun Gong via jlowe) + Release 2.7.0 - 2015-04-20 INCOMPATIBLE CHANGES diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index 5d75a214255..6b660f713b4 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -143,6 +143,16 @@ private static void addDeprecatedKeys() { RM_PREFIX + "client.thread-count"; public static final int DEFAULT_RM_CLIENT_THREAD_COUNT = 50; + /** Number of threads in the ApplicationMasterLauncher pool that launch/cleanup AMs; read once in ApplicationMasterLauncher.serviceInit and used as both core and maximum pool size. */ + public static final String RM_AMLAUNCHER_THREAD_COUNT = + RM_PREFIX + "amlauncher.thread-count"; + public static final int DEFAULT_RM_AMLAUNCHER_THREAD_COUNT = 50; + + /** Number of retries when connecting to an NM; ApplicationMasterLauncher copies this value into the IPC client's max-retries-on-socket-timeouts setting. NOTE(review): "RETIRES" in the constant name looks like a misspelling of "RETRIES" - confirm before release, since renaming afterwards changes a public constant. */ + public static final String RM_NODEMANAGER_CONNECT_RETIRES = + RM_PREFIX + "nodemanager-connect-retries"; + public static final int DEFAULT_RM_NODEMANAGER_CONNECT_RETIRES = 10; + + /** The Kerberos principal for the resource manager.*/ public static final String RM_PRINCIPAL = RM_PREFIX + "principal"; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index d94157cfed6..621198cce40 100644 --- 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -76,6 +76,18 @@ <value>50</value> </property> + <property> + <description>Number of threads used to launch/cleanup AM.</description> + <name>yarn.resourcemanager.amlauncher.thread-count</name> + <value>50</value> + </property> + + <property> + <description>Number of times to retry connecting to the NM.</description> + <name>yarn.resourcemanager.nodemanager-connect-retries</name> + <value>10</value> + </property> + <property> <description>The expiry interval for application master reporting.</description> <name>yarn.am.liveness-monitor.expiry-interval-ms</name> diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/amlauncher/ApplicationMasterLauncher.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/amlauncher/ApplicationMasterLauncher.java index 5fc39fd2efc..f606e45fdf4 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/amlauncher/ApplicationMasterLauncher.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/amlauncher/ApplicationMasterLauncher.java @@ -19,12 +19,17 @@ package org.apache.hadoop.yarn.server.resourcemanager.amlauncher; import java.util.concurrent.BlockingQueue; import java.util.concurrent.LinkedBlockingQueue; +import java.util.concurrent.ThreadFactory; import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; +import com.google.common.util.concurrent.ThreadFactoryBuilder; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.CommonConfigurationKeysPublic; import org.apache.hadoop.service.AbstractService; +import org.apache.hadoop.yarn.conf.YarnConfiguration; import 
org.apache.hadoop.yarn.event.EventHandler; import org.apache.hadoop.yarn.server.resourcemanager.RMContext; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt; @@ -34,7 +39,7 @@ public class ApplicationMasterLauncher extends AbstractService implements EventHandler { private static final Log LOG = LogFactory.getLog( ApplicationMasterLauncher.class); - private final ThreadPoolExecutor launcherPool; + private ThreadPoolExecutor launcherPool; private LauncherThread launcherHandlingThread; private final BlockingQueue masterEvents @@ -45,11 +50,31 @@ public class ApplicationMasterLauncher extends AbstractService implements public ApplicationMasterLauncher(RMContext context) { super(ApplicationMasterLauncher.class.getName()); this.context = context; - this.launcherPool = new ThreadPoolExecutor(10, 10, 1, - TimeUnit.HOURS, new LinkedBlockingQueue()); this.launcherHandlingThread = new LauncherThread(); } + /** Creates the launcher pool with RM_AMLAUNCHER_THREAD_COUNT threads (core == max) named "ApplicationMasterLauncher #N", then initializes the service with a new YarnConfiguration built from conf in which the IPC max-retries-on-socket-timeouts key is set from RM_NODEMANAGER_CONNECT_RETIRES. */ + @Override + protected void serviceInit(Configuration conf) throws Exception { + int threadCount = conf.getInt( + YarnConfiguration.RM_AMLAUNCHER_THREAD_COUNT, + YarnConfiguration.DEFAULT_RM_AMLAUNCHER_THREAD_COUNT); + ThreadFactory tf = new ThreadFactoryBuilder() + .setNameFormat("ApplicationMasterLauncher #%d") + .build(); + launcherPool = new ThreadPoolExecutor(threadCount, threadCount, 1, + TimeUnit.HOURS, new LinkedBlockingQueue<Runnable>()); + launcherPool.setThreadFactory(tf); + + Configuration newConf = new YarnConfiguration(conf); + newConf.setInt(CommonConfigurationKeysPublic. + IPC_CLIENT_CONNECT_MAX_RETRIES_ON_SOCKET_TIMEOUTS_KEY, + conf.getInt(YarnConfiguration.RM_NODEMANAGER_CONNECT_RETIRES, + YarnConfiguration.DEFAULT_RM_NODEMANAGER_CONNECT_RETIRES)); + setConfig(newConf); + super.serviceInit(newConf); + } + @Override protected void serviceStart() throws Exception { launcherHandlingThread.start();