diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt
index 1c6345e4ef7..f5a4879df24 100644
--- a/hadoop-yarn-project/CHANGES.txt
+++ b/hadoop-yarn-project/CHANGES.txt
@@ -163,6 +163,9 @@ Release 2.7.1 - UNRELEASED
YARN-3842. NMProxy should retry on NMNotYetReadyException.
(Robert Kanter via kasha)
+ YARN-3809. Failed to launch new attempts because
+ ApplicationMasterLauncher's threads all hang (Jun Gong via jlowe)
+
Release 2.7.0 - 2015-04-20
INCOMPATIBLE CHANGES
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
index b1f378b2a25..da076eb410c 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
@@ -143,6 +143,16 @@ public class YarnConfiguration extends Configuration {
RM_PREFIX + "client.thread-count";
public static final int DEFAULT_RM_CLIENT_THREAD_COUNT = 50;
+ /** Number of threads used to launch and clean up ApplicationMasters. */
+ public static final String RM_AMLAUNCHER_THREAD_COUNT =
+ RM_PREFIX + "amlauncher.thread-count";
+ public static final int DEFAULT_RM_AMLAUNCHER_THREAD_COUNT = 50;
+
+ /** Number of times the RM retries connecting to a NodeManager. */
+ public static final String RM_NODEMANAGER_CONNECT_RETIRES =
+ RM_PREFIX + "nodemanager-connect-retries";
+ public static final int DEFAULT_RM_NODEMANAGER_CONNECT_RETIRES = 10;
+
/** The Kerberos principal for the resource manager.*/
public static final String RM_PRINCIPAL =
RM_PREFIX + "principal";
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
index 5d0f07d8f5d..079d4d2817d 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
@@ -76,6 +76,18 @@
50
+
+ Number of threads used to launch and clean up ApplicationMasters.
+ yarn.resourcemanager.amlauncher.thread-count
+ 50
+
+
+
+ Number of times the ResourceManager retries connecting to a NodeManager.
+ yarn.resourcemanager.nodemanager-connect-retries
+ 10
+
+
The expiry interval for application master reporting.
yarn.am.liveness-monitor.expiry-interval-ms
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/amlauncher/ApplicationMasterLauncher.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/amlauncher/ApplicationMasterLauncher.java
index 5fc39fd2efc..f606e45fdf4 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/amlauncher/ApplicationMasterLauncher.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/amlauncher/ApplicationMasterLauncher.java
@@ -19,12 +19,17 @@
package org.apache.hadoop.yarn.server.resourcemanager.amlauncher;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.ThreadFactory;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
+import com.google.common.util.concurrent.ThreadFactoryBuilder;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
import org.apache.hadoop.service.AbstractService;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.event.EventHandler;
import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt;
@@ -34,7 +39,7 @@ public class ApplicationMasterLauncher extends AbstractService implements
EventHandler {
private static final Log LOG = LogFactory.getLog(
ApplicationMasterLauncher.class);
- private final ThreadPoolExecutor launcherPool;
+ private ThreadPoolExecutor launcherPool;
private LauncherThread launcherHandlingThread;
private final BlockingQueue masterEvents
@@ -45,11 +50,30 @@ public class ApplicationMasterLauncher extends AbstractService implements
public ApplicationMasterLauncher(RMContext context) {
super(ApplicationMasterLauncher.class.getName());
this.context = context;
- this.launcherPool = new ThreadPoolExecutor(10, 10, 1,
- TimeUnit.HOURS, new LinkedBlockingQueue());
this.launcherHandlingThread = new LauncherThread();
}
+ @Override
+ protected void serviceInit(Configuration conf) throws Exception {
+ int threadCount = conf.getInt(
+ YarnConfiguration.RM_AMLAUNCHER_THREAD_COUNT,
+ YarnConfiguration.DEFAULT_RM_AMLAUNCHER_THREAD_COUNT);
+ ThreadFactory tf = new ThreadFactoryBuilder()
+ .setNameFormat("ApplicationMasterLauncher #%d")
+ .build();
+ launcherPool = new ThreadPoolExecutor(threadCount, threadCount, 1,
+ TimeUnit.HOURS, new LinkedBlockingQueue());
+ launcherPool.setThreadFactory(tf);
+
+ Configuration newConf = new YarnConfiguration(conf);
+ newConf.setInt(CommonConfigurationKeysPublic.
+ IPC_CLIENT_CONNECT_MAX_RETRIES_ON_SOCKET_TIMEOUTS_KEY,
+ conf.getInt(YarnConfiguration.RM_NODEMANAGER_CONNECT_RETIRES,
+ YarnConfiguration.DEFAULT_RM_NODEMANAGER_CONNECT_RETIRES));
+ setConfig(newConf);
+ super.serviceInit(newConf);
+ }
+
@Override
protected void serviceStart() throws Exception {
launcherHandlingThread.start();