YARN-2314. Disable ContainerManagementProtocolProxy cache by default to prevent creating thousands of threads in a large cluster. Contributed by Jason Lowe

This commit is contained in:
Jian He 2014-10-24 23:05:16 -07:00
parent 5864dd99a4
commit f44cf99599
4 changed files with 115 additions and 43 deletions

View File

@ -746,6 +746,10 @@ Release 2.6.0 - UNRELEASED
to contact with AM before AM actually receives the ClientToAMTokenMasterKey.
(Jason Lowe via jianhe)
YARN-2314. Disable ContainerManagementProtocolProxy cache by default to
prevent creating thousands of threads in a large cluster. (Jason Lowe via
jianhe)
Release 2.5.1 - 2014-09-05
INCOMPATIBLE CHANGES

View File

@ -70,10 +70,18 @@ public class YarnConfiguration extends Configuration {
public static final int APPLICATION_MAX_TAG_LENGTH = 100;
static {
addDeprecatedKeys();
Configuration.addDefaultResource(YARN_DEFAULT_CONFIGURATION_FILE);
Configuration.addDefaultResource(YARN_SITE_CONFIGURATION_FILE);
}
private static void addDeprecatedKeys() {
Configuration.addDeprecations(new DeprecationDelta[] {
new DeprecationDelta("yarn.client.max-nodemanagers-proxies",
NM_CLIENT_MAX_NM_PROXIES)
});
}
//Configurations
public static final String YARN_PREFIX = "yarn.";
@ -1446,21 +1454,27 @@ public class YarnConfiguration extends Configuration {
public static final int DEFAULT_NM_CLIENT_ASYNC_THREAD_POOL_MAX_SIZE = 500;
/**
* Maximum number of proxy connections for node manager. It should always be
* more than 1. NMClient and MRAppMaster will use this to cache connection
* with node manager. There will be at max one connection per node manager.
* Ex. configuring it to a value of 5 will make sure that client will at
* max have 5 connections cached with 5 different node managers. These
* connections will be timed out if idle for more than system wide idle
* timeout period. The token if used for authentication then it will be used
* only at connection creation time. If new token is received then earlier
* connection should be closed in order to use newer token.
* Note: {@link YarnConfiguration#NM_CLIENT_ASYNC_THREAD_POOL_MAX_SIZE}
* are related to each other.
* Maximum number of proxy connections to cache for node managers. If set
* to a value greater than zero then the cache is enabled and the NMClient
* and MRAppMaster will cache the specified number of node manager proxies.
* There will be at max one proxy per node manager. Ex. configuring it to a
* value of 5 will make sure that client will at max have 5 proxies cached
* with 5 different node managers. These connections for these proxies will
* be timed out if idle for more than the system wide idle timeout period.
* Note that this could cause issues on large clusters as many connections
* could linger simultaneously and lead to a large number of connection
* threads. The token used for authentication will be used only at
* connection creation time. If a new token is received then the earlier
* connection should be closed in order to use the new token. This and
* {@link YarnConfiguration#NM_CLIENT_ASYNC_THREAD_POOL_MAX_SIZE} are related
* and should be in sync (no need for them to be equal).
* If the value of this property is zero then the connection cache is
* disabled and connections will use a zero idle timeout to prevent too
* many connection threads on large clusters.
*/
public static final String NM_CLIENT_MAX_NM_PROXIES =
YARN_PREFIX + "client.max-nodemanagers-proxies";
public static final int DEFAULT_NM_CLIENT_MAX_NM_PROXIES = 500;
YARN_PREFIX + "client.max-cached-nodemanagers-proxies";
public static final int DEFAULT_NM_CLIENT_MAX_NM_PROXIES = 0;
/** Max time to wait to establish a connection to NM */
public static final String CLIENT_NM_CONNECT_MAX_WAIT_MS =

View File

@ -20,14 +20,17 @@
import java.net.InetSocketAddress;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience.LimitedPrivate;
import org.apache.hadoop.classification.InterfaceAudience.Private;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.token.SecretManager.InvalidToken;
@ -53,7 +56,7 @@ public class ContainerManagementProtocolProxy {
static final Log LOG = LogFactory.getLog(ContainerManagementProtocolProxy.class);
private final int maxConnectedNMs;
private final LinkedHashMap<String, ContainerManagementProtocolProxyData> cmProxy;
private final Map<String, ContainerManagementProtocolProxyData> cmProxy;
private final Configuration conf;
private final YarnRPC rpc;
private NMTokenCache nmTokenCache;
@ -70,16 +73,25 @@ public ContainerManagementProtocolProxy(Configuration conf,
maxConnectedNMs =
conf.getInt(YarnConfiguration.NM_CLIENT_MAX_NM_PROXIES,
YarnConfiguration.DEFAULT_NM_CLIENT_MAX_NM_PROXIES);
if (maxConnectedNMs < 1) {
if (maxConnectedNMs < 0) {
throw new YarnRuntimeException(
YarnConfiguration.NM_CLIENT_MAX_NM_PROXIES
+ " (" + maxConnectedNMs + ") can not be less than 1.");
+ " (" + maxConnectedNMs + ") can not be less than 0.");
}
LOG.info(YarnConfiguration.NM_CLIENT_MAX_NM_PROXIES + " : "
+ maxConnectedNMs);
cmProxy =
new LinkedHashMap<String, ContainerManagementProtocolProxyData>();
if (maxConnectedNMs > 0) {
cmProxy =
new LinkedHashMap<String, ContainerManagementProtocolProxyData>();
} else {
cmProxy = Collections.emptyMap();
// Connections are not being cached so ensure connections close quickly
// to avoid creating thousands of RPC client threads on large clusters.
conf.setInt(
CommonConfigurationKeysPublic.IPC_CLIENT_CONNECTION_MAXIDLETIME_KEY,
0);
}
rpc = YarnRPC.create(conf);
}
@ -117,13 +129,9 @@ public synchronized ContainerManagementProtocolProxyData getProxy(
proxy =
new ContainerManagementProtocolProxyData(rpc, containerManagerBindAddr,
containerId, nmTokenCache.getToken(containerManagerBindAddr));
if (cmProxy.size() > maxConnectedNMs) {
// Number of existing proxy exceed the limit.
String cmAddr = cmProxy.keySet().iterator().next();
removeProxy(cmProxy.get(cmAddr));
if (maxConnectedNMs > 0) {
addProxyToCache(containerManagerBindAddr, proxy);
}
cmProxy.put(containerManagerBindAddr, proxy);
}
// This is to track active users of this proxy.
proxy.activeCallers++;
@ -131,15 +139,52 @@ public synchronized ContainerManagementProtocolProxyData getProxy(
return proxy;
}
private void addProxyToCache(String containerManagerBindAddr,
ContainerManagementProtocolProxyData proxy) {
while (cmProxy.size() >= maxConnectedNMs) {
if (LOG.isDebugEnabled()) {
LOG.debug("Cleaning up the proxy cache, size=" + cmProxy.size()
+ " max=" + maxConnectedNMs);
}
boolean removedProxy = false;
for (ContainerManagementProtocolProxyData otherProxy : cmProxy.values()) {
removedProxy = removeProxy(otherProxy);
if (removedProxy) {
break;
}
}
if (!removedProxy) {
// all of the proxies are currently in use and already scheduled
// for removal, so we need to wait until at least one of them closes
try {
this.wait();
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
if (maxConnectedNMs > 0) {
cmProxy.put(containerManagerBindAddr, proxy);
}
}
private void updateLRUCache(String containerManagerBindAddr) {
ContainerManagementProtocolProxyData proxy =
cmProxy.remove(containerManagerBindAddr);
cmProxy.put(containerManagerBindAddr, proxy);
if (maxConnectedNMs > 0) {
ContainerManagementProtocolProxyData proxy =
cmProxy.remove(containerManagerBindAddr);
cmProxy.put(containerManagerBindAddr, proxy);
}
}
public synchronized void mayBeCloseProxy(
ContainerManagementProtocolProxyData proxy) {
tryCloseProxy(proxy);
}
private boolean tryCloseProxy(
ContainerManagementProtocolProxyData proxy) {
proxy.activeCallers--;
if (proxy.scheduledForClose && proxy.activeCallers < 0) {
LOG.info("Closing proxy : " + proxy.containerManagerBindAddr);
@ -149,15 +194,18 @@ public synchronized void mayBeCloseProxy(
} finally {
this.notifyAll();
}
return true;
}
return false;
}
private synchronized void removeProxy(
private synchronized boolean removeProxy(
ContainerManagementProtocolProxyData proxy) {
if (!proxy.scheduledForClose) {
proxy.scheduledForClose = true;
mayBeCloseProxy(proxy);
return tryCloseProxy(proxy);
}
return false;
}
public synchronized void stopAllProxies() {

View File

@ -1095,21 +1095,27 @@
</property>
<property>
<description>
Maximum number of proxy connections for node manager. It should always be
more than 1. NMClient and MRAppMaster will use this to cache connection
with node manager. There will be at max one connection per node manager.
Ex. configuring it to a value of 5 will make sure that client will at
max have 5 connections cached with 5 different node managers. These
connections will be timed out if idle for more than system wide idle
timeout period. The token if used for authentication then it will be used
only at connection creation time. If new token is received then earlier
connection should be closed in order to use newer token. This and
<description>
Maximum number of proxy connections to cache for node managers. If set
to a value greater than zero then the cache is enabled and the NMClient
and MRAppMaster will cache the specified number of node manager proxies.
There will be at max one proxy per node manager. Ex. configuring it to a
value of 5 will make sure that client will at max have 5 proxies cached
with 5 different node managers. These connections for these proxies will
be timed out if idle for more than the system wide idle timeout period.
Note that this could cause issues on large clusters as many connections
could linger simultaneously and lead to a large number of connection
threads. The token used for authentication will be used only at
connection creation time. If a new token is received then the earlier
connection should be closed in order to use the new token. This and
(yarn.client.nodemanager-client-async.thread-pool-max-size) are related
and should be sync (no need for them to be equal).
</description>
<name>yarn.client.max-nodemanagers-proxies</name>
<value>500</value>
and should be in sync (no need for them to be equal).
If the value of this property is zero then the connection cache is
disabled and connections will use a zero idle timeout to prevent too
many connection threads on large clusters.
</description>
<name>yarn.client.max-cached-nodemanagers-proxies</name>
<value>0</value>
</property>
<property>