YARN-4414. Nodemanager connection errors are retried at multiple levels. Contributed by Chang Li

(cherry picked from commit 13de8359a1)

Conflicts:

	hadoop-yarn-project/CHANGES.txt
This commit is contained in:
Jason Lowe 2016-01-12 15:59:14 +00:00
parent a30b8ef59e
commit 4b829c41ce
4 changed files with 45 additions and 5 deletions

View File

@ -57,6 +57,9 @@ Release 2.7.3 - UNRELEASED
YARN-3695. ServerProxy (NMProxy, etc.) shouldn't retry forever for non YARN-3695. ServerProxy (NMProxy, etc.) shouldn't retry forever for non
network exception. (Raju Bairishetti via jianhe) network exception. (Raju Bairishetti via jianhe)
YARN-4414. Nodemanager connection errors are retried at multiple levels
(Chang Li via jlowe)
Release 2.7.2 - UNRELEASED Release 2.7.2 - UNRELEASED
INCOMPATIBLE CHANGES INCOMPATIBLE CHANGES
@ -931,6 +934,9 @@ Release 2.6.4 - UNRELEASED
BUG FIXES BUG FIXES
YARN-4414. Nodemanager connection errors are retried at multiple levels
(Chang Li via jlowe)
Release 2.6.3 - 2015-12-17 Release 2.6.3 - 2015-12-17
INCOMPATIBLE CHANGES INCOMPATIBLE CHANGES

View File

@ -23,6 +23,7 @@ import java.net.InetSocketAddress;
import org.apache.hadoop.classification.InterfaceAudience.Public; import org.apache.hadoop.classification.InterfaceAudience.Public;
import org.apache.hadoop.classification.InterfaceStability.Unstable; import org.apache.hadoop.classification.InterfaceStability.Unstable;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
import org.apache.hadoop.io.retry.RetryPolicy; import org.apache.hadoop.io.retry.RetryPolicy;
import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.conf.YarnConfiguration;
@ -42,8 +43,12 @@ public class NMProxy extends ServerProxy {
YarnConfiguration.DEFAULT_CLIENT_NM_CONNECT_MAX_WAIT_MS, YarnConfiguration.DEFAULT_CLIENT_NM_CONNECT_MAX_WAIT_MS,
YarnConfiguration.CLIENT_NM_CONNECT_RETRY_INTERVAL_MS, YarnConfiguration.CLIENT_NM_CONNECT_RETRY_INTERVAL_MS,
YarnConfiguration.DEFAULT_CLIENT_NM_CONNECT_RETRY_INTERVAL_MS); YarnConfiguration.DEFAULT_CLIENT_NM_CONNECT_RETRY_INTERVAL_MS);
Configuration confClone = new Configuration(conf);
return createRetriableProxy(conf, protocol, ugi, rpc, serverAddress, confClone.setInt(
CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_MAX_RETRIES_KEY, 0);
confClone.setInt(CommonConfigurationKeysPublic.
IPC_CLIENT_CONNECT_MAX_RETRIES_ON_SOCKET_TIMEOUTS_KEY, 0);
return createRetriableProxy(confClone, protocol, ugi, rpc, serverAddress,
retryPolicy); retryPolicy);
} }
} }

View File

@ -76,6 +76,7 @@ public class ServerProxy {
exceptionToPolicyMap.put(ConnectException.class, retryPolicy); exceptionToPolicyMap.put(ConnectException.class, retryPolicy);
exceptionToPolicyMap.put(NoRouteToHostException.class, retryPolicy); exceptionToPolicyMap.put(NoRouteToHostException.class, retryPolicy);
exceptionToPolicyMap.put(UnknownHostException.class, retryPolicy); exceptionToPolicyMap.put(UnknownHostException.class, retryPolicy);
exceptionToPolicyMap.put(ConnectTimeoutException.class, retryPolicy);
exceptionToPolicyMap.put(RetriableException.class, retryPolicy); exceptionToPolicyMap.put(RetriableException.class, retryPolicy);
exceptionToPolicyMap.put(SocketException.class, retryPolicy); exceptionToPolicyMap.put(SocketException.class, retryPolicy);
exceptionToPolicyMap.put(NMNotYetReadyException.class, retryPolicy); exceptionToPolicyMap.put(NMNotYetReadyException.class, retryPolicy);

View File

@ -21,6 +21,8 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager;
import java.io.IOException; import java.io.IOException;
import java.net.InetSocketAddress; import java.net.InetSocketAddress;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
import org.apache.hadoop.fs.UnsupportedFileSystemException; import org.apache.hadoop.fs.UnsupportedFileSystemException;
import org.apache.hadoop.io.retry.UnreliableInterface; import org.apache.hadoop.io.retry.UnreliableInterface;
import org.apache.hadoop.security.SecurityUtil; import org.apache.hadoop.security.SecurityUtil;
@ -128,7 +130,7 @@ public class TestNMProxy extends BaseContainerManagerTest {
StartContainersRequest allRequests = StartContainersRequest allRequests =
Records.newRecord(StartContainersRequest.class); Records.newRecord(StartContainersRequest.class);
ContainerManagementProtocol proxy = getNMProxy(); ContainerManagementProtocol proxy = getNMProxy(conf);
retryCount = 0; retryCount = 0;
shouldThrowNMNotYetReadyException = false; shouldThrowNMNotYetReadyException = false;
@ -159,14 +161,40 @@ public class TestNMProxy extends BaseContainerManagerTest {
StartContainersRequest allRequests = StartContainersRequest allRequests =
Records.newRecord(StartContainersRequest.class); Records.newRecord(StartContainersRequest.class);
ContainerManagementProtocol proxy = getNMProxy(); ContainerManagementProtocol proxy = getNMProxy(conf);
shouldThrowNMNotYetReadyException = false; shouldThrowNMNotYetReadyException = false;
retryCount = 0; retryCount = 0;
proxy.startContainers(allRequests); proxy.startContainers(allRequests);
} }
private ContainerManagementProtocol getNMProxy() { @Test(timeout = 20000)
public void testNMProxyRPCRetry() throws Exception {
// Checks that a connection failure to the NM address surfaces as a
// socket exception even though the configuration handed to the proxy
// asks for 100 IPC-level connect retries.
// NOTE(review): relies on NMProxy overriding the IPC retry settings on
// its own copy of the configuration - confirm against NMProxy.

// Keep the NM-proxy-level retry window short: 1000 ms max wait,
// 100 ms between attempts.
conf.setLong(YarnConfiguration.CLIENT_NM_CONNECT_MAX_WAIT_MS, 1000);
conf.setLong(YarnConfiguration.CLIENT_NM_CONNECT_RETRY_INTERVAL_MS, 100);
StartContainersRequest allRequests =
Records.newRecord(StartContainersRequest.class);
// Work on a copy so the IPC retry settings and the dummy NM address
// below are not written into the shared conf.
Configuration newConf = new YarnConfiguration(conf);
newConf.setInt(
CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_MAX_RETRIES_KEY, 100);
newConf.setInt(CommonConfigurationKeysPublic.
IPC_CLIENT_CONNECT_MAX_RETRIES_ON_SOCKET_TIMEOUTS_KEY, 100);
// connect to some dummy address so that it can trigger
// connection failure and RPC level retries.
newConf.set(YarnConfiguration.NM_ADDRESS, "1234");
ContainerManagementProtocol proxy = getNMProxy(newConf);
try {
proxy.startContainers(allRequests);
Assert.fail("should get socket exception");
} catch (IOException e) {
// socket exception should be thrown immediately, without RPC retries.
// The check is on the exception's string form, not its type.
Assert.assertTrue(e.toString().
contains("Failed on local exception: java.net.SocketException"));
}
}
private ContainerManagementProtocol getNMProxy(Configuration conf) {
ApplicationId appId = ApplicationId.newInstance(1, 1); ApplicationId appId = ApplicationId.newInstance(1, 1);
ApplicationAttemptId attemptId = ApplicationAttemptId.newInstance(appId, 1); ApplicationAttemptId attemptId = ApplicationAttemptId.newInstance(appId, 1);