From 4b829c41cefa5c70fef584192a61ce7ded9dcf96 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Tue, 12 Jan 2016 15:59:14 +0000 Subject: [PATCH] YARN-4414. Nodemanager connection errors are retried at multiple levels. Contributed by Chang Li (cherry picked from commit 13de8359a1c6d9fc78cd5013c860c1086d86176f) Conflicts: hadoop-yarn-project/CHANGES.txt --- hadoop-yarn-project/CHANGES.txt | 6 ++++ .../apache/hadoop/yarn/client/NMProxy.java | 9 +++-- .../hadoop/yarn/client/ServerProxy.java | 1 + .../containermanager/TestNMProxy.java | 34 +++++++++++++++++-- 4 files changed, 45 insertions(+), 5 deletions(-) diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index d59aeb81a05..8698eca08d6 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -57,6 +57,9 @@ Release 2.7.3 - UNRELEASED YARN-3695. ServerProxy (NMProxy, etc.) shouldn't retry forever for non network exception. (Raju Bairishetti via jianhe) + YARN-4414. Nodemanager connection errors are retried at multiple levels + (Chang Li via jlowe) + Release 2.7.2 - UNRELEASED INCOMPATIBLE CHANGES @@ -931,6 +934,9 @@ Release 2.6.4 - UNRELEASED BUG FIXES + YARN-4414. 
Nodemanager connection errors are retried at multiple levels + (Chang Li via jlowe) + Release 2.6.3 - 2015-12-17 INCOMPATIBLE CHANGES diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/client/NMProxy.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/client/NMProxy.java index dd40b45ee59..68816bb4c4c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/client/NMProxy.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/client/NMProxy.java @@ -23,6 +23,7 @@ import java.net.InetSocketAddress; import org.apache.hadoop.classification.InterfaceAudience.Public; import org.apache.hadoop.classification.InterfaceStability.Unstable; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.CommonConfigurationKeysPublic; import org.apache.hadoop.io.retry.RetryPolicy; import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.yarn.conf.YarnConfiguration; @@ -42,8 +43,12 @@ public class NMProxy extends ServerProxy { YarnConfiguration.DEFAULT_CLIENT_NM_CONNECT_MAX_WAIT_MS, YarnConfiguration.CLIENT_NM_CONNECT_RETRY_INTERVAL_MS, YarnConfiguration.DEFAULT_CLIENT_NM_CONNECT_RETRY_INTERVAL_MS); - - return createRetriableProxy(conf, protocol, ugi, rpc, serverAddress, + Configuration confClone = new Configuration(conf); + confClone.setInt( + CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_MAX_RETRIES_KEY, 0); + confClone.setInt(CommonConfigurationKeysPublic. 
+ IPC_CLIENT_CONNECT_MAX_RETRIES_ON_SOCKET_TIMEOUTS_KEY, 0); + return createRetriableProxy(confClone, protocol, ugi, rpc, serverAddress, retryPolicy); } } \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/client/ServerProxy.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/client/ServerProxy.java index de7fc7d6d30..be3080133a0 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/client/ServerProxy.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/client/ServerProxy.java @@ -76,6 +76,7 @@ public class ServerProxy { exceptionToPolicyMap.put(ConnectException.class, retryPolicy); exceptionToPolicyMap.put(NoRouteToHostException.class, retryPolicy); exceptionToPolicyMap.put(UnknownHostException.class, retryPolicy); + exceptionToPolicyMap.put(ConnectTimeoutException.class, retryPolicy); exceptionToPolicyMap.put(RetriableException.class, retryPolicy); exceptionToPolicyMap.put(SocketException.class, retryPolicy); exceptionToPolicyMap.put(NMNotYetReadyException.class, retryPolicy); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestNMProxy.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestNMProxy.java index 102c9c62d31..937fcbf305e 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestNMProxy.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestNMProxy.java @@ -21,6 +21,8 @@ package 
org.apache.hadoop.yarn.server.nodemanager.containermanager; import java.io.IOException; import java.net.InetSocketAddress; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.CommonConfigurationKeysPublic; import org.apache.hadoop.fs.UnsupportedFileSystemException; import org.apache.hadoop.io.retry.UnreliableInterface; import org.apache.hadoop.security.SecurityUtil; @@ -128,7 +130,7 @@ public class TestNMProxy extends BaseContainerManagerTest { StartContainersRequest allRequests = Records.newRecord(StartContainersRequest.class); - ContainerManagementProtocol proxy = getNMProxy(); + ContainerManagementProtocol proxy = getNMProxy(conf); retryCount = 0; shouldThrowNMNotYetReadyException = false; @@ -159,14 +161,40 @@ public class TestNMProxy extends BaseContainerManagerTest { StartContainersRequest allRequests = Records.newRecord(StartContainersRequest.class); - ContainerManagementProtocol proxy = getNMProxy(); + ContainerManagementProtocol proxy = getNMProxy(conf); shouldThrowNMNotYetReadyException = false; retryCount = 0; proxy.startContainers(allRequests); } - private ContainerManagementProtocol getNMProxy() { + @Test(timeout = 20000) + public void testNMProxyRPCRetry() throws Exception { + conf.setLong(YarnConfiguration.CLIENT_NM_CONNECT_MAX_WAIT_MS, 1000); + conf.setLong(YarnConfiguration.CLIENT_NM_CONNECT_RETRY_INTERVAL_MS, 100); + StartContainersRequest allRequests = + Records.newRecord(StartContainersRequest.class); + Configuration newConf = new YarnConfiguration(conf); + newConf.setInt( + CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_MAX_RETRIES_KEY, 100); + + newConf.setInt(CommonConfigurationKeysPublic. + IPC_CLIENT_CONNECT_MAX_RETRIES_ON_SOCKET_TIMEOUTS_KEY, 100); + // connect to some dummy address so that it can trigger + // connection failure and RPC level retries.
+ newConf.set(YarnConfiguration.NM_ADDRESS, "1234"); + ContainerManagementProtocol proxy = getNMProxy(newConf); + try { + proxy.startContainers(allRequests); + Assert.fail("should get socket exception"); + } catch (IOException e) { + // socket exception should be thrown immediately, without RPC retries. + Assert.assertTrue(e.toString(). + contains("Failed on local exception: java.net.SocketException")); + } + } + + private ContainerManagementProtocol getNMProxy(Configuration conf) { ApplicationId appId = ApplicationId.newInstance(1, 1); ApplicationAttemptId attemptId = ApplicationAttemptId.newInstance(appId, 1);