YARN-4414. Nodemanager connection errors are retried at multiple levels. Contributed by Chang Li
(cherry picked from commit 13de8359a1)
This commit is contained in:
parent
7c419bc9e5
commit
a8292161fc
|
@ -1182,6 +1182,9 @@ Release 2.7.3 - UNRELEASED
|
|||
YARN-3695. ServerProxy (NMProxy, etc.) shouldn't retry forever for non
|
||||
network exception. (Raju Bairishetti via jianhe)
|
||||
|
||||
YARN-4414. Nodemanager connection errors are retried at multiple levels
|
||||
(Chang Li via jlowe)
|
||||
|
||||
Release 2.7.2 - UNRELEASED
|
||||
|
||||
INCOMPATIBLE CHANGES
|
||||
|
@ -2048,6 +2051,9 @@ Release 2.6.4 - UNRELEASED
|
|||
YARN-3695. ServerProxy (NMProxy, etc.) shouldn't retry forever for non
|
||||
network exception. (Raju Bairishetti via jianhe)
|
||||
|
||||
YARN-4414. Nodemanager connection errors are retried at multiple levels
|
||||
(Chang Li via jlowe)
|
||||
|
||||
Release 2.6.3 - 2015-12-17
|
||||
|
||||
INCOMPATIBLE CHANGES
|
||||
|
|
|
@ -23,6 +23,7 @@ import java.net.InetSocketAddress;
|
|||
import org.apache.hadoop.classification.InterfaceAudience.Public;
|
||||
import org.apache.hadoop.classification.InterfaceStability.Unstable;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
|
||||
import org.apache.hadoop.io.retry.RetryPolicy;
|
||||
import org.apache.hadoop.security.UserGroupInformation;
|
||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||
|
@ -42,8 +43,12 @@ public class NMProxy extends ServerProxy {
|
|||
YarnConfiguration.DEFAULT_CLIENT_NM_CONNECT_MAX_WAIT_MS,
|
||||
YarnConfiguration.CLIENT_NM_CONNECT_RETRY_INTERVAL_MS,
|
||||
YarnConfiguration.DEFAULT_CLIENT_NM_CONNECT_RETRY_INTERVAL_MS);
|
||||
|
||||
return createRetriableProxy(conf, protocol, ugi, rpc, serverAddress,
|
||||
Configuration confClone = new Configuration(conf);
|
||||
confClone.setInt(
|
||||
CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_MAX_RETRIES_KEY, 0);
|
||||
confClone.setInt(CommonConfigurationKeysPublic.
|
||||
IPC_CLIENT_CONNECT_MAX_RETRIES_ON_SOCKET_TIMEOUTS_KEY, 0);
|
||||
return createRetriableProxy(confClone, protocol, ugi, rpc, serverAddress,
|
||||
retryPolicy);
|
||||
}
|
||||
}
|
|
@ -77,6 +77,7 @@ public class ServerProxy {
|
|||
exceptionToPolicyMap.put(ConnectException.class, retryPolicy);
|
||||
exceptionToPolicyMap.put(NoRouteToHostException.class, retryPolicy);
|
||||
exceptionToPolicyMap.put(UnknownHostException.class, retryPolicy);
|
||||
exceptionToPolicyMap.put(ConnectTimeoutException.class, retryPolicy);
|
||||
exceptionToPolicyMap.put(RetriableException.class, retryPolicy);
|
||||
exceptionToPolicyMap.put(SocketException.class, retryPolicy);
|
||||
exceptionToPolicyMap.put(NMNotYetReadyException.class, retryPolicy);
|
||||
|
|
|
@ -21,6 +21,8 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager;
|
|||
import java.io.IOException;
|
||||
import java.net.InetSocketAddress;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
|
||||
import org.apache.hadoop.fs.UnsupportedFileSystemException;
|
||||
import org.apache.hadoop.io.retry.UnreliableInterface;
|
||||
import org.apache.hadoop.security.SecurityUtil;
|
||||
|
@ -127,7 +129,7 @@ public class TestNMProxy extends BaseContainerManagerTest {
|
|||
StartContainersRequest allRequests =
|
||||
Records.newRecord(StartContainersRequest.class);
|
||||
|
||||
ContainerManagementProtocol proxy = getNMProxy();
|
||||
ContainerManagementProtocol proxy = getNMProxy(conf);
|
||||
|
||||
retryCount = 0;
|
||||
shouldThrowNMNotYetReadyException = false;
|
||||
|
@ -158,14 +160,40 @@ public class TestNMProxy extends BaseContainerManagerTest {
|
|||
StartContainersRequest allRequests =
|
||||
Records.newRecord(StartContainersRequest.class);
|
||||
|
||||
ContainerManagementProtocol proxy = getNMProxy();
|
||||
ContainerManagementProtocol proxy = getNMProxy(conf);
|
||||
|
||||
shouldThrowNMNotYetReadyException = false;
|
||||
retryCount = 0;
|
||||
proxy.startContainers(allRequests);
|
||||
}
|
||||
|
||||
private ContainerManagementProtocol getNMProxy() {
|
||||
@Test(timeout = 20000)
|
||||
public void testNMProxyRPCRetry() throws Exception {
|
||||
conf.setLong(YarnConfiguration.CLIENT_NM_CONNECT_MAX_WAIT_MS, 1000);
|
||||
conf.setLong(YarnConfiguration.CLIENT_NM_CONNECT_RETRY_INTERVAL_MS, 100);
|
||||
StartContainersRequest allRequests =
|
||||
Records.newRecord(StartContainersRequest.class);
|
||||
Configuration newConf = new YarnConfiguration(conf);
|
||||
newConf.setInt(
|
||||
CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_MAX_RETRIES_KEY, 100);
|
||||
|
||||
newConf.setInt(CommonConfigurationKeysPublic.
|
||||
IPC_CLIENT_CONNECT_MAX_RETRIES_ON_SOCKET_TIMEOUTS_KEY, 100);
|
||||
// connect to some dummy address so that it can trigger
|
||||
// connection failure and RPC level retries.
|
||||
newConf.set(YarnConfiguration.NM_ADDRESS, "1234");
|
||||
ContainerManagementProtocol proxy = getNMProxy(newConf);
|
||||
try {
|
||||
proxy.startContainers(allRequests);
|
||||
Assert.fail("should get socket exception");
|
||||
} catch (IOException e) {
|
||||
// socket exception should be thrown immediately, without RPC retries.
|
||||
Assert.assertTrue(e.toString().
|
||||
contains("Failed on local exception: java.net.SocketException"));
|
||||
}
|
||||
}
|
||||
|
||||
private ContainerManagementProtocol getNMProxy(Configuration conf) {
|
||||
ApplicationId appId = ApplicationId.newInstance(1, 1);
|
||||
ApplicationAttemptId attemptId = ApplicationAttemptId.newInstance(appId, 1);
|
||||
|
||||
|
|
Loading…
Reference in New Issue