YARN-4414. Nodemanager connection errors are retried at multiple levels. Contributed by Chang Li

(cherry picked from commit 13de8359a1)

Conflicts:
    hadoop-yarn-project/CHANGES.txt
parent a30b8ef59e
commit 4b829c41ce
hadoop-yarn-project/CHANGES.txt

@@ -57,6 +57,9 @@ Release 2.7.3 - UNRELEASED
     YARN-3695. ServerProxy (NMProxy, etc.) shouldn't retry forever for non
     network exception. (Raju Bairishetti via jianhe)
 
+    YARN-4414. Nodemanager connection errors are retried at multiple levels
+    (Chang Li via jlowe)
+
 Release 2.7.2 - UNRELEASED
 
   INCOMPATIBLE CHANGES
@@ -931,6 +934,9 @@ Release 2.6.4 - UNRELEASED
 
   BUG FIXES
 
+    YARN-4414. Nodemanager connection errors are retried at multiple levels
+    (Chang Li via jlowe)
+
 Release 2.6.3 - 2015-12-17
 
   INCOMPATIBLE CHANGES
NMProxy.java

@@ -23,6 +23,7 @@ import java.net.InetSocketAddress;
 import org.apache.hadoop.classification.InterfaceAudience.Public;
 import org.apache.hadoop.classification.InterfaceStability.Unstable;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
 import org.apache.hadoop.io.retry.RetryPolicy;
 import org.apache.hadoop.security.UserGroupInformation;
 import org.apache.hadoop.yarn.conf.YarnConfiguration;
@@ -42,8 +43,12 @@ public class NMProxy extends ServerProxy {
             YarnConfiguration.DEFAULT_CLIENT_NM_CONNECT_MAX_WAIT_MS,
             YarnConfiguration.CLIENT_NM_CONNECT_RETRY_INTERVAL_MS,
             YarnConfiguration.DEFAULT_CLIENT_NM_CONNECT_RETRY_INTERVAL_MS);
 
-    return createRetriableProxy(conf, protocol, ugi, rpc, serverAddress,
+    Configuration confClone = new Configuration(conf);
+    confClone.setInt(
+      CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_MAX_RETRIES_KEY, 0);
+    confClone.setInt(CommonConfigurationKeysPublic.
+      IPC_CLIENT_CONNECT_MAX_RETRIES_ON_SOCKET_TIMEOUTS_KEY, 0);
+    return createRetriableProxy(confClone, protocol, ugi, rpc, serverAddress,
         retryPolicy);
   }
 }
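The NMProxy hunk above is the heart of the fix: connection failures were previously retried by the IPC client (ipc.client.connect.max.retries and its socket-timeout counterpart) and then retried again by the proxy-level retry policy, multiplying the effective retry count. The patch clones the caller's Configuration and zeroes the IPC-level retry keys, so the caller's settings stay untouched and only the proxy-level policy decides when to give up. A minimal standalone sketch of that pattern (the wrapper class and method name here are illustrative, not part of the patch):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.CommonConfigurationKeysPublic;

public class DisableIpcConnectRetries {
  // Return a copy of conf with IPC-level connection retries disabled, so a
  // failed connect surfaces immediately and only the higher-level retry
  // policy (ServerProxy/NMProxy) decides whether to try again.
  public static Configuration withoutIpcConnectRetries(Configuration conf) {
    Configuration confClone = new Configuration(conf);
    confClone.setInt(
        CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_MAX_RETRIES_KEY, 0);
    confClone.setInt(CommonConfigurationKeysPublic
        .IPC_CLIENT_CONNECT_MAX_RETRIES_ON_SOCKET_TIMEOUTS_KEY, 0);
    return confClone;
  }
}

Cloning rather than mutating conf matters because the same Configuration object may be shared by other clients that still want the default IPC retry behaviour.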
ServerProxy.java

@@ -76,6 +76,7 @@ public class ServerProxy {
     exceptionToPolicyMap.put(ConnectException.class, retryPolicy);
     exceptionToPolicyMap.put(NoRouteToHostException.class, retryPolicy);
     exceptionToPolicyMap.put(UnknownHostException.class, retryPolicy);
+    exceptionToPolicyMap.put(ConnectTimeoutException.class, retryPolicy);
     exceptionToPolicyMap.put(RetriableException.class, retryPolicy);
     exceptionToPolicyMap.put(SocketException.class, retryPolicy);
     exceptionToPolicyMap.put(NMNotYetReadyException.class, retryPolicy);
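The one-line ServerProxy change adds ConnectTimeoutException to the exceptions handled by the proxy-level retry policy, which is what lets the IPC layer's own timeout retries be switched off without losing retry coverage. ServerProxy builds a map like the one above and folds it into a single policy; a minimal sketch of that pattern using the Hadoop retry API (the wrapper class, method name, and parameter values are illustrative):

import java.net.ConnectException;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.TimeUnit;

import org.apache.hadoop.io.retry.RetryPolicies;
import org.apache.hadoop.io.retry.RetryPolicy;
import org.apache.hadoop.net.ConnectTimeoutException;

public class ConnectionRetryPolicySketch {
  public static RetryPolicy build(long maxWaitMs, long retryIntervalMs) {
    // Keep retrying for up to maxWaitMs, sleeping retryIntervalMs between tries.
    RetryPolicy retryPolicy = RetryPolicies.retryUpToMaximumTimeWithFixedSleep(
        maxWaitMs, retryIntervalMs, TimeUnit.MILLISECONDS);

    // Only connection-type exceptions are retried; anything unlisted falls
    // through to the default policy and fails after a single attempt.
    Map<Class<? extends Exception>, RetryPolicy> exceptionToPolicyMap =
        new HashMap<Class<? extends Exception>, RetryPolicy>();
    exceptionToPolicyMap.put(ConnectException.class, retryPolicy);
    exceptionToPolicyMap.put(ConnectTimeoutException.class, retryPolicy);

    return RetryPolicies.retryByException(
        RetryPolicies.TRY_ONCE_THEN_FAIL, exceptionToPolicyMap);
  }
}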
TestNMProxy.java

@@ -21,6 +21,8 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager;
 import java.io.IOException;
 import java.net.InetSocketAddress;
 
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
 import org.apache.hadoop.fs.UnsupportedFileSystemException;
 import org.apache.hadoop.io.retry.UnreliableInterface;
 import org.apache.hadoop.security.SecurityUtil;
@@ -128,7 +130,7 @@ public class TestNMProxy extends BaseContainerManagerTest {
     StartContainersRequest allRequests =
         Records.newRecord(StartContainersRequest.class);
 
-    ContainerManagementProtocol proxy = getNMProxy();
+    ContainerManagementProtocol proxy = getNMProxy(conf);
 
     retryCount = 0;
     shouldThrowNMNotYetReadyException = false;
@@ -159,14 +161,40 @@ public class TestNMProxy extends BaseContainerManagerTest {
     StartContainersRequest allRequests =
         Records.newRecord(StartContainersRequest.class);
 
-    ContainerManagementProtocol proxy = getNMProxy();
+    ContainerManagementProtocol proxy = getNMProxy(conf);
 
     shouldThrowNMNotYetReadyException = false;
     retryCount = 0;
     proxy.startContainers(allRequests);
   }
 
-  private ContainerManagementProtocol getNMProxy() {
+  @Test(timeout = 20000)
+  public void testNMProxyRPCRetry() throws Exception {
+    conf.setLong(YarnConfiguration.CLIENT_NM_CONNECT_MAX_WAIT_MS, 1000);
+    conf.setLong(YarnConfiguration.CLIENT_NM_CONNECT_RETRY_INTERVAL_MS, 100);
+    StartContainersRequest allRequests =
+        Records.newRecord(StartContainersRequest.class);
+    Configuration newConf = new YarnConfiguration(conf);
+    newConf.setInt(
+        CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_MAX_RETRIES_KEY, 100);
+
+    newConf.setInt(CommonConfigurationKeysPublic.
+        IPC_CLIENT_CONNECT_MAX_RETRIES_ON_SOCKET_TIMEOUTS_KEY, 100);
+    // connect to some dummy address so that it can trigger
+    // connection failure and RPC level retries.
+    newConf.set(YarnConfiguration.NM_ADDRESS, "1234");
+    ContainerManagementProtocol proxy = getNMProxy(newConf);
+    try {
+      proxy.startContainers(allRequests);
+      Assert.fail("should get socket exception");
+    } catch (IOException e) {
+      // socket exception should be thrown immediately, without RPC retries.
+      Assert.assertTrue(e.toString().
+          contains("Failed on local exception: java.net.SocketException"));
+    }
+  }
+
+  private ContainerManagementProtocol getNMProxy(Configuration conf) {
     ApplicationId appId = ApplicationId.newInstance(1, 1);
     ApplicationAttemptId attemptId = ApplicationAttemptId.newInstance(appId, 1);
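The new testNMProxyRPCRetry test deliberately raises the IPC retry counts and points the proxy at an invalid NM address ("1234"), then asserts that the SocketException surfaces on the first attempt; if IPC-level retries were still active, the call would spin through them before failing. For callers nothing changes in how a proxy is obtained; a usage sketch, assuming the parameter order visible in the patch (the host/port, timeout values, and wrapper class are illustrative):

import java.net.InetSocketAddress;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.yarn.api.ContainerManagementProtocol;
import org.apache.hadoop.yarn.client.NMProxy;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.ipc.YarnRPC;

public class NMProxyUsageSketch {
  public static ContainerManagementProtocol connect(String nmHostPort)
      throws Exception {
    Configuration conf = new YarnConfiguration();
    // Proxy-level retry budget: total wait and interval between attempts.
    conf.setLong(YarnConfiguration.CLIENT_NM_CONNECT_MAX_WAIT_MS, 60 * 1000);
    conf.setLong(YarnConfiguration.CLIENT_NM_CONNECT_RETRY_INTERVAL_MS, 10 * 1000);

    InetSocketAddress nmAddress = NetUtils.createSocketAddr(nmHostPort);
    // With this patch, NMProxy clones conf internally and disables IPC-level
    // connect retries, so the policy above is the only retry loop in play.
    return NMProxy.createNMProxy(conf, ContainerManagementProtocol.class,
        UserGroupInformation.getCurrentUser(), YarnRPC.create(conf), nmAddress);
  }
}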