YARN-4254. ApplicationAttempt stuck for ever due to UnknownHostException. Contributed by Bibin A Chundatt
This commit is contained in:
parent
c968365650
commit
9bb2801e8c
|
@ -541,7 +541,14 @@ public class YarnConfiguration extends Configuration {
|
|||
public static final String RM_RESOURCE_TRACKER_CLIENT_THREAD_COUNT =
|
||||
RM_PREFIX + "resource-tracker.client.thread-count";
|
||||
public static final int DEFAULT_RM_RESOURCE_TRACKER_CLIENT_THREAD_COUNT = 50;
|
||||
|
||||
|
||||
/**
 * Whether the ResourceManager verifies, during NodeManager registration,
 * that the NodeManager's hostname can be resolved.
 */
|
||||
public static final String RM_NM_REGISTRATION_IP_HOSTNAME_CHECK_KEY =
|
||||
RM_PREFIX + "resource-tracker.nm.ip-hostname-check";
|
||||
|
||||
public static final boolean DEFAULT_RM_NM_REGISTRATION_IP_HOSTNAME_CHECK_KEY =
|
||||
false;
|
||||
|
||||
/** The class to use as the resource scheduler.*/
|
||||
public static final String RM_SCHEDULER =
|
||||
RM_PREFIX + "scheduler.class";
|
||||
|
|
|
@ -264,6 +264,11 @@
|
|||
<value>${yarn.resourcemanager.hostname}:8031</value>
|
||||
</property>
|
||||
|
||||
<property>
  <description>Whether the ResourceManager checks, during NodeManager
  registration, that the NodeManager's hostname can be resolved. When
  enabled, a NodeManager whose hostname cannot be resolved is rejected
  and told to shut down.</description>
  <name>yarn.resourcemanager.resource-tracker.nm.ip-hostname-check</name>
  <value>false</value>
</property>
|
||||
|
||||
<property>
|
||||
<description>Are acls enabled.</description>
|
||||
<name>yarn.acl.enable</name>
|
||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.hadoop.yarn.server.resourcemanager;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.net.InetAddress;
|
||||
import java.net.InetSocketAddress;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.util.Arrays;
|
||||
|
@ -39,6 +40,7 @@ import org.apache.commons.logging.LogFactory;
|
|||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
|
||||
import org.apache.hadoop.ipc.Server;
|
||||
import org.apache.hadoop.net.NetUtils;
|
||||
import org.apache.hadoop.net.Node;
|
||||
import org.apache.hadoop.security.authorize.PolicyProvider;
|
||||
import org.apache.hadoop.service.AbstractService;
|
||||
|
@ -126,6 +128,7 @@ public class ResourceTrackerService extends AbstractService implements
|
|||
private DynamicResourceConfiguration drConf;
|
||||
|
||||
private final AtomicLong timelineCollectorVersion = new AtomicLong(0);
|
||||
private boolean checkIpHostnameInRegistration;
|
||||
|
||||
public ResourceTrackerService(RMContext rmContext,
|
||||
NodesListManager nodesListManager,
|
||||
|
@ -162,6 +165,9 @@ public class ResourceTrackerService extends AbstractService implements
|
|||
+ " should be larger than 0.");
|
||||
}
|
||||
|
||||
checkIpHostnameInRegistration = conf.getBoolean(
|
||||
YarnConfiguration.RM_NM_REGISTRATION_IP_HOSTNAME_CHECK_KEY,
|
||||
YarnConfiguration.DEFAULT_RM_NM_REGISTRATION_IP_HOSTNAME_CHECK_KEY);
|
||||
minAllocMb = conf.getInt(
|
||||
YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_MB,
|
||||
YarnConfiguration.DEFAULT_RM_SCHEDULER_MINIMUM_ALLOCATION_MB);
|
||||
|
@ -350,6 +356,23 @@ public class ResourceTrackerService extends AbstractService implements
|
|||
}
|
||||
}
|
||||
|
||||
if (checkIpHostnameInRegistration) {
|
||||
InetSocketAddress nmAddress =
|
||||
NetUtils.createSocketAddrForHost(host, cmPort);
|
||||
InetAddress inetAddress = Server.getRemoteIp();
|
||||
if (inetAddress != null && nmAddress.isUnresolved()) {
|
||||
// Reject registration of unresolved nm to prevent resourcemanager
|
||||
// getting stuck at allocations.
|
||||
final String message =
|
||||
"hostname cannot be resolved (ip=" + inetAddress.getHostAddress()
|
||||
+ ", hostname=" + host + ")";
|
||||
LOG.warn("Unresolved nodemanager registration: " + message);
|
||||
response.setDiagnosticsMessage(message);
|
||||
response.setNodeAction(NodeAction.SHUTDOWN);
|
||||
return response;
|
||||
}
|
||||
}
|
||||
|
||||
// Check if this node is a 'valid' node
|
||||
if (!this.nodesListManager.isValidNode(host) &&
|
||||
!isNodeInDecommissioning(nodeId)) {
|
||||
|
|
|
@ -18,7 +18,10 @@
|
|||
|
||||
package org.apache.hadoop.yarn.server.resourcemanager;
|
||||
|
||||
import org.apache.hadoop.net.ServerSocketUtil;
|
||||
import org.apache.hadoop.yarn.nodelabels.NodeAttributeStore;
|
||||
import org.apache.hadoop.yarn.server.api.ResourceTracker;
|
||||
import org.apache.hadoop.yarn.server.api.ServerRMProxy;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.FileSystemNodeAttributeStore;
|
||||
import static org.mockito.Matchers.any;
|
||||
import static org.mockito.Mockito.mock;
|
||||
|
@ -2402,4 +2405,46 @@ public class TestResourceTrackerService extends NodeLabelTestBase {
|
|||
Assert.assertEquals(NodeAction.NORMAL, nodeHeartbeat.getNodeAction());
|
||||
Assert.assertEquals(1, nodeHeartbeat.getResponseId());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNMIpHostNameResolution() throws Exception {
|
||||
Configuration conf = new Configuration();
|
||||
conf.set(YarnConfiguration.RM_RESOURCE_TRACKER_ADDRESS,
|
||||
"localhost:" + ServerSocketUtil.getPort(10000, 10));
|
||||
conf.setBoolean(YarnConfiguration.RM_NM_REGISTRATION_IP_HOSTNAME_CHECK_KEY,
|
||||
true);
|
||||
MockRM mockRM = new MockRM(conf) {
|
||||
@Override
|
||||
protected ResourceTrackerService createResourceTrackerService() {
|
||||
return new ResourceTrackerService(getRMContext(), nodesListManager,
|
||||
this.nmLivelinessMonitor,
|
||||
rmContext.getContainerTokenSecretManager(),
|
||||
rmContext.getNMTokenSecretManager()) {
|
||||
};
|
||||
}
|
||||
};
|
||||
mockRM.start();
|
||||
ResourceTracker rmTracker =
|
||||
ServerRMProxy.createRMProxy(mockRM.getConfig(), ResourceTracker.class);
|
||||
RegisterNodeManagerResponse response = rmTracker.registerNodeManager(
|
||||
RegisterNodeManagerRequest.newInstance(
|
||||
NodeId.newInstance("host1" + System.currentTimeMillis(), 1234),
|
||||
1236, Resource.newInstance(10000, 10), "2", new ArrayList<>(),
|
||||
new ArrayList<>()));
|
||||
Assert
|
||||
.assertEquals("Shutdown signal should be received", NodeAction.SHUTDOWN,
|
||||
response.getNodeAction());
|
||||
Assert.assertTrue("Diagnostic Message", response.getDiagnosticsMessage()
|
||||
.contains("hostname cannot be resolved "));
|
||||
// Test success
|
||||
rmTracker =
|
||||
ServerRMProxy.createRMProxy(mockRM.getConfig(), ResourceTracker.class);
|
||||
response = rmTracker.registerNodeManager(RegisterNodeManagerRequest
|
||||
.newInstance(NodeId.newInstance("localhost", 1234), 1236,
|
||||
Resource.newInstance(10000, 10), "2", new ArrayList<>(),
|
||||
new ArrayList<>()));
|
||||
Assert.assertEquals("Successfull registration", NodeAction.NORMAL,
|
||||
response.getNodeAction());
|
||||
mockRM.stop();
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue