YARN-4254. ApplicationAttempt stuck for ever due to UnknownHostException. Contributed by Bibin A Chundatt

This commit is contained in:
Jason Lowe 2018-10-05 15:52:46 -05:00
parent c968365650
commit 9bb2801e8c
4 changed files with 81 additions and 1 deletions

View File

@ -541,7 +541,14 @@ public class YarnConfiguration extends Configuration {
public static final String RM_RESOURCE_TRACKER_CLIENT_THREAD_COUNT =
RM_PREFIX + "resource-tracker.client.thread-count";
public static final int DEFAULT_RM_RESOURCE_TRACKER_CLIENT_THREAD_COUNT = 50;
/**
 * Configuration key that enables checking IP and hostname resolution
 * during nodemanager registration. When the flag is set, the resource
 * tracker rejects the registration of a nodemanager whose hostname cannot
 * be resolved.
 */
public static final String RM_NM_REGISTRATION_IP_HOSTNAME_CHECK_KEY =
RM_PREFIX + "resource-tracker.nm.ip-hostname-check";
/**
 * Default value for {@link #RM_NM_REGISTRATION_IP_HOSTNAME_CHECK_KEY}:
 * the check is disabled.
 */
public static final boolean DEFAULT_RM_NM_REGISTRATION_IP_HOSTNAME_CHECK_KEY =
false;
/** The class to use as the resource scheduler.*/
public static final String RM_SCHEDULER =
RM_PREFIX + "scheduler.class";

View File

@ -264,6 +264,11 @@
<value>${yarn.resourcemanager.hostname}:8031</value>
</property>
<property>
<description>Check IP and hostname resolution during nodemanager
registration. When enabled, the resourcemanager rejects the registration
of a nodemanager whose hostname cannot be resolved and signals it to shut
down.</description>
<name>yarn.resourcemanager.resource-tracker.nm.ip-hostname-check</name>
<value>false</value>
</property>
<property>
<description>Are acls enabled.</description>
<name>yarn.acl.enable</name>

View File

@ -19,6 +19,7 @@ package org.apache.hadoop.yarn.server.resourcemanager;
import java.io.IOException;
import java.io.InputStream;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.nio.ByteBuffer;
import java.util.Arrays;
@ -39,6 +40,7 @@ import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
import org.apache.hadoop.ipc.Server;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.net.Node;
import org.apache.hadoop.security.authorize.PolicyProvider;
import org.apache.hadoop.service.AbstractService;
@ -126,6 +128,7 @@ public class ResourceTrackerService extends AbstractService implements
private DynamicResourceConfiguration drConf;
private final AtomicLong timelineCollectorVersion = new AtomicLong(0);
/**
 * When true, nodemanager registration is rejected if the nodemanager's
 * hostname cannot be resolved.
 */
private boolean checkIpHostnameInRegistration;
public ResourceTrackerService(RMContext rmContext,
NodesListManager nodesListManager,
@ -162,6 +165,9 @@ public class ResourceTrackerService extends AbstractService implements
+ " should be larger than 0.");
}
checkIpHostnameInRegistration = conf.getBoolean(
YarnConfiguration.RM_NM_REGISTRATION_IP_HOSTNAME_CHECK_KEY,
YarnConfiguration.DEFAULT_RM_NM_REGISTRATION_IP_HOSTNAME_CHECK_KEY);
minAllocMb = conf.getInt(
YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_MB,
YarnConfiguration.DEFAULT_RM_SCHEDULER_MINIMUM_ALLOCATION_MB);
@ -350,6 +356,23 @@ public class ResourceTrackerService extends AbstractService implements
}
}
// Optional guard, active only when checkIpHostnameInRegistration is set:
// verify that the hostname reported by the registering nodemanager can be
// resolved before the node is accepted.
if (checkIpHostnameInRegistration) {
// Build the NM address from the reported host and port; the result is
// inspected below via isUnresolved().
InetSocketAddress nmAddress =
NetUtils.createSocketAddrForHost(host, cmPort);
// Remote caller IP, used for the diagnostic message.
// NOTE(review): the check is skipped when this is null - presumably when
// the method is not invoked through an RPC call; confirm against
// Server.getRemoteIp().
InetAddress inetAddress = Server.getRemoteIp();
if (inetAddress != null && nmAddress.isUnresolved()) {
// Reject registration of unresolved nm to prevent resourcemanager
// getting stuck at allocations.
final String message =
"hostname cannot be resolved (ip=" + inetAddress.getHostAddress()
+ ", hostname=" + host + ")";
LOG.warn("Unresolved nodemanager registration: " + message);
response.setDiagnosticsMessage(message);
// Tell the nodemanager to shut down and return without running any of
// the checks that follow.
response.setNodeAction(NodeAction.SHUTDOWN);
return response;
}
}
// Check if this node is a 'valid' node
if (!this.nodesListManager.isValidNode(host) &&
!isNodeInDecommissioning(nodeId)) {

View File

@ -18,7 +18,10 @@
package org.apache.hadoop.yarn.server.resourcemanager;
import org.apache.hadoop.net.ServerSocketUtil;
import org.apache.hadoop.yarn.nodelabels.NodeAttributeStore;
import org.apache.hadoop.yarn.server.api.ResourceTracker;
import org.apache.hadoop.yarn.server.api.ServerRMProxy;
import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.FileSystemNodeAttributeStore;
import static org.mockito.Matchers.any;
import static org.mockito.Mockito.mock;
@ -2402,4 +2405,46 @@ public class TestResourceTrackerService extends NodeLabelTestBase {
Assert.assertEquals(NodeAction.NORMAL, nodeHeartbeat.getNodeAction());
Assert.assertEquals(1, nodeHeartbeat.getResponseId());
}
/**
 * Verifies the hostname-resolution check performed during nodemanager
 * registration when
 * {@link YarnConfiguration#RM_NM_REGISTRATION_IP_HOSTNAME_CHECK_KEY} is
 * enabled: a nodemanager reporting an unresolvable hostname is told to
 * shut down, while one reporting "localhost" registers normally.
 *
 * @throws Exception if the resourcemanager cannot be started or a
 *           registration call fails
 */
@Test
public void testNMIpHostNameResolution() throws Exception {
  Configuration conf = new Configuration();
  conf.set(YarnConfiguration.RM_RESOURCE_TRACKER_ADDRESS,
      "localhost:" + ServerSocketUtil.getPort(10000, 10));
  conf.setBoolean(YarnConfiguration.RM_NM_REGISTRATION_IP_HOSTNAME_CHECK_KEY,
      true);
  // Registration is exercised through an RPC proxy below, so the test plugs
  // in a plain ResourceTrackerService.
  // NOTE(review): presumably MockRM's default tracker does not serve RPC -
  // confirm against MockRM.
  MockRM mockRM = new MockRM(conf) {
    @Override
    protected ResourceTrackerService createResourceTrackerService() {
      return new ResourceTrackerService(getRMContext(), nodesListManager,
          this.nmLivelinessMonitor,
          rmContext.getContainerTokenSecretManager(),
          rmContext.getNMTokenSecretManager());
    }
  };
  mockRM.start();
  try {
    // Failure case: the timestamp suffix yields a hostname that is not
    // expected to resolve.
    ResourceTracker rmTracker = ServerRMProxy.createRMProxy(
        mockRM.getConfig(), ResourceTracker.class);
    RegisterNodeManagerResponse response = rmTracker.registerNodeManager(
        RegisterNodeManagerRequest.newInstance(
            NodeId.newInstance("host1" + System.currentTimeMillis(), 1234),
            1236, Resource.newInstance(10000, 10), "2", new ArrayList<>(),
            new ArrayList<>()));
    Assert.assertEquals("Shutdown signal should be received",
        NodeAction.SHUTDOWN, response.getNodeAction());
    Assert.assertTrue("Diagnostic Message", response.getDiagnosticsMessage()
        .contains("hostname cannot be resolved "));

    // Test success
    rmTracker = ServerRMProxy.createRMProxy(
        mockRM.getConfig(), ResourceTracker.class);
    response = rmTracker.registerNodeManager(
        RegisterNodeManagerRequest.newInstance(
            NodeId.newInstance("localhost", 1234), 1236,
            Resource.newInstance(10000, 10), "2", new ArrayList<>(),
            new ArrayList<>()));
    Assert.assertEquals("Successfull registration", NodeAction.NORMAL,
        response.getNodeAction());
  } finally {
    // Always stop the RM so a failed assertion does not leave it running
    // with its RPC port still bound.
    mockRM.stop();
  }
}
}