YARN-309. Changed NodeManager to obtain heart-beat interval from the ResourceManager. Contributed by Xuan Gong.
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1463346 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
806073867e
commit
7ca9fe73a0
|
@ -109,6 +109,9 @@ Release 2.0.5-beta - UNRELEASED
|
|||
YARN-475. Remove a unused constant in the public API -
|
||||
ApplicationConstants.AM_APP_ATTEMPT_ID_ENV. (Hitesh Shah via vinodkv)
|
||||
|
||||
YARN-309. Changed NodeManager to obtain heart-beat interval from the
|
||||
ResourceManager. (Xuan Gong via vinodkv)
|
||||
|
||||
OPTIMIZATIONS
|
||||
|
||||
BUG FIXES
|
||||
|
|
|
@ -219,6 +219,11 @@ public class YarnConfiguration extends Configuration {
|
|||
public static final String DEFAULT_RM_SCHEDULER =
|
||||
"org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler";
|
||||
|
||||
/** RM set next Heartbeat interval for NM */
|
||||
public static final String RM_NM_HEARTBEAT_INTERVAL_MS =
|
||||
RM_PREFIX + "nodemanagers.heartbeat-interval-ms";
|
||||
public static final long DEFAULT_RM_NM_HEARTBEAT_INTERVAL_MS = 1000;
|
||||
|
||||
//Delegation token related keys
|
||||
public static final String DELEGATION_KEY_UPDATE_INTERVAL_KEY =
|
||||
RM_PREFIX + "delegation.key.update-interval";
|
||||
|
@ -329,12 +334,6 @@ public class YarnConfiguration extends Configuration {
|
|||
NM_PREFIX + "delete.thread-count";
|
||||
public static final int DEFAULT_NM_DELETE_THREAD_COUNT = 4;
|
||||
|
||||
// TODO: Should this instead be dictated by RM?
|
||||
/** Heartbeat interval to RM*/
|
||||
public static final String NM_TO_RM_HEARTBEAT_INTERVAL_MS =
|
||||
NM_PREFIX + "heartbeat.interval-ms";
|
||||
public static final int DEFAULT_NM_TO_RM_HEARTBEAT_INTERVAL_MS = 1000;
|
||||
|
||||
/** Keytab for NM.*/
|
||||
public static final String NM_KEYTAB = NM_PREFIX + "keytab";
|
||||
|
||||
|
|
|
@ -278,6 +278,12 @@
|
|||
<value>86400</value>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<description>The heart-beat interval in milliseconds for every NodeManager in the cluster.</description>
|
||||
<name>yarn.resourcemanager.nodemanagers.heartbeat-interval-ms</name>
|
||||
<value>1000</value>
|
||||
</property>
|
||||
|
||||
<!-- Node Manager Configs -->
|
||||
<property>
|
||||
<description>The address of the container manager in the NM.</description>
|
||||
|
@ -336,12 +342,6 @@
|
|||
<value>0</value>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<description>Heartbeat interval to RM</description>
|
||||
<name>yarn.nodemanager.heartbeat.interval-ms</name>
|
||||
<value>1000</value>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<description>Keytab for NM.</description>
|
||||
<name>yarn.nodemanager.keytab</name>
|
||||
|
|
|
@ -42,4 +42,7 @@ public interface NodeHeartbeatResponse {
|
|||
void addAllContainersToCleanup(List<ContainerId> containers);
|
||||
|
||||
void addAllApplicationsToCleanup(List<ApplicationId> applications);
|
||||
|
||||
long getNextHeartBeatInterval();
|
||||
void setNextHeartBeatInterval(long nextHeartBeatInterval);
|
||||
}
|
||||
|
|
|
@ -271,6 +271,18 @@ public class NodeHeartbeatResponsePBImpl extends ProtoBase<NodeHeartbeatResponse
|
|||
builder.addAllApplicationsToCleanup(iterable);
|
||||
}
|
||||
|
||||
@Override
|
||||
public long getNextHeartBeatInterval() {
|
||||
NodeHeartbeatResponseProtoOrBuilder p = viaProto ? proto : builder;
|
||||
return (p.getNextHeartBeatInterval());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setNextHeartBeatInterval(long nextHeartBeatInterval) {
|
||||
maybeInitBuilder();
|
||||
builder.setNextHeartBeatInterval(nextHeartBeatInterval);
|
||||
}
|
||||
|
||||
private ContainerIdPBImpl convertFromProtoFormat(ContainerIdProto p) {
|
||||
return new ContainerIdPBImpl(p);
|
||||
}
|
||||
|
|
|
@ -0,0 +1,58 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.server.utils;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
||||
import org.apache.hadoop.yarn.api.records.ContainerId;
|
||||
import org.apache.hadoop.yarn.factories.RecordFactory;
|
||||
import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
|
||||
import org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse;
|
||||
import org.apache.hadoop.yarn.server.api.records.MasterKey;
|
||||
import org.apache.hadoop.yarn.server.api.records.NodeAction;
|
||||
|
||||
/**
|
||||
* Server Builder utilities to construct various objects.
|
||||
*
|
||||
*/
|
||||
public class YarnServerBuilderUtils {
|
||||
|
||||
private static final RecordFactory recordFactory = RecordFactoryProvider
|
||||
.getRecordFactory(null);
|
||||
|
||||
public static NodeHeartbeatResponse newNodeHeartbeatResponse(int responseId,
|
||||
NodeAction action, List<ContainerId> containersToCleanUp,
|
||||
List<ApplicationId> applicationsToCleanUp,
|
||||
MasterKey masterKey, long nextHeartbeatInterval) {
|
||||
NodeHeartbeatResponse response = recordFactory
|
||||
.newRecordInstance(NodeHeartbeatResponse.class);
|
||||
response.setResponseId(responseId);
|
||||
response.setNodeAction(action);
|
||||
response.setMasterKey(masterKey);
|
||||
response.setNextHeartBeatInterval(nextHeartbeatInterval);
|
||||
if(containersToCleanUp != null) {
|
||||
response.addAllContainersToCleanup(containersToCleanUp);
|
||||
}
|
||||
if(applicationsToCleanUp != null) {
|
||||
response.addAllApplicationsToCleanup(applicationsToCleanUp);
|
||||
}
|
||||
return response;
|
||||
}
|
||||
}
|
|
@ -47,4 +47,5 @@ message NodeHeartbeatResponseProto {
|
|||
optional NodeActionProto nodeAction = 3;
|
||||
repeated ContainerIdProto containers_to_cleanup = 4;
|
||||
repeated ApplicationIdProto applications_to_cleanup = 5;
|
||||
optional int64 nextHeartBeatInterval = 6;
|
||||
}
|
||||
|
|
|
@ -71,7 +71,7 @@ public class NodeStatusUpdaterImpl extends AbstractService implements
|
|||
private final Dispatcher dispatcher;
|
||||
|
||||
private NodeId nodeId;
|
||||
private long heartBeatInterval;
|
||||
private long nextHeartBeatInterval;
|
||||
private ResourceTracker resourceTracker;
|
||||
private InetSocketAddress rmAddress;
|
||||
private Resource totalResource;
|
||||
|
@ -103,9 +103,7 @@ public class NodeStatusUpdaterImpl extends AbstractService implements
|
|||
YarnConfiguration.RM_RESOURCE_TRACKER_ADDRESS,
|
||||
YarnConfiguration.DEFAULT_RM_RESOURCE_TRACKER_ADDRESS,
|
||||
YarnConfiguration.DEFAULT_RM_RESOURCE_TRACKER_PORT);
|
||||
this.heartBeatInterval =
|
||||
conf.getLong(YarnConfiguration.NM_TO_RM_HEARTBEAT_INTERVAL_MS,
|
||||
YarnConfiguration.DEFAULT_NM_TO_RM_HEARTBEAT_INTERVAL_MS);
|
||||
|
||||
int memoryMb =
|
||||
conf.getInt(
|
||||
YarnConfiguration.NM_PMEM_MB, YarnConfiguration.DEFAULT_NM_PMEM_MB);
|
||||
|
@ -394,9 +392,6 @@ public class NodeStatusUpdaterImpl extends AbstractService implements
|
|||
while (!isStopped) {
|
||||
// Send heartbeat
|
||||
try {
|
||||
synchronized (heartbeatMonitor) {
|
||||
heartbeatMonitor.wait(heartBeatInterval);
|
||||
}
|
||||
NodeStatus nodeStatus = getNodeStatus();
|
||||
nodeStatus.setResponseId(lastHeartBeatID);
|
||||
|
||||
|
@ -409,7 +404,8 @@ public class NodeStatusUpdaterImpl extends AbstractService implements
|
|||
}
|
||||
NodeHeartbeatResponse response =
|
||||
resourceTracker.nodeHeartbeat(request);
|
||||
|
||||
//get next heartbeat interval from response
|
||||
nextHeartBeatInterval = response.getNextHeartBeatInterval();
|
||||
// See if the master-key has rolled over
|
||||
if (isSecurityEnabled()) {
|
||||
MasterKey updatedMasterKey = response.getMasterKey();
|
||||
|
@ -456,6 +452,17 @@ public class NodeStatusUpdaterImpl extends AbstractService implements
|
|||
// TODO Better error handling. Thread can die with the rest of the
|
||||
// NM still running.
|
||||
LOG.error("Caught exception in status-updater", e);
|
||||
} finally {
|
||||
synchronized (heartbeatMonitor) {
|
||||
nextHeartBeatInterval = nextHeartBeatInterval <= 0 ?
|
||||
YarnConfiguration.DEFAULT_RM_NM_HEARTBEAT_INTERVAL_MS :
|
||||
nextHeartBeatInterval;
|
||||
try {
|
||||
heartbeatMonitor.wait(nextHeartBeatInterval);
|
||||
} catch (InterruptedException e) {
|
||||
// Do Nothing
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -31,6 +31,7 @@ import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerRequ
|
|||
import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerResponse;
|
||||
import org.apache.hadoop.yarn.server.api.records.NodeStatus;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
|
||||
import org.apache.hadoop.yarn.server.utils.YarnServerBuilderUtils;
|
||||
|
||||
/**
|
||||
* This class allows a node manager to run without without communicating with a
|
||||
|
@ -73,9 +74,9 @@ public class MockNodeStatusUpdater extends NodeStatusUpdaterImpl {
|
|||
LOG.info("Got heartbeat number " + heartBeatID);
|
||||
nodeStatus.setResponseId(heartBeatID++);
|
||||
|
||||
NodeHeartbeatResponse nhResponse = recordFactory
|
||||
.newRecordInstance(NodeHeartbeatResponse.class);
|
||||
nhResponse.setResponseId(heartBeatID);
|
||||
NodeHeartbeatResponse nhResponse = YarnServerBuilderUtils
|
||||
.newNodeHeartbeatResponse(heartBeatID, null, null,
|
||||
null, null, 1000L);
|
||||
return nhResponse;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -64,6 +64,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Cont
|
|||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
|
||||
import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
|
||||
import org.apache.hadoop.yarn.server.utils.YarnServerBuilderUtils;
|
||||
import org.apache.hadoop.yarn.service.Service;
|
||||
import org.apache.hadoop.yarn.service.Service.STATE;
|
||||
import org.apache.hadoop.yarn.util.BuilderUtils;
|
||||
|
@ -214,9 +215,8 @@ public class TestNodeStatusUpdater {
|
|||
Assert.assertEquals(2, activeContainers.size());
|
||||
}
|
||||
|
||||
NodeHeartbeatResponse nhResponse = recordFactory
|
||||
.newRecordInstance(NodeHeartbeatResponse.class);
|
||||
nhResponse.setResponseId(heartBeatID);
|
||||
NodeHeartbeatResponse nhResponse = YarnServerBuilderUtils.
|
||||
newNodeHeartbeatResponse(heartBeatID, null, null, null, null, 1000L);
|
||||
return nhResponse;
|
||||
}
|
||||
}
|
||||
|
@ -325,10 +325,9 @@ public class TestNodeStatusUpdater {
|
|||
NodeStatus nodeStatus = request.getNodeStatus();
|
||||
nodeStatus.setResponseId(heartBeatID++);
|
||||
|
||||
NodeHeartbeatResponse nhResponse = recordFactory
|
||||
.newRecordInstance(NodeHeartbeatResponse.class);
|
||||
nhResponse.setResponseId(heartBeatID);
|
||||
nhResponse.setNodeAction(heartBeatNodeAction);
|
||||
NodeHeartbeatResponse nhResponse = YarnServerBuilderUtils.
|
||||
newNodeHeartbeatResponse(heartBeatID, heartBeatNodeAction, null,
|
||||
null, null, 1000L);
|
||||
return nhResponse;
|
||||
}
|
||||
}
|
||||
|
@ -361,10 +360,9 @@ public class TestNodeStatusUpdater {
|
|||
LOG.info("Got heartBeatId: [" + heartBeatID +"]");
|
||||
NodeStatus nodeStatus = request.getNodeStatus();
|
||||
nodeStatus.setResponseId(heartBeatID++);
|
||||
NodeHeartbeatResponse nhResponse =
|
||||
recordFactory.newRecordInstance(NodeHeartbeatResponse.class);
|
||||
nhResponse.setResponseId(heartBeatID);
|
||||
nhResponse.setNodeAction(heartBeatNodeAction);
|
||||
NodeHeartbeatResponse nhResponse = YarnServerBuilderUtils.
|
||||
newNodeHeartbeatResponse(heartBeatID, heartBeatNodeAction, null,
|
||||
null, null, 1000L);
|
||||
|
||||
if (nodeStatus.getKeepAliveApplications() != null
|
||||
&& nodeStatus.getKeepAliveApplications().size() > 0) {
|
||||
|
|
|
@ -27,6 +27,7 @@ import org.apache.hadoop.ipc.Server;
|
|||
import org.apache.hadoop.net.Node;
|
||||
import org.apache.hadoop.security.UserGroupInformation;
|
||||
import org.apache.hadoop.security.authorize.PolicyProvider;
|
||||
import org.apache.hadoop.yarn.YarnException;
|
||||
import org.apache.hadoop.yarn.api.records.NodeId;
|
||||
import org.apache.hadoop.yarn.api.records.Resource;
|
||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||
|
@ -51,6 +52,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeReconnectEvent
|
|||
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeStatusEvent;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.security.RMContainerTokenSecretManager;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.security.authorize.RMPolicyProvider;
|
||||
import org.apache.hadoop.yarn.server.utils.YarnServerBuilderUtils;
|
||||
import org.apache.hadoop.yarn.service.AbstractService;
|
||||
import org.apache.hadoop.yarn.util.RackResolver;
|
||||
|
||||
|
@ -67,6 +69,7 @@ public class ResourceTrackerService extends AbstractService implements
|
|||
private final NMLivelinessMonitor nmLivelinessMonitor;
|
||||
private final RMContainerTokenSecretManager containerTokenSecretManager;
|
||||
|
||||
private long nextHeartBeatInterval;
|
||||
private Server server;
|
||||
private InetSocketAddress resourceTrackerAddress;
|
||||
|
||||
|
@ -100,6 +103,14 @@ public class ResourceTrackerService extends AbstractService implements
|
|||
YarnConfiguration.DEFAULT_RM_RESOURCE_TRACKER_PORT);
|
||||
|
||||
RackResolver.init(conf);
|
||||
nextHeartBeatInterval =
|
||||
conf.getLong(YarnConfiguration.RM_NM_HEARTBEAT_INTERVAL_MS,
|
||||
YarnConfiguration.DEFAULT_RM_NM_HEARTBEAT_INTERVAL_MS);
|
||||
if (nextHeartBeatInterval <= 0) {
|
||||
throw new YarnException("Invalid Configuration. "
|
||||
+ YarnConfiguration.RM_NM_HEARTBEAT_INTERVAL_MS
|
||||
+ " should be larger than 0.");
|
||||
}
|
||||
super.init(conf);
|
||||
}
|
||||
|
||||
|
@ -224,9 +235,6 @@ public class ResourceTrackerService extends AbstractService implements
|
|||
return shutDown;
|
||||
}
|
||||
|
||||
NodeHeartbeatResponse nodeHeartBeatResponse = recordFactory
|
||||
.newRecordInstance(NodeHeartbeatResponse.class);
|
||||
|
||||
// 3. Check if it's a 'fresh' heartbeat i.e. not duplicate heartbeat
|
||||
NodeHeartbeatResponse lastNodeHeartbeatResponse = rmNode.getLastNodeHeartBeatResponse();
|
||||
if (remoteNodeStatus.getResponseId() + 1 == lastNodeHeartbeatResponse
|
||||
|
@ -246,10 +254,11 @@ public class ResourceTrackerService extends AbstractService implements
|
|||
}
|
||||
|
||||
// Heartbeat response
|
||||
nodeHeartBeatResponse.setResponseId(lastNodeHeartbeatResponse.getResponseId() + 1);
|
||||
NodeHeartbeatResponse nodeHeartBeatResponse = YarnServerBuilderUtils
|
||||
.newNodeHeartbeatResponse(lastNodeHeartbeatResponse.
|
||||
getResponseId() + 1, NodeAction.NORMAL, null, null, null,
|
||||
nextHeartBeatInterval);
|
||||
rmNode.updateNodeHeartbeatResponseForCleanup(nodeHeartBeatResponse);
|
||||
nodeHeartBeatResponse.setNodeAction(NodeAction.NORMAL);
|
||||
|
||||
// Check if node's masterKey needs to be updated and if the currentKey has
|
||||
// roller over, send it across
|
||||
if (isSecurityEnabled()) {
|
||||
|
|
|
@ -53,6 +53,29 @@ public class TestResourceTrackerService {
|
|||
private File hostFile = new File(TEMP_DIR + File.separator + "hostFile.txt");
|
||||
private MockRM rm;
|
||||
|
||||
/**
|
||||
* Test RM read NM next heartBeat Interval correctly from Configuration file,
|
||||
* and NM get next heartBeat Interval from RM correctly
|
||||
*/
|
||||
@Test (timeout = 5000)
|
||||
public void testGetNextHeartBeatInterval() throws Exception {
|
||||
Configuration conf = new Configuration();
|
||||
conf.set(YarnConfiguration.RM_NM_HEARTBEAT_INTERVAL_MS, "4000");
|
||||
|
||||
rm = new MockRM(conf);
|
||||
rm.start();
|
||||
|
||||
MockNM nm1 = rm.registerNode("host1:1234", 5120);
|
||||
MockNM nm2 = rm.registerNode("host2:5678", 10240);
|
||||
|
||||
NodeHeartbeatResponse nodeHeartbeat = nm1.nodeHeartbeat(true);
|
||||
Assert.assertEquals(4000, nodeHeartbeat.getNextHeartBeatInterval());
|
||||
|
||||
NodeHeartbeatResponse nodeHeartbeat2 = nm2.nodeHeartbeat(true);
|
||||
Assert.assertEquals(4000, nodeHeartbeat2.getNextHeartBeatInterval());
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Decommissioning using a pre-configured include hosts file
|
||||
*/
|
||||
|
|
Loading…
Reference in New Issue