diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index 2fb50759e2e..d4f23b6f2d2 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -505,6 +505,9 @@ Release 2.8.0 - UNRELEASED YARN-3790. usedResource from rootQueue metrics may get stale data for FS scheduler after recovering the container (Zhihai Xu via rohithsharmaks) + YARN-3826. Race condition in ResourceTrackerService leads to + wrong diagnostics messages. (Chengbing Liu via devaraj) + Release 2.7.1 - UNRELEASED INCOMPATIBLE CHANGES diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/utils/YarnServerBuilderUtils.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/utils/YarnServerBuilderUtils.java index 8bdff6291d6..f3331855597 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/utils/YarnServerBuilderUtils.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/utils/YarnServerBuilderUtils.java @@ -22,13 +22,11 @@ import java.util.List; import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.ContainerId; -import org.apache.hadoop.yarn.api.records.SerializedException; import org.apache.hadoop.yarn.factories.RecordFactory; import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider; import org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse; import org.apache.hadoop.yarn.server.api.records.MasterKey; import org.apache.hadoop.yarn.server.api.records.NodeAction; -import org.apache.hadoop.yarn.util.Records; /** * Server Builder utilities to construct various objects. @@ -39,6 +37,15 @@ public class YarnServerBuilderUtils { private static final RecordFactory recordFactory = RecordFactoryProvider .getRecordFactory(null); + public static NodeHeartbeatResponse newNodeHeartbeatResponse( + NodeAction action, String diagnosticsMessage) { + NodeHeartbeatResponse response = recordFactory + .newRecordInstance(NodeHeartbeatResponse.class); + response.setNodeAction(action); + response.setDiagnosticsMessage(diagnosticsMessage); + return response; + } + public static NodeHeartbeatResponse newNodeHeartbeatResponse(int responseId, NodeAction action, List containersToCleanUp, List applicationsToCleanUp, diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java index aa372549817..3c2c09b8c1e 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java @@ -100,22 +100,11 @@ public class ResourceTrackerService extends AbstractService implements private InetSocketAddress resourceTrackerAddress; private String minimumNodeManagerVersion; - private static final NodeHeartbeatResponse resync = recordFactory - .newRecordInstance(NodeHeartbeatResponse.class); - private static final NodeHeartbeatResponse shutDown = recordFactory - .newRecordInstance(NodeHeartbeatResponse.class); - private int minAllocMb; private int minAllocVcores; private boolean isDistributedNodeLabelsConf; - static { - resync.setNodeAction(NodeAction.RESYNC); - - shutDown.setNodeAction(NodeAction.SHUTDOWN); - } - public ResourceTrackerService(RMContext rmContext, NodesListManager nodesListManager, NMLivelinessMonitor nmLivelinessMonitor, @@ -414,8 +403,8 @@ public class ResourceTrackerService extends AbstractService implements "Disallowed NodeManager nodeId: " + nodeId + " hostname: " + nodeId.getHost(); LOG.info(message); - shutDown.setDiagnosticsMessage(message); - return shutDown; + return YarnServerBuilderUtils.newNodeHeartbeatResponse( + NodeAction.SHUTDOWN, message); } // 2. Check if it's a registered node @@ -424,8 +413,8 @@ public class ResourceTrackerService extends AbstractService implements /* node does not exist */ String message = "Node not found resyncing " + remoteNodeStatus.getNodeId(); LOG.info(message); - resync.setDiagnosticsMessage(message); - return resync; + return YarnServerBuilderUtils.newNodeHeartbeatResponse(NodeAction.RESYNC, + message); } // Send ping @@ -445,11 +434,11 @@ public class ResourceTrackerService extends AbstractService implements + lastNodeHeartbeatResponse.getResponseId() + " nm response id:" + remoteNodeStatus.getResponseId(); LOG.info(message); - resync.setDiagnosticsMessage(message); // TODO: Just sending reboot is not enough. Think more. this.rmContext.getDispatcher().getEventHandler().handle( new RMNodeEvent(nodeId, RMNodeEventType.REBOOTING)); - return resync; + return YarnServerBuilderUtils.newNodeHeartbeatResponse(NodeAction.RESYNC, + message); } // Heartbeat response