YARN-6849. NMContainerStatus should have the Container ExecutionType. (Kartheek Muthyala via asuresh)

This commit is contained in:
Arun Suresh 2017-09-08 09:24:05 -07:00
parent 4a83170be4
commit 1f53ae7972
12 changed files with 178 additions and 9 deletions

View File

@ -21,6 +21,7 @@ package org.apache.hadoop.yarn.server.api.protocolrecords;
import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerState; import org.apache.hadoop.yarn.api.records.ContainerState;
import org.apache.hadoop.yarn.api.records.ContainerStatus; import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.api.records.ExecutionType;
import org.apache.hadoop.yarn.api.records.Priority; import org.apache.hadoop.yarn.api.records.Priority;
import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.nodelabels.CommonNodeLabelsManager; import org.apache.hadoop.yarn.nodelabels.CommonNodeLabelsManager;
@ -40,13 +41,14 @@ public abstract class NMContainerStatus {
long creationTime) { long creationTime) {
return newInstance(containerId, version, containerState, allocatedResource, return newInstance(containerId, version, containerState, allocatedResource,
diagnostics, containerExitStatus, priority, creationTime, diagnostics, containerExitStatus, priority, creationTime,
CommonNodeLabelsManager.NO_LABEL); CommonNodeLabelsManager.NO_LABEL, ExecutionType.GUARANTEED);
} }
public static NMContainerStatus newInstance(ContainerId containerId, public static NMContainerStatus newInstance(ContainerId containerId,
int version, ContainerState containerState, Resource allocatedResource, int version, ContainerState containerState, Resource allocatedResource,
String diagnostics, int containerExitStatus, Priority priority, String diagnostics, int containerExitStatus, Priority priority,
long creationTime, String nodeLabelExpression) { long creationTime, String nodeLabelExpression,
ExecutionType executionType) {
NMContainerStatus status = NMContainerStatus status =
Records.newRecord(NMContainerStatus.class); Records.newRecord(NMContainerStatus.class);
status.setContainerId(containerId); status.setContainerId(containerId);
@ -58,6 +60,7 @@ public abstract class NMContainerStatus {
status.setPriority(priority); status.setPriority(priority);
status.setCreationTime(creationTime); status.setCreationTime(creationTime);
status.setNodeLabelExpression(nodeLabelExpression); status.setNodeLabelExpression(nodeLabelExpression);
status.setExecutionType(executionType);
return status; return status;
} }
@ -134,4 +137,14 @@ public abstract class NMContainerStatus {
public void setVersion(int version) { public void setVersion(int version) {
} }
/**
* Get the <code>ExecutionType</code> of the container.
* @return <code>ExecutionType</code> of the container
*/
public ExecutionType getExecutionType() {
return ExecutionType.GUARANTEED;
}
public void setExecutionType(ExecutionType executionType) { }
} }

View File

@ -20,6 +20,7 @@ package org.apache.hadoop.yarn.server.api.protocolrecords.impl.pb;
import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerState; import org.apache.hadoop.yarn.api.records.ContainerState;
import org.apache.hadoop.yarn.api.records.ExecutionType;
import org.apache.hadoop.yarn.api.records.Priority; import org.apache.hadoop.yarn.api.records.Priority;
import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.api.records.impl.pb.ContainerIdPBImpl; import org.apache.hadoop.yarn.api.records.impl.pb.ContainerIdPBImpl;
@ -27,6 +28,7 @@ import org.apache.hadoop.yarn.api.records.impl.pb.PriorityPBImpl;
import org.apache.hadoop.yarn.api.records.impl.pb.ProtoUtils; import org.apache.hadoop.yarn.api.records.impl.pb.ProtoUtils;
import org.apache.hadoop.yarn.api.records.impl.pb.ResourcePBImpl; import org.apache.hadoop.yarn.api.records.impl.pb.ResourcePBImpl;
import org.apache.hadoop.yarn.nodelabels.CommonNodeLabelsManager; import org.apache.hadoop.yarn.nodelabels.CommonNodeLabelsManager;
import org.apache.hadoop.yarn.proto.YarnProtos;
import org.apache.hadoop.yarn.proto.YarnProtos.ContainerIdProto; import org.apache.hadoop.yarn.proto.YarnProtos.ContainerIdProto;
import org.apache.hadoop.yarn.proto.YarnProtos.ContainerStateProto; import org.apache.hadoop.yarn.proto.YarnProtos.ContainerStateProto;
import org.apache.hadoop.yarn.proto.YarnProtos.PriorityProto; import org.apache.hadoop.yarn.proto.YarnProtos.PriorityProto;
@ -249,6 +251,25 @@ public class NMContainerStatusPBImpl extends NMContainerStatus {
builder.setNodeLabelExpression(nodeLabelExpression); builder.setNodeLabelExpression(nodeLabelExpression);
} }
@Override
public synchronized ExecutionType getExecutionType() {
NMContainerStatusProtoOrBuilder p = viaProto ? proto : builder;
if (!p.hasExecutionType()) {
return ExecutionType.GUARANTEED;
}
return convertFromProtoFormat(p.getExecutionType());
}
@Override
public synchronized void setExecutionType(ExecutionType executionType) {
maybeInitBuilder();
if (executionType == null) {
builder.clearExecutionType();
return;
}
builder.setExecutionType(convertToProtoFormat(executionType));
}
private void mergeLocalToBuilder() { private void mergeLocalToBuilder() {
if (this.containerId != null if (this.containerId != null
&& !((ContainerIdPBImpl) containerId).getProto().equals( && !((ContainerIdPBImpl) containerId).getProto().equals(
@ -313,4 +334,13 @@ public class NMContainerStatusPBImpl extends NMContainerStatus {
private PriorityProto convertToProtoFormat(Priority t) { private PriorityProto convertToProtoFormat(Priority t) {
return ((PriorityPBImpl)t).getProto(); return ((PriorityPBImpl)t).getProto();
} }
private ExecutionType convertFromProtoFormat(
YarnProtos.ExecutionTypeProto e) {
return ProtoUtils.convertFromProtoFormat(e);
}
private YarnProtos.ExecutionTypeProto convertToProtoFormat(ExecutionType e) {
return ProtoUtils.convertToProtoFormat(e);
}
} }

View File

@ -174,6 +174,7 @@ message NMContainerStatusProto {
optional int64 creation_time = 7; optional int64 creation_time = 7;
optional string nodeLabelExpression = 8; optional string nodeLabelExpression = 8;
optional int32 version = 9; optional int32 version = 9;
optional ExecutionTypeProto executionType = 10 [default = GUARANTEED];
} }
message SCMUploaderNotifyRequestProto { message SCMUploaderNotifyRequestProto {

View File

@ -632,7 +632,8 @@ public class ContainerImpl implements Container {
getCurrentState(), getResource(), diagnostics.toString(), exitCode, getCurrentState(), getResource(), diagnostics.toString(), exitCode,
containerTokenIdentifier.getPriority(), containerTokenIdentifier.getPriority(),
containerTokenIdentifier.getCreationTime(), containerTokenIdentifier.getCreationTime(),
containerTokenIdentifier.getNodeLabelExpression()); containerTokenIdentifier.getNodeLabelExpression(),
containerTokenIdentifier.getExecutionType());
} finally { } finally {
this.readLock.unlock(); this.readLock.unlock();
} }

View File

@ -847,7 +847,8 @@ public class RMNodeImpl implements RMNode, EventHandler<RMNodeEvent> {
containers = startEvent.getNMContainerStatuses(); containers = startEvent.getNMContainerStatuses();
if (containers != null && !containers.isEmpty()) { if (containers != null && !containers.isEmpty()) {
for (NMContainerStatus container : containers) { for (NMContainerStatus container : containers) {
if (container.getContainerState() == ContainerState.RUNNING) { if (container.getContainerState() == ContainerState.RUNNING ||
container.getContainerState() == ContainerState.SCHEDULED) {
rmNode.launchedContainers.add(container.getContainerId()); rmNode.launchedContainers.add(container.getContainerId());
} }
} }

View File

@ -529,6 +529,7 @@ public abstract class AbstractYarnScheduler
node.getHttpAddress(), status.getAllocatedResource(), node.getHttpAddress(), status.getAllocatedResource(),
status.getPriority(), null); status.getPriority(), null);
container.setVersion(status.getVersion()); container.setVersion(status.getVersion());
container.setExecutionType(status.getExecutionType());
ApplicationAttemptId attemptId = ApplicationAttemptId attemptId =
container.getId().getApplicationAttemptId(); container.getId().getApplicationAttemptId();
RMContainer rmContainer = new RMContainerImpl(container, RMContainer rmContainer = new RMContainerImpl(container,

View File

@ -38,6 +38,7 @@ import org.apache.hadoop.classification.InterfaceStability.Unstable;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.Container; import org.apache.hadoop.yarn.api.records.Container;
import org.apache.hadoop.yarn.api.records.ExecutionType;
import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.api.records.ResourceRequest; import org.apache.hadoop.yarn.api.records.ResourceRequest;
import org.apache.hadoop.yarn.server.resourcemanager.ResourceManager; import org.apache.hadoop.yarn.server.resourcemanager.ResourceManager;
@ -525,6 +526,9 @@ public class AppSchedulingInfo {
} }
public void recoverContainer(RMContainer rmContainer, String partition) { public void recoverContainer(RMContainer rmContainer, String partition) {
if (rmContainer.getExecutionType() != ExecutionType.GUARANTEED) {
return;
}
try { try {
this.writeLock.lock(); this.writeLock.lock();
QueueMetrics metrics = queue.getMetrics(); QueueMetrics metrics = queue.getMetrics();

View File

@ -1131,9 +1131,11 @@ public class SchedulerApplicationAttempt implements SchedulableEntity {
} }
LOG.info("SchedulerAttempt " + getApplicationAttemptId() LOG.info("SchedulerAttempt " + getApplicationAttemptId()
+ " is recovering container " + rmContainer.getContainerId()); + " is recovering container " + rmContainer.getContainerId());
liveContainers.put(rmContainer.getContainerId(), rmContainer); addRMContainer(rmContainer.getContainerId(), rmContainer);
if (rmContainer.getExecutionType() == ExecutionType.GUARANTEED) {
attemptResourceUsage.incUsed(node.getPartition(), attemptResourceUsage.incUsed(node.getPartition(),
rmContainer.getContainer().getResource()); rmContainer.getContainer().getResource());
}
// resourceLimit: updated when LeafQueue#recoverContainer#allocateResource // resourceLimit: updated when LeafQueue#recoverContainer#allocateResource
// is called. // is called.

View File

@ -1802,7 +1802,9 @@ public class LeafQueue extends AbstractCSQueue {
if (rmContainer.getState().equals(RMContainerState.COMPLETED)) { if (rmContainer.getState().equals(RMContainerState.COMPLETED)) {
return; return;
} }
if (rmContainer.getExecutionType() != ExecutionType.GUARANTEED) {
return;
}
// Careful! Locking order is important! // Careful! Locking order is important!
try { try {
writeLock.lock(); writeLock.lock();

View File

@ -37,6 +37,7 @@ import org.apache.hadoop.security.authorize.AccessControlList;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ContainerStatus; import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.api.records.ExecutionType;
import org.apache.hadoop.yarn.api.records.QueueACL; import org.apache.hadoop.yarn.api.records.QueueACL;
import org.apache.hadoop.yarn.api.records.QueueInfo; import org.apache.hadoop.yarn.api.records.QueueInfo;
import org.apache.hadoop.yarn.api.records.QueueState; import org.apache.hadoop.yarn.api.records.QueueState;
@ -863,6 +864,9 @@ public class ParentQueue extends AbstractCSQueue {
if (rmContainer.getState().equals(RMContainerState.COMPLETED)) { if (rmContainer.getState().equals(RMContainerState.COMPLETED)) {
return; return;
} }
if (rmContainer.getExecutionType() != ExecutionType.GUARANTEED) {
return;
}
// Careful! Locking order is important! // Careful! Locking order is important!
try { try {

View File

@ -77,6 +77,7 @@ import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext;
import org.apache.hadoop.yarn.api.records.Container; import org.apache.hadoop.yarn.api.records.Container;
import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerState; import org.apache.hadoop.yarn.api.records.ContainerState;
import org.apache.hadoop.yarn.api.records.ExecutionType;
import org.apache.hadoop.yarn.api.records.FinalApplicationStatus; import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
import org.apache.hadoop.yarn.api.records.NodeId; import org.apache.hadoop.yarn.api.records.NodeId;
import org.apache.hadoop.yarn.api.records.Priority; import org.apache.hadoop.yarn.api.records.Priority;
@ -2097,7 +2098,8 @@ public class TestRMRestart extends ParameterizedSchedulerTestBase {
NMContainerStatus containerReport = NMContainerStatus containerReport =
NMContainerStatus.newInstance(containerId, 0, containerState, NMContainerStatus.newInstance(containerId, 0, containerState,
Resource.newInstance(1024, 1), "recover container", 0, Resource.newInstance(1024, 1), "recover container", 0,
Priority.newInstance(0), 0, nodeLabelExpression); Priority.newInstance(0), 0, nodeLabelExpression,
ExecutionType.GUARANTEED);
return containerReport; return containerReport;
} }

View File

@ -29,8 +29,11 @@ import java.io.File;
import java.io.FileOutputStream; import java.io.FileOutputStream;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections; import java.util.Collections;
import java.util.HashMap; import java.util.HashMap;
import java.util.Iterator;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Set; import java.util.Set;
@ -44,15 +47,19 @@ import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
import org.apache.hadoop.net.NetUtils; import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerState; import org.apache.hadoop.yarn.api.records.ContainerState;
import org.apache.hadoop.yarn.api.records.ContainerStatus; import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.api.records.ExecutionType;
import org.apache.hadoop.yarn.api.records.NodeId; import org.apache.hadoop.yarn.api.records.NodeId;
import org.apache.hadoop.yarn.api.records.NodeLabel; import org.apache.hadoop.yarn.api.records.NodeLabel;
import org.apache.hadoop.yarn.api.records.NodeState; import org.apache.hadoop.yarn.api.records.NodeState;
import org.apache.hadoop.yarn.api.records.Priority; import org.apache.hadoop.yarn.api.records.Priority;
import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.event.Dispatcher;
import org.apache.hadoop.yarn.event.DrainDispatcher;
import org.apache.hadoop.yarn.event.Event; import org.apache.hadoop.yarn.event.Event;
import org.apache.hadoop.yarn.event.EventDispatcher; import org.apache.hadoop.yarn.event.EventDispatcher;
import org.apache.hadoop.yarn.event.EventHandler; import org.apache.hadoop.yarn.event.EventHandler;
@ -74,10 +81,14 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppState; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppState;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptImpl; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptImpl;
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode; import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeImpl; import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeImpl;
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeStatusEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeStatusEvent;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEvent;
import org.apache.hadoop.yarn.server.timelineservice.collector.PerNodeTimelineCollectorsAuxService; import org.apache.hadoop.yarn.server.timelineservice.collector.PerNodeTimelineCollectorsAuxService;
import org.apache.hadoop.yarn.server.timelineservice.storage.FileSystemTimelineWriterImpl; import org.apache.hadoop.yarn.server.timelineservice.storage.FileSystemTimelineWriterImpl;
@ -2026,6 +2037,103 @@ public class TestResourceTrackerService extends NodeLabelTestBase {
} }
} }
@SuppressWarnings("unchecked")
@Test
public void testHandleOpportunisticContainerStatus() throws Exception{
final DrainDispatcher dispatcher = new DrainDispatcher();
YarnConfiguration conf = new YarnConfiguration();
conf.setBoolean(YarnConfiguration.RECOVERY_ENABLED, true);
conf.setBoolean(YarnConfiguration.RM_WORK_PRESERVING_RECOVERY_ENABLED,
true);
rm = new MockRM(conf){
@Override
protected Dispatcher createDispatcher() {
return dispatcher;
}
};
rm.start();
RMApp app = rm.submitApp(1024, true);
ApplicationAttemptId appAttemptId = app.getCurrentAppAttempt()
.getAppAttemptId();
ResourceTrackerService resourceTrackerService =
rm.getResourceTrackerService();
SchedulerApplicationAttempt applicationAttempt = null;
while (applicationAttempt == null) {
applicationAttempt =
((AbstractYarnScheduler)rm.getRMContext().getScheduler())
.getApplicationAttempt(appAttemptId);
Thread.sleep(100);
}
Resource currentConsumption = applicationAttempt.getCurrentConsumption();
Assert.assertEquals(Resource.newInstance(0, 0), currentConsumption);
Resource allocResources =
applicationAttempt.getQueue().getMetrics().getAllocatedResources();
Assert.assertEquals(Resource.newInstance(0, 0), allocResources);
RegisterNodeManagerRequest req = Records.newRecord(
RegisterNodeManagerRequest.class);
NodeId nodeId = NodeId.newInstance("host2", 1234);
Resource capability = BuilderUtils.newResource(1024, 1);
req.setResource(capability);
req.setNodeId(nodeId);
req.setHttpPort(1234);
req.setNMVersion(YarnVersionInfo.getVersion());
ContainerId c1 = ContainerId.newContainerId(appAttemptId, 1);
ContainerId c2 = ContainerId.newContainerId(appAttemptId, 2);
ContainerId c3 = ContainerId.newContainerId(appAttemptId, 3);
NMContainerStatus queuedOpp =
NMContainerStatus.newInstance(c1, 1, ContainerState.SCHEDULED,
Resource.newInstance(1024, 1), "Dummy Queued OC",
ContainerExitStatus.INVALID, Priority.newInstance(5), 1234, "",
ExecutionType.OPPORTUNISTIC);
NMContainerStatus runningOpp =
NMContainerStatus.newInstance(c2, 1, ContainerState.RUNNING,
Resource.newInstance(2048, 1), "Dummy Running OC",
ContainerExitStatus.INVALID, Priority.newInstance(6), 1234, "",
ExecutionType.OPPORTUNISTIC);
NMContainerStatus runningGuar =
NMContainerStatus.newInstance(c3, 1, ContainerState.RUNNING,
Resource.newInstance(2048, 1), "Dummy Running GC",
ContainerExitStatus.INVALID, Priority.newInstance(6), 1234, "",
ExecutionType.GUARANTEED);
req.setContainerStatuses(Arrays.asList(queuedOpp, runningOpp, runningGuar));
// trying to register a invalid node.
RegisterNodeManagerResponse response =
resourceTrackerService.registerNodeManager(req);
dispatcher.await();
Thread.sleep(2000);
dispatcher.await();
Assert.assertEquals(NodeAction.NORMAL, response.getNodeAction());
Collection<RMContainer> liveContainers = applicationAttempt
.getLiveContainers();
Assert.assertEquals(3, liveContainers.size());
Iterator<RMContainer> iter = liveContainers.iterator();
while (iter.hasNext()) {
RMContainer rc = iter.next();
Assert.assertEquals(
rc.getContainerId().equals(c3) ?
ExecutionType.GUARANTEED : ExecutionType.OPPORTUNISTIC,
rc.getExecutionType());
}
// Should only include GUARANTEED resources
currentConsumption = applicationAttempt.getCurrentConsumption();
Assert.assertEquals(Resource.newInstance(2048, 1), currentConsumption);
allocResources =
applicationAttempt.getQueue().getMetrics().getAllocatedResources();
Assert.assertEquals(Resource.newInstance(2048, 1), allocResources);
SchedulerNode schedulerNode =
rm.getRMContext().getScheduler().getSchedulerNode(nodeId);
Assert.assertNotNull(schedulerNode);
Resource nodeResources = schedulerNode.getAllocatedResource();
Assert.assertEquals(Resource.newInstance(2048, 1), nodeResources);
}
@Test(timeout = 60000) @Test(timeout = 60000)
public void testNodeHeartBeatResponseForUnknownContainerCleanUp() public void testNodeHeartBeatResponseForUnknownContainerCleanUp()
throws Exception { throws Exception {