YARN-5204. Properly report status of killed/stopped queued containers. (Konstantinos Karanasos via asuresh)

Arun Suresh 2016-06-08 08:31:32 -07:00
parent 8c8a377cac
commit 3344ba70e0
2 changed files with 117 additions and 31 deletions
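
In short, when a stop request arrives for a container that is still queued, the NodeManager now records it in the queuing context's killed-queued-containers map, and a later status request resolves the container from that map instead of losing track of it. A simplified sketch of the lookup that the patch adds to getContainerStatusInternal() (the names follow the diff below; pulling the logic out into a helper method is only for illustration, and the surrounding QueuingContainerManagerImpl fields are assumed):

    // Illustrative helper, assumed to live inside QueuingContainerManagerImpl.
    // Returns a terminal status for a container that was killed while queued,
    // or null so the caller can fall back to the parent implementation.
    private ContainerStatus findKilledQueuedContainerStatus(ContainerId containerID) {
      for (ContainerTokenIdentifier cTokenId : this.context
          .getQueuingContext().getKilledQueuedContainers().keySet()) {
        if (cTokenId.getContainerID().equals(containerID)) {
          // Report the container as COMPLETE with an ABORTED exit status,
          // carrying the diagnostics stored when it was removed from the queue.
          return BuilderUtils.newContainerStatus(containerID,
              org.apache.hadoop.yarn.api.records.ContainerState.COMPLETE,
              this.context.getQueuingContext().getKilledQueuedContainers()
                  .get(cTokenId),
              ContainerExitStatus.ABORTED, cTokenId.getResource(),
              cTokenId.getExecutionType());
        }
      }
      return null;
    }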

Changed file: QueuingContainerManagerImpl.java

@@ -175,8 +175,9 @@ public class QueuingContainerManagerImpl extends ContainerManagerImpl {
}
nodeStatusUpdater.sendOutofBandHeartBeat();
} else {
super.stopContainerInternal(containerID);
}
super.stopContainerInternal(containerID);
}
/**
@@ -456,6 +457,18 @@ public class QueuingContainerManagerImpl extends ContainerManagerImpl {
ContainerExitStatus.INVALID, this.context.getQueuingContext()
.getQueuedContainers().get(containerID).getResource(),
executionType);
} else {
// Check if part of the stopped/killed queued containers.
for (ContainerTokenIdentifier cTokenId : this.context
.getQueuingContext().getKilledQueuedContainers().keySet()) {
if (cTokenId.getContainerID().equals(containerID)) {
return BuilderUtils.newContainerStatus(containerID,
org.apache.hadoop.yarn.api.records.ContainerState.COMPLETE,
this.context.getQueuingContext().getKilledQueuedContainers()
.get(cTokenId), ContainerExitStatus.ABORTED, cTokenId
.getResource(), cTokenId.getExecutionType());
}
}
}
}
return super.getContainerStatusInternal(containerID, nmTokenIdentifier);
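
From the client side, the effect is that stopping a container while it is still queued and then asking for its status yields a COMPLETE status with an ABORTED exit code and the "Queued container request removed" diagnostics, rather than an unresolved container. A minimal usage sketch, assuming a started containerManager and a createContainerId() helper as in the test below:

    // Sketch: stop a container that is still queued, then read back its status.
    ContainerId queuedId = createContainerId(1);

    containerManager.stopContainers(
        StopContainersRequest.newInstance(Arrays.asList(queuedId)));

    GetContainerStatusesRequest statRequest =
        GetContainerStatusesRequest.newInstance(Arrays.asList(queuedId));
    ContainerStatus status = containerManager
        .getContainerStatuses(statRequest).getContainerStatuses().get(0);

    // The killed queued container is reported as terminal rather than dropped.
    Assert.assertEquals(
        org.apache.hadoop.yarn.api.records.ContainerState.COMPLETE,
        status.getState());
    Assert.assertEquals(ContainerExitStatus.ABORTED, status.getExitStatus());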

Changed file: TestQueuingContainerManager.java

@@ -24,13 +24,13 @@ import java.util.Arrays;
import java.util.List;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.UnsupportedFileSystemException;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.yarn.api.protocolrecords.GetContainerStatusesRequest;
import org.apache.hadoop.yarn.api.protocolrecords.StartContainerRequest;
import org.apache.hadoop.yarn.api.protocolrecords.StartContainersRequest;
import org.apache.hadoop.yarn.api.protocolrecords.StopContainersRequest;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ContainerId;
@@ -41,15 +41,12 @@ import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.security.NMTokenIdentifier;
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor;
import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.DeletionService;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.BaseContainerManagerTest;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitor;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitorImpl;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.MockResourceCalculatorPlugin;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.MockResourceCalculatorProcessTree;
import org.apache.hadoop.yarn.server.utils.BuilderUtils;
import org.junit.Assert;
import org.junit.Test;
@@ -58,11 +55,6 @@ import org.junit.Test;
* Class for testing the {@link QueuingContainerManagerImpl}.
*/
public class TestQueuingContainerManager extends BaseContainerManagerTest {
interface HasResources {
boolean decide(Context context, ContainerId cId);
}
public TestQueuingContainerManager() throws UnsupportedFileSystemException {
super();
}
@@ -78,18 +70,6 @@ public class TestQueuingContainerManager extends BaseContainerManagerTest {
DeletionService delSrvc) {
return new QueuingContainerManagerImpl(context, exec, delSrvc,
nodeStatusUpdater, metrics, dirsHandler) {
@Override
public void serviceInit(Configuration conf) throws Exception {
conf.set(
YarnConfiguration.NM_CONTAINER_MON_RESOURCE_CALCULATOR,
MockResourceCalculatorPlugin.class.getCanonicalName());
conf.set(
YarnConfiguration.NM_CONTAINER_MON_PROCESS_TREE,
MockResourceCalculatorProcessTree.class.getCanonicalName());
super.serviceInit(conf);
}
@Override
public void
setBlockNewContainerRequests(boolean blockNewContainerRequests) {
@@ -398,7 +378,7 @@ public class TestQueuingContainerManager extends BaseContainerManagerTest {
containerManager.startContainers(allRequests);
BaseContainerManagerTest.waitForNMContainerState(containerManager,
createContainerId(0), ContainerState.DONE, 30);
createContainerId(0), ContainerState.DONE, 40);
Thread.sleep(5000);
// Get container statuses. Container 0 should be killed, container 1
@@ -429,7 +409,7 @@ public class TestQueuingContainerManager extends BaseContainerManagerTest {
// Make sure the remaining OPPORTUNISTIC container starts its execution.
BaseContainerManagerTest.waitForNMContainerState(containerManager,
createContainerId(2), ContainerState.DONE, 30);
createContainerId(2), ContainerState.DONE, 40);
Thread.sleep(5000);
statRequest = GetContainerStatusesRequest.newInstance(Arrays.asList(
createContainerId(1)));
@@ -488,13 +468,12 @@ public class TestQueuingContainerManager extends BaseContainerManagerTest {
containerManager.startContainers(allRequests);
BaseContainerManagerTest.waitForNMContainerState(containerManager,
createContainerId(0), ContainerState.DONE, 30);
createContainerId(0), ContainerState.DONE, 40);
Thread.sleep(5000);
// Get container statuses. Container 0 should be killed, container 1
// should be queued and container 2 should be running.
int killedContainers = 0;
int runningContainers = 0;
List<ContainerId> statList = new ArrayList<ContainerId>();
for (int i = 0; i < 4; i++) {
statList.add(createContainerId(i));
@@ -508,14 +487,108 @@ public class TestQueuingContainerManager extends BaseContainerManagerTest {
"Container killed by the ApplicationMaster")) {
killedContainers++;
}
if (status.getState() ==
org.apache.hadoop.yarn.api.records.ContainerState.RUNNING) {
runningContainers++;
}
System.out.println("\nStatus : [" + status + "]\n");
}
Assert.assertEquals(2, killedContainers);
Assert.assertEquals(2, runningContainers);
}
/**
* Start running one GUARANTEED container and queue two OPPORTUNISTIC ones.
* Try killing one of the two queued containers.
* @throws Exception
*/
@Test
public void testStopQueuedContainer() throws Exception {
shouldDeleteWait = true;
containerManager.start();
ContainerLaunchContext containerLaunchContext =
recordFactory.newRecordInstance(ContainerLaunchContext.class);
List<StartContainerRequest> list = new ArrayList<>();
list.add(StartContainerRequest.newInstance(
containerLaunchContext,
createContainerToken(createContainerId(0), DUMMY_RM_IDENTIFIER,
context.getNodeId(),
user, BuilderUtils.newResource(2048, 1),
context.getContainerTokenSecretManager(), null,
ExecutionType.GUARANTEED)));
list.add(StartContainerRequest.newInstance(
containerLaunchContext,
createContainerToken(createContainerId(1), DUMMY_RM_IDENTIFIER,
context.getNodeId(),
user, BuilderUtils.newResource(512, 1),
context.getContainerTokenSecretManager(), null,
ExecutionType.OPPORTUNISTIC)));
list.add(StartContainerRequest.newInstance(
containerLaunchContext,
createContainerToken(createContainerId(2), DUMMY_RM_IDENTIFIER,
context.getNodeId(),
user, BuilderUtils.newResource(512, 1),
context.getContainerTokenSecretManager(), null,
ExecutionType.OPPORTUNISTIC)));
StartContainersRequest allRequests =
StartContainersRequest.newInstance(list);
containerManager.startContainers(allRequests);
Thread.sleep(2000);
// Assert there is initially one container running and two queued.
int runningContainersNo = 0;
int queuedContainersNo = 0;
List<ContainerId> statList = new ArrayList<ContainerId>();
for (int i = 0; i < 3; i++) {
statList.add(createContainerId(i));
}
GetContainerStatusesRequest statRequest = GetContainerStatusesRequest
.newInstance(statList);
List<ContainerStatus> containerStatuses = containerManager
.getContainerStatuses(statRequest).getContainerStatuses();
for (ContainerStatus status : containerStatuses) {
if (status.getState() ==
org.apache.hadoop.yarn.api.records.ContainerState.RUNNING) {
runningContainersNo++;
} else if (status.getState() ==
org.apache.hadoop.yarn.api.records.ContainerState.QUEUED) {
queuedContainersNo++;
}
System.out.println("\nStatus : [" + status + "]\n");
}
Assert.assertEquals(1, runningContainersNo);
Assert.assertEquals(2, queuedContainersNo);
// Stop one of the two queued containers.
StopContainersRequest stopRequest = StopContainersRequest.
newInstance(Arrays.asList(createContainerId(1)));
containerManager.stopContainers(stopRequest);
Thread.sleep(2000);
// Assert queued container got properly stopped.
statList.clear();
for (int i = 0; i < 3; i++) {
statList.add(createContainerId(i));
}
statRequest = GetContainerStatusesRequest.newInstance(statList);
containerStatuses = containerManager.getContainerStatuses(statRequest)
.getContainerStatuses();
for (ContainerStatus status : containerStatuses) {
if (status.getContainerId().equals(createContainerId(0))) {
Assert.assertEquals(
org.apache.hadoop.yarn.api.records.ContainerState.RUNNING,
status.getState());
} else if (status.getContainerId().equals(createContainerId(1))) {
Assert.assertTrue(status.getDiagnostics().contains(
"Queued container request removed"));
} else if (status.getContainerId().equals(createContainerId(2))) {
Assert.assertEquals(
org.apache.hadoop.yarn.api.records.ContainerState.QUEUED,
status.getState());
}
System.out.println("\nStatus : [" + status + "]\n");
}
}
}