Merge 1601762 from trunk to branch-2 for YARN-2091. Add more values to ContainerExitStatus and pass it from NM to RM and then to app masters (Tsuyoshi OZAWA via bikas)
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-2@1601763 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
d4d1ccc992
commit
dc5ee5ff7c
|
@ -140,6 +140,9 @@ Release 2.5.0 - UNRELEASED
|
|||
DummyApplicationResourceUsageReport for all invalid accesses.
|
||||
(Ray Chiang via kasha)
|
||||
|
||||
YARN-2091. Add more values to ContainerExitStatus and pass it from NM to
|
||||
RM and then to app masters (Tsuyoshi OZAWA via bikas)
|
||||
|
||||
OPTIMIZATIONS
|
||||
|
||||
BUG FIXES
|
||||
|
|
|
@ -46,4 +46,30 @@ public class ContainerExitStatus {
|
|||
* Containers preempted by the framework.
|
||||
*/
|
||||
public static final int PREEMPTED = -102;
|
||||
|
||||
/**
|
||||
* Container was terminated because it exceeded its allocated virtual memory.
|
||||
*/
|
||||
public static final int KILLED_EXCEEDED_VMEM = -103;
|
||||
|
||||
/**
|
||||
* Container was terminated because it exceeded its allocated physical memory.
|
||||
*/
|
||||
public static final int KILLED_EXCEEDED_PMEM = -104;
|
||||
|
||||
/**
|
||||
* Container was terminated by a stop request from the app master.
|
||||
*/
|
||||
public static final int KILLED_BY_APPMASTER = -105;
|
||||
|
||||
/**
|
||||
* Container was terminated by the resource manager.
|
||||
*/
|
||||
public static final int KILLED_BY_RESOURCEMANAGER = -106;
|
||||
|
||||
/**
|
||||
* Container was terminated after the application finished.
|
||||
*/
|
||||
public static final int KILLED_AFTER_APP_COMPLETION = -107;
|
||||
|
||||
}
|
||||
|
|
|
@ -64,6 +64,7 @@ import org.apache.hadoop.yarn.api.protocolrecords.StartContainersResponse;
|
|||
import org.apache.hadoop.yarn.api.protocolrecords.StopContainersRequest;
|
||||
import org.apache.hadoop.yarn.api.protocolrecords.StopContainersResponse;
|
||||
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
||||
import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
|
||||
import org.apache.hadoop.yarn.api.records.ContainerId;
|
||||
import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
|
||||
import org.apache.hadoop.yarn.api.records.ContainerState;
|
||||
|
@ -738,7 +739,8 @@ public class ContainerManagerImpl extends CompositeService implements
|
|||
} else {
|
||||
dispatcher.getEventHandler().handle(
|
||||
new ContainerKillEvent(containerID,
|
||||
"Container killed by the ApplicationMaster."));
|
||||
ContainerExitStatus.KILLED_BY_APPMASTER,
|
||||
"Container killed by the ApplicationMaster."));
|
||||
|
||||
NMAuditLogger.logSuccess(container.getUser(),
|
||||
AuditConstants.STOP_CONTAINER, "ContainerManageImpl", containerID
|
||||
|
@ -887,6 +889,7 @@ public class ContainerManagerImpl extends CompositeService implements
|
|||
.getContainersToCleanup()) {
|
||||
this.dispatcher.getEventHandler().handle(
|
||||
new ContainerKillEvent(container,
|
||||
ContainerExitStatus.KILLED_BY_RESOURCEMANAGER,
|
||||
"Container Killed by ResourceManager"));
|
||||
}
|
||||
break;
|
||||
|
|
|
@ -30,6 +30,7 @@ import org.apache.commons.logging.LogFactory;
|
|||
import org.apache.hadoop.security.Credentials;
|
||||
import org.apache.hadoop.yarn.api.records.ApplicationAccessType;
|
||||
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
||||
import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
|
||||
import org.apache.hadoop.yarn.api.records.ContainerId;
|
||||
import org.apache.hadoop.yarn.event.Dispatcher;
|
||||
import org.apache.hadoop.yarn.logaggregation.ContainerLogsRetentionPolicy;
|
||||
|
@ -375,6 +376,7 @@ public class ApplicationImpl implements Application {
|
|||
for (ContainerId containerID : app.containers.keySet()) {
|
||||
app.dispatcher.getEventHandler().handle(
|
||||
new ContainerKillEvent(containerID,
|
||||
ContainerExitStatus.KILLED_AFTER_APP_COMPLETION,
|
||||
"Container killed on application-finish event: " + appEvent.getDiagnostic()));
|
||||
}
|
||||
return ApplicationState.FINISHING_CONTAINERS_WAIT;
|
||||
|
|
|
@ -48,7 +48,6 @@ import org.apache.hadoop.yarn.event.Dispatcher;
|
|||
import org.apache.hadoop.yarn.event.EventHandler;
|
||||
import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
|
||||
import org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.ExitCode;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.NMAuditLogger;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.NMAuditLogger.AuditConstants;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.AuxServicesEvent;
|
||||
|
@ -773,7 +772,7 @@ public class ContainerImpl implements Container {
|
|||
container.cleanup();
|
||||
container.metrics.endInitingContainer();
|
||||
ContainerKillEvent killEvent = (ContainerKillEvent) event;
|
||||
container.exitCode = ExitCode.TERMINATED.getExitCode();
|
||||
container.exitCode = killEvent.getContainerExitStatus();
|
||||
container.diagnostics.append(killEvent.getDiagnostic()).append("\n");
|
||||
container.diagnostics.append("Container is killed before being launched.\n");
|
||||
}
|
||||
|
@ -817,6 +816,7 @@ public class ContainerImpl implements Container {
|
|||
ContainersLauncherEventType.CLEANUP_CONTAINER));
|
||||
ContainerKillEvent killEvent = (ContainerKillEvent) event;
|
||||
container.diagnostics.append(killEvent.getDiagnostic()).append("\n");
|
||||
container.exitCode = killEvent.getContainerExitStatus();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -829,7 +829,10 @@ public class ContainerImpl implements Container {
|
|||
@Override
|
||||
public void transition(ContainerImpl container, ContainerEvent event) {
|
||||
ContainerExitEvent exitEvent = (ContainerExitEvent) event;
|
||||
container.exitCode = exitEvent.getExitCode();
|
||||
if (container.hasDefaultExitCode()) {
|
||||
container.exitCode = exitEvent.getExitCode();
|
||||
}
|
||||
|
||||
if (exitEvent.getDiagnosticInfo() != null) {
|
||||
container.diagnostics.append(exitEvent.getDiagnosticInfo())
|
||||
.append('\n');
|
||||
|
@ -871,7 +874,7 @@ public class ContainerImpl implements Container {
|
|||
@Override
|
||||
public void transition(ContainerImpl container, ContainerEvent event) {
|
||||
ContainerKillEvent killEvent = (ContainerKillEvent) event;
|
||||
container.exitCode = ExitCode.TERMINATED.getExitCode();
|
||||
container.exitCode = killEvent.getContainerExitStatus();
|
||||
container.diagnostics.append(killEvent.getDiagnostic()).append("\n");
|
||||
container.diagnostics.append("Container is killed before being launched.\n");
|
||||
super.transition(container, event);
|
||||
|
@ -928,4 +931,9 @@ public class ContainerImpl implements Container {
|
|||
this.readLock.unlock();
|
||||
}
|
||||
}
|
||||
|
||||
private boolean hasDefaultExitCode() {
|
||||
return (this.exitCode == ContainerExitStatus.INVALID);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -23,13 +23,21 @@ import org.apache.hadoop.yarn.api.records.ContainerId;
|
|||
public class ContainerKillEvent extends ContainerEvent {
|
||||
|
||||
private final String diagnostic;
|
||||
private final int exitStatus;
|
||||
|
||||
public ContainerKillEvent(ContainerId cID, String diagnostic) {
|
||||
public ContainerKillEvent(ContainerId cID,
|
||||
int exitStatus, String diagnostic) {
|
||||
super(cID, ContainerEventType.KILL_CONTAINER);
|
||||
this.exitStatus = exitStatus;
|
||||
this.diagnostic = diagnostic;
|
||||
}
|
||||
|
||||
public String getDiagnostic() {
|
||||
return this.diagnostic;
|
||||
}
|
||||
|
||||
public int getContainerExitStatus() {
|
||||
return this.exitStatus;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -30,6 +30,7 @@ import org.apache.commons.logging.LogFactory;
|
|||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.service.AbstractService;
|
||||
import org.apache.hadoop.util.StringUtils.TraditionalBinaryPrefix;
|
||||
import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
|
||||
import org.apache.hadoop.yarn.api.records.ContainerId;
|
||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||
import org.apache.hadoop.yarn.event.AsyncDispatcher;
|
||||
|
@ -403,6 +404,7 @@ public class ContainersMonitorImpl extends AbstractService implements
|
|||
|
||||
boolean isMemoryOverLimit = false;
|
||||
String msg = "";
|
||||
int containerExitStatus = ContainerExitStatus.INVALID;
|
||||
if (isVmemCheckEnabled()
|
||||
&& isProcessTreeOverLimit(containerId.toString(),
|
||||
currentVmemUsage, curMemUsageOfAgedProcesses, vmemLimit)) {
|
||||
|
@ -414,6 +416,7 @@ public class ContainersMonitorImpl extends AbstractService implements
|
|||
currentPmemUsage, pmemLimit,
|
||||
pId, containerId, pTree);
|
||||
isMemoryOverLimit = true;
|
||||
containerExitStatus = ContainerExitStatus.KILLED_EXCEEDED_VMEM;
|
||||
} else if (isPmemCheckEnabled()
|
||||
&& isProcessTreeOverLimit(containerId.toString(),
|
||||
currentPmemUsage, curRssMemUsageOfAgedProcesses,
|
||||
|
@ -426,6 +429,7 @@ public class ContainersMonitorImpl extends AbstractService implements
|
|||
currentPmemUsage, pmemLimit,
|
||||
pId, containerId, pTree);
|
||||
isMemoryOverLimit = true;
|
||||
containerExitStatus = ContainerExitStatus.KILLED_EXCEEDED_PMEM;
|
||||
}
|
||||
|
||||
if (isMemoryOverLimit) {
|
||||
|
@ -440,7 +444,8 @@ public class ContainersMonitorImpl extends AbstractService implements
|
|||
}
|
||||
// kill the container
|
||||
eventDispatcher.getEventHandler().handle(
|
||||
new ContainerKillEvent(containerId, msg));
|
||||
new ContainerKillEvent(containerId,
|
||||
containerExitStatus, msg));
|
||||
it.remove();
|
||||
LOG.info("Removed ProcessTree with root " + pId);
|
||||
} else {
|
||||
|
|
|
@ -31,6 +31,7 @@ import java.util.HashMap;
|
|||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
|
||||
import org.junit.Assert;
|
||||
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
@ -68,7 +69,6 @@ import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
|
|||
import org.apache.hadoop.yarn.security.NMTokenIdentifier;
|
||||
import org.apache.hadoop.yarn.server.api.ResourceManagerConstants;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.CMgrCompletedAppsEvent;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.ExitCode;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.DeletionService;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.TestAuxServices.ServiceA;
|
||||
|
@ -348,8 +348,7 @@ public class TestContainerManager extends BaseContainerManagerTest {
|
|||
GetContainerStatusesRequest.newInstance(containerIds);
|
||||
ContainerStatus containerStatus =
|
||||
containerManager.getContainerStatuses(gcsRequest).getContainerStatuses().get(0);
|
||||
int expectedExitCode = Shell.WINDOWS ? ExitCode.FORCE_KILLED.getExitCode() :
|
||||
ExitCode.TERMINATED.getExitCode();
|
||||
int expectedExitCode = ContainerExitStatus.KILLED_BY_APPMASTER;
|
||||
Assert.assertEquals(expectedExitCode, containerStatus.getExitStatus());
|
||||
|
||||
// Assert that the process is not alive anymore
|
||||
|
|
|
@ -17,6 +17,7 @@
|
|||
*/
|
||||
package org.apache.hadoop.yarn.server.nodemanager.containermanager.container;
|
||||
|
||||
import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertNotNull;
|
||||
import static org.junit.Assert.assertNull;
|
||||
|
@ -319,7 +320,7 @@ public class TestContainer {
|
|||
assertEquals(ContainerState.NEW, wc.c.getContainerState());
|
||||
wc.killContainer();
|
||||
assertEquals(ContainerState.DONE, wc.c.getContainerState());
|
||||
assertEquals(ExitCode.TERMINATED.getExitCode(),
|
||||
assertEquals(ContainerExitStatus.KILLED_BY_RESOURCEMANAGER,
|
||||
wc.c.cloneAndGetContainerStatus().getExitStatus());
|
||||
assertTrue(wc.c.cloneAndGetContainerStatus().getDiagnostics()
|
||||
.contains("KillRequest"));
|
||||
|
@ -339,7 +340,7 @@ public class TestContainer {
|
|||
assertEquals(ContainerState.LOCALIZING, wc.c.getContainerState());
|
||||
wc.killContainer();
|
||||
assertEquals(ContainerState.KILLING, wc.c.getContainerState());
|
||||
assertEquals(ExitCode.TERMINATED.getExitCode(),
|
||||
assertEquals(ContainerExitStatus.KILLED_BY_RESOURCEMANAGER,
|
||||
wc.c.cloneAndGetContainerStatus().getExitStatus());
|
||||
assertTrue(wc.c.cloneAndGetContainerStatus().getDiagnostics()
|
||||
.contains("KillRequest"));
|
||||
|
@ -898,12 +899,14 @@ public class TestContainer {
|
|||
}
|
||||
|
||||
public void killContainer() {
|
||||
c.handle(new ContainerKillEvent(cId, "KillRequest"));
|
||||
c.handle(new ContainerKillEvent(cId,
|
||||
ContainerExitStatus.KILLED_BY_RESOURCEMANAGER,
|
||||
"KillRequest"));
|
||||
drainDispatcherEvents();
|
||||
}
|
||||
|
||||
public void containerKilledOnRequest() {
|
||||
int exitCode = ExitCode.FORCE_KILLED.getExitCode();
|
||||
int exitCode = ContainerExitStatus.KILLED_BY_RESOURCEMANAGER;
|
||||
String diagnosticMsg = "Container completed with exit code " + exitCode;
|
||||
c.handle(new ContainerExitEvent(cId,
|
||||
ContainerEventType.CONTAINER_KILLED_ON_REQUEST, exitCode,
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
|
||||
package org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher;
|
||||
|
||||
import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertThat;
|
||||
import static org.junit.Assert.fail;
|
||||
|
@ -73,7 +74,6 @@ import org.apache.hadoop.yarn.event.Dispatcher;
|
|||
import org.apache.hadoop.yarn.event.Event;
|
||||
import org.apache.hadoop.yarn.event.EventHandler;
|
||||
import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.ExitCode;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.BaseContainerManagerTest;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
|
||||
|
@ -604,8 +604,7 @@ public class TestContainerLaunch extends BaseContainerManagerTest {
|
|||
GetContainerStatusesRequest.newInstance(containerIds);
|
||||
ContainerStatus containerStatus =
|
||||
containerManager.getContainerStatuses(gcsRequest).getContainerStatuses().get(0);
|
||||
int expectedExitCode = Shell.WINDOWS ? ExitCode.FORCE_KILLED.getExitCode() :
|
||||
ExitCode.TERMINATED.getExitCode();
|
||||
int expectedExitCode = ContainerExitStatus.KILLED_BY_APPMASTER;
|
||||
Assert.assertEquals(expectedExitCode, containerStatus.getExitStatus());
|
||||
|
||||
// Assert that the process is not alive anymore
|
||||
|
@ -717,7 +716,7 @@ public class TestContainerLaunch extends BaseContainerManagerTest {
|
|||
ContainerStatus containerStatus =
|
||||
containerManager.getContainerStatuses(gcsRequest)
|
||||
.getContainerStatuses().get(0);
|
||||
Assert.assertEquals(ExitCode.FORCE_KILLED.getExitCode(),
|
||||
Assert.assertEquals(ContainerExitStatus.KILLED_BY_APPMASTER,
|
||||
containerStatus.getExitStatus());
|
||||
|
||||
// Now verify the contents of the file. Script generates a message when it
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
|
||||
package org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor;
|
||||
|
||||
import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertFalse;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
@ -60,7 +61,6 @@ import org.apache.hadoop.yarn.event.AsyncDispatcher;
|
|||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||
import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.ExitCode;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.Signal;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.Context;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.BaseContainerManagerTest;
|
||||
|
@ -270,7 +270,7 @@ public class TestContainersMonitor extends BaseContainerManagerTest {
|
|||
GetContainerStatusesRequest.newInstance(containerIds);
|
||||
ContainerStatus containerStatus =
|
||||
containerManager.getContainerStatuses(gcsRequest).getContainerStatuses().get(0);
|
||||
Assert.assertEquals(ExitCode.TERMINATED.getExitCode(),
|
||||
Assert.assertEquals(ContainerExitStatus.KILLED_EXCEEDED_VMEM,
|
||||
containerStatus.getExitStatus());
|
||||
String expectedMsgPattern =
|
||||
"Container \\[pid=" + pid + ",containerID=" + cId
|
||||
|
|
Loading…
Reference in New Issue