YARN-9428. Add metrics for paused containers in NodeManager. Contributed by Abhishek Modi.

This commit is contained in:
Giovanni Matteo Fumarola 2019-04-01 14:21:17 -07:00
parent da7f8c244d
commit ab2bda57bd
3 changed files with 31 additions and 0 deletions

View File

@ -161,6 +161,7 @@ private ReInitializationContext createContextForRollback() {
private final StringBuilder diagnostics;
private final int diagnosticsMaxSize;
private boolean wasLaunched;
// Paused flag last stored by setIsPaused(); that method consults it to decide
// whether metrics.endPausedContainer() has to be called when the flag clears.
private boolean wasPaused;
private long containerLocalizationStartTime;
private long containerLaunchStartTime;
private ContainerMetrics containerMetrics;
@ -1541,6 +1542,7 @@ static class RecoveredContainerTransition extends ContainerTransition {
public void transition(ContainerImpl container, ContainerEvent event) {
  container.sendContainerMonitorStartEvent();
  container.wasLaunched = true;
  // Flag the recovered container as paused AND count it in the paused
  // metric. setIsPaused(false) calls metrics.endPausedContainer() whenever
  // the flag is cleared, so setting the flag without a matching
  // pausedContainer() call would leave the two unbalanced once this
  // container is later resumed, killed or exits.
  container.setIsPaused(true);
  container.metrics.pausedContainer();
}
}
@ -1561,6 +1563,7 @@ public ExitedWithSuccessTransition(boolean clCleanupRequired) {
public void transition(ContainerImpl container, ContainerEvent event) {
container.setIsReInitializing(false);
container.setIsPaused(false);
// Set exit code to 0 on success
container.exitCode = 0;
@ -1591,6 +1594,7 @@ public ExitedWithFailureTransition(boolean clCleanupRequired) {
@Override
public void transition(ContainerImpl container, ContainerEvent event) {
container.setIsPaused(false);
container.setIsReInitializing(false);
ContainerExitEvent exitEvent = (ContainerExitEvent) event;
container.exitCode = exitEvent.getExitCode();
@ -1835,6 +1839,7 @@ static class KillTransition implements
public void transition(ContainerImpl container, ContainerEvent event) {
// Kill the process/process-grp
container.setIsReInitializing(false);
container.setIsPaused(false);
container.dispatcher.getEventHandler().handle(
new ContainersLauncherEvent(container,
ContainersLauncherEventType.CLEANUP_CONTAINER));
@ -2080,6 +2085,8 @@ static class PausedContainerTransition implements
SingleArcTransition<ContainerImpl, ContainerEvent> {
@Override
public void transition(ContainerImpl container, ContainerEvent event) {
container.setIsPaused(true);
container.metrics.pausedContainer();
// Container was PAUSED so tell the scheduler
container.dispatcher.getEventHandler().handle(
new ContainerSchedulerEvent(container,
@ -2096,6 +2103,7 @@ static class ResumeContainerTransition implements
SingleArcTransition<ContainerImpl, ContainerEvent> {
@Override
public void transition(ContainerImpl container, ContainerEvent event) {
container.setIsPaused(false);
// Resume the process/process-grp if it is supported by the container
container.dispatcher.getEventHandler().handle(
new ContainersLauncherEvent(container,
@ -2154,6 +2162,13 @@ private static boolean shouldBeUploadedToSharedCache(ContainerImpl container,
return container.resourceSet.getResourcesUploadPolicies().get(resource);
}
/**
 * Stores the container's paused flag. If the flag was set and is now being
 * cleared, {@code metrics.endPausedContainer()} is invoked before the new
 * value is written.
 *
 * @param paused the paused state to remember
 */
private void setIsPaused(boolean paused) {
  final boolean leavingPausedState = !paused && this.wasPaused;
  if (leavingPausedState) {
    // Report the end of the pause before the flag is overwritten below.
    metrics.endPausedContainer();
  }
  wasPaused = paused;
}
@VisibleForTesting
ContainerRetryContext getContainerRetryContext() {
return containerRetryContext;

View File

@ -44,6 +44,7 @@ public class NodeManagerMetrics {
@Metric("# of initializing containers")
MutableGaugeInt containersIniting;
@Metric MutableGaugeInt containersRunning;
@Metric("# of paused containers") MutableGaugeInt containersPaused;
@Metric("Current allocated memory in GB")
MutableGaugeInt allocatedGB;
@Metric("Current # of allocated containers")
@ -168,6 +169,14 @@ public void endReInitingContainer() {
containersReIniting.decr();
}
/** Increments the {@code containersPaused} gauge by one. */
public void pausedContainer() {
  this.containersPaused.incr();
}
/** Decrements the {@code containersPaused} gauge by one. */
public void endPausedContainer() {
  this.containersPaused.decr();
}
public void allocateContainer(Resource res) {
allocatedContainers.incr();
allocatedMB = allocatedMB + res.getMemorySize();
@ -268,6 +277,10 @@ public int getRunningContainers() {
return containersRunning.value();
}
/**
 * @return the current value of the {@code containersPaused} gauge
 */
public int getPausedContainers() {
  return this.containersPaused.value();
}
@VisibleForTesting
public int getKilledContainers() {
return containersKilled.value();

View File

@ -246,13 +246,16 @@ public void testContainerPauseAndResume() throws Exception {
wc.initContainer();
wc.localizeResources();
int running = metrics.getRunningContainers();
int paused = metrics.getPausedContainers();
wc.launchContainer();
assertEquals(running + 1, metrics.getRunningContainers());
reset(wc.localizerBus);
wc.pauseContainer();
assertEquals(ContainerState.PAUSED,
wc.c.getContainerState());
assertEquals(paused + 1, metrics.getPausedContainers());
wc.resumeContainer();
assertEquals(paused, metrics.getPausedContainers());
assertEquals(ContainerState.RUNNING,
wc.c.getContainerState());
wc.containerKilledOnRequest();