YARN-4906. Capture container start/finish time in container metrics. Contributed by Jian He.

(cherry picked from commit b41e65e5bc)
This commit is contained in:
Varun Vasudev 2016-04-06 13:41:33 +05:30
parent 11e796b5cd
commit 8f9b97ccce
4 changed files with 52 additions and 1 deletion

View File

@ -65,6 +65,7 @@
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.sharedcache.SharedCacheUploadEvent;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.sharedcache.SharedCacheUploadEventType;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler.event.LogHandlerContainerFinishedEvent;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainerMetrics;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainerStartMonitoringEvent;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainerStopMonitoringEvent;
import org.apache.hadoop.yarn.server.nodemanager.Context;
@ -100,6 +101,7 @@ public class ContainerImpl implements Container {
private boolean wasLaunched;
private long containerLocalizationStartTime;
private long containerLaunchStartTime;
private ContainerMetrics containerMetrics;
private static Clock clock = SystemClock.getInstance();
/** The NM-wide configuration - not specific to this container */
@ -147,6 +149,21 @@ public ContainerImpl(Configuration conf, Dispatcher dispatcher,
this.readLock = readWriteLock.readLock();
this.writeLock = readWriteLock.writeLock();
this.context = context;
boolean containerMetricsEnabled =
conf.getBoolean(YarnConfiguration.NM_CONTAINER_METRICS_ENABLE,
YarnConfiguration.DEFAULT_NM_CONTAINER_METRICS_ENABLE);
if (containerMetricsEnabled) {
long flushPeriod =
conf.getLong(YarnConfiguration.NM_CONTAINER_METRICS_PERIOD_MS,
YarnConfiguration.DEFAULT_NM_CONTAINER_METRICS_PERIOD_MS);
long unregisterDelay = conf.getLong(
YarnConfiguration.NM_CONTAINER_METRICS_UNREGISTER_DELAY_MS,
YarnConfiguration.DEFAULT_NM_CONTAINER_METRICS_UNREGISTER_DELAY_MS);
containerMetrics = ContainerMetrics
.forContainer(containerId, flushPeriod, unregisterDelay);
containerMetrics.recordStartTime(clock.getTime());
}
stateMachine = stateMachineFactory.make(this);
}
@ -989,6 +1006,11 @@ static class ContainerDoneTransition implements
@SuppressWarnings("unchecked")
public void transition(ContainerImpl container, ContainerEvent event) {
container.metrics.releaseContainer(container.resource);
if (container.containerMetrics != null) {
container.containerMetrics
.recordFinishTimeAndExitCode(clock.getTime(), container.exitCode);
container.containerMetrics.finished();
}
container.sendFinishedEvents();
//if the current state is NEW it means the CONTAINER_INIT was never
// sent for the event, thus no need to send the CONTAINER_STOP

View File

@ -100,6 +100,15 @@ public class ContainerMetrics implements MetricsSource {
@Metric
public MutableGaugeLong localizationDurationMs;
@Metric
public MutableGaugeLong startTime;
@Metric
public MutableGaugeLong finishTime;
@Metric
public MutableGaugeInt exitCode;
static final MetricsInfo RECORD_INFO =
info("ContainerResource", "Resource limit and usage by container");
@ -277,6 +286,15 @@ public void recordStateChangeDurations(long launchDuration,
this.localizationDurationMs.set(localizationDuration);
}
/**
 * Stores the given container start time in the {@code startTime} gauge.
 *
 * @param startTime the timestamp to publish; presumably epoch milliseconds
 *        taken from the NodeManager clock -- TODO confirm against callers
 */
public void recordStartTime(long startTime) {
// "this." is required here: the parameter shadows the gauge field.
this.startTime.set(startTime);
}
/**
 * Stores the container finish time and exit code in the {@code finishTime}
 * and {@code exitCode} gauges.
 *
 * @param finishTime the timestamp to publish; presumably in the same unit
 *        as the value given to {@code recordStartTime} -- TODO confirm
 * @param exitCode the container's exit status to publish
 */
public void recordFinishTimeAndExitCode(long finishTime, int exitCode) {
// "this." is required on both lines: each parameter shadows a gauge field.
this.finishTime.set(finishTime);
this.exitCode.set(exitCode);
}
private synchronized void scheduleTimerTaskIfRequired() {
if (flushPeriodMs > 0) {
// Lazily initialize timer

View File

@ -195,7 +195,7 @@ public void testAuxEventDispatch() {
ContainerId.newContainerId(attemptId, 1), "", "",
Resource.newInstance(1, 1), 0,0,0, Priority.newInstance(0), 0);
Context context = mock(Context.class);
Container container = new ContainerImpl(null, null, null, null,
Container container = new ContainerImpl(new YarnConfiguration(), null, null, null,
null, cti, context);
ContainerId containerId = container.getContainerId();
Resource resource = container.getResource();

View File

@ -85,6 +85,7 @@
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.LocalizationEventType;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler.event.LogHandlerEvent;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler.event.LogHandlerEventType;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainerMetrics;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitorEvent;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitorEventType;
import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
@ -333,6 +334,7 @@ public void testCleanupOnKillRequest() throws Exception {
@Test
public void testKillOnNew() throws Exception {
WrappedContainer wc = null;
try {
wc = new WrappedContainer(13, 314159265358979L, 4344, "yak");
assertEquals(ContainerState.NEW, wc.c.getContainerState());
@ -345,6 +347,15 @@ public void testKillOnNew() throws Exception {
assertTrue(wc.c.cloneAndGetContainerStatus().getDiagnostics()
.contains("KillRequest"));
assertEquals(killed + 1, metrics.getKilledContainers());
// check container metrics is generated.
ContainerMetrics containerMetrics =
ContainerMetrics.forContainer(wc.cId, 1, 5000);
Assert.assertEquals(ContainerExitStatus.KILLED_BY_RESOURCEMANAGER,
containerMetrics.exitCode.value());
Assert.assertTrue(containerMetrics.startTime.value() > 0);
Assert.assertTrue(
containerMetrics.finishTime.value() > containerMetrics.startTime
.value());
} finally {
if (wc != null) {
wc.finished();