YARN-4906. Capture container start/finish time in container metrics. Contributed by Jian He.
(cherry picked from commit b41e65e5bc)
This commit is contained in:
parent
11e796b5cd
commit
8f9b97ccce
|
@ -65,6 +65,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.even
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.sharedcache.SharedCacheUploadEvent;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.sharedcache.SharedCacheUploadEvent;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.sharedcache.SharedCacheUploadEventType;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.sharedcache.SharedCacheUploadEventType;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler.event.LogHandlerContainerFinishedEvent;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler.event.LogHandlerContainerFinishedEvent;
|
||||||
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainerMetrics;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainerStartMonitoringEvent;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainerStartMonitoringEvent;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainerStopMonitoringEvent;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainerStopMonitoringEvent;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.Context;
|
import org.apache.hadoop.yarn.server.nodemanager.Context;
|
||||||
|
@ -100,6 +101,7 @@ public class ContainerImpl implements Container {
|
||||||
private boolean wasLaunched;
|
private boolean wasLaunched;
|
||||||
private long containerLocalizationStartTime;
|
private long containerLocalizationStartTime;
|
||||||
private long containerLaunchStartTime;
|
private long containerLaunchStartTime;
|
||||||
|
private ContainerMetrics containerMetrics;
|
||||||
private static Clock clock = SystemClock.getInstance();
|
private static Clock clock = SystemClock.getInstance();
|
||||||
|
|
||||||
/** The NM-wide configuration - not specific to this container */
|
/** The NM-wide configuration - not specific to this container */
|
||||||
|
@ -147,6 +149,21 @@ public class ContainerImpl implements Container {
|
||||||
this.readLock = readWriteLock.readLock();
|
this.readLock = readWriteLock.readLock();
|
||||||
this.writeLock = readWriteLock.writeLock();
|
this.writeLock = readWriteLock.writeLock();
|
||||||
this.context = context;
|
this.context = context;
|
||||||
|
boolean containerMetricsEnabled =
|
||||||
|
conf.getBoolean(YarnConfiguration.NM_CONTAINER_METRICS_ENABLE,
|
||||||
|
YarnConfiguration.DEFAULT_NM_CONTAINER_METRICS_ENABLE);
|
||||||
|
|
||||||
|
if (containerMetricsEnabled) {
|
||||||
|
long flushPeriod =
|
||||||
|
conf.getLong(YarnConfiguration.NM_CONTAINER_METRICS_PERIOD_MS,
|
||||||
|
YarnConfiguration.DEFAULT_NM_CONTAINER_METRICS_PERIOD_MS);
|
||||||
|
long unregisterDelay = conf.getLong(
|
||||||
|
YarnConfiguration.NM_CONTAINER_METRICS_UNREGISTER_DELAY_MS,
|
||||||
|
YarnConfiguration.DEFAULT_NM_CONTAINER_METRICS_UNREGISTER_DELAY_MS);
|
||||||
|
containerMetrics = ContainerMetrics
|
||||||
|
.forContainer(containerId, flushPeriod, unregisterDelay);
|
||||||
|
containerMetrics.recordStartTime(clock.getTime());
|
||||||
|
}
|
||||||
|
|
||||||
stateMachine = stateMachineFactory.make(this);
|
stateMachine = stateMachineFactory.make(this);
|
||||||
}
|
}
|
||||||
|
@ -989,6 +1006,11 @@ public class ContainerImpl implements Container {
|
||||||
@SuppressWarnings("unchecked")
|
@SuppressWarnings("unchecked")
|
||||||
public void transition(ContainerImpl container, ContainerEvent event) {
|
public void transition(ContainerImpl container, ContainerEvent event) {
|
||||||
container.metrics.releaseContainer(container.resource);
|
container.metrics.releaseContainer(container.resource);
|
||||||
|
if (container.containerMetrics != null) {
|
||||||
|
container.containerMetrics
|
||||||
|
.recordFinishTimeAndExitCode(clock.getTime(), container.exitCode);
|
||||||
|
container.containerMetrics.finished();
|
||||||
|
}
|
||||||
container.sendFinishedEvents();
|
container.sendFinishedEvents();
|
||||||
//if the current state is NEW it means the CONTAINER_INIT was never
|
//if the current state is NEW it means the CONTAINER_INIT was never
|
||||||
// sent for the event, thus no need to send the CONTAINER_STOP
|
// sent for the event, thus no need to send the CONTAINER_STOP
|
||||||
|
|
|
@ -100,6 +100,15 @@ public class ContainerMetrics implements MetricsSource {
|
||||||
@Metric
|
@Metric
|
||||||
public MutableGaugeLong localizationDurationMs;
|
public MutableGaugeLong localizationDurationMs;
|
||||||
|
|
||||||
|
@Metric
|
||||||
|
public MutableGaugeLong startTime;
|
||||||
|
|
||||||
|
@Metric
|
||||||
|
public MutableGaugeLong finishTime;
|
||||||
|
|
||||||
|
@Metric
|
||||||
|
public MutableGaugeInt exitCode;
|
||||||
|
|
||||||
static final MetricsInfo RECORD_INFO =
|
static final MetricsInfo RECORD_INFO =
|
||||||
info("ContainerResource", "Resource limit and usage by container");
|
info("ContainerResource", "Resource limit and usage by container");
|
||||||
|
|
||||||
|
@ -277,6 +286,15 @@ public class ContainerMetrics implements MetricsSource {
|
||||||
this.localizationDurationMs.set(localizationDuration);
|
this.localizationDurationMs.set(localizationDuration);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void recordStartTime(long startTime) {
|
||||||
|
this.startTime.set(startTime);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void recordFinishTimeAndExitCode(long finishTime, int exitCode) {
|
||||||
|
this.finishTime.set(finishTime);
|
||||||
|
this.exitCode.set(exitCode);
|
||||||
|
}
|
||||||
|
|
||||||
private synchronized void scheduleTimerTaskIfRequired() {
|
private synchronized void scheduleTimerTaskIfRequired() {
|
||||||
if (flushPeriodMs > 0) {
|
if (flushPeriodMs > 0) {
|
||||||
// Lazily initialize timer
|
// Lazily initialize timer
|
||||||
|
|
|
@ -195,7 +195,7 @@ public class TestAuxServices {
|
||||||
ContainerId.newContainerId(attemptId, 1), "", "",
|
ContainerId.newContainerId(attemptId, 1), "", "",
|
||||||
Resource.newInstance(1, 1), 0,0,0, Priority.newInstance(0), 0);
|
Resource.newInstance(1, 1), 0,0,0, Priority.newInstance(0), 0);
|
||||||
Context context = mock(Context.class);
|
Context context = mock(Context.class);
|
||||||
Container container = new ContainerImpl(null, null, null, null,
|
Container container = new ContainerImpl(new YarnConfiguration(), null, null, null,
|
||||||
null, cti, context);
|
null, cti, context);
|
||||||
ContainerId containerId = container.getContainerId();
|
ContainerId containerId = container.getContainerId();
|
||||||
Resource resource = container.getResource();
|
Resource resource = container.getResource();
|
||||||
|
|
|
@ -85,6 +85,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.even
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.LocalizationEventType;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.LocalizationEventType;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler.event.LogHandlerEvent;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler.event.LogHandlerEvent;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler.event.LogHandlerEventType;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler.event.LogHandlerEventType;
|
||||||
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainerMetrics;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitorEvent;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitorEvent;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitorEventType;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitorEventType;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
|
import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
|
||||||
|
@ -333,6 +334,7 @@ public class TestContainer {
|
||||||
@Test
|
@Test
|
||||||
public void testKillOnNew() throws Exception {
|
public void testKillOnNew() throws Exception {
|
||||||
WrappedContainer wc = null;
|
WrappedContainer wc = null;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
wc = new WrappedContainer(13, 314159265358979L, 4344, "yak");
|
wc = new WrappedContainer(13, 314159265358979L, 4344, "yak");
|
||||||
assertEquals(ContainerState.NEW, wc.c.getContainerState());
|
assertEquals(ContainerState.NEW, wc.c.getContainerState());
|
||||||
|
@ -345,6 +347,15 @@ public class TestContainer {
|
||||||
assertTrue(wc.c.cloneAndGetContainerStatus().getDiagnostics()
|
assertTrue(wc.c.cloneAndGetContainerStatus().getDiagnostics()
|
||||||
.contains("KillRequest"));
|
.contains("KillRequest"));
|
||||||
assertEquals(killed + 1, metrics.getKilledContainers());
|
assertEquals(killed + 1, metrics.getKilledContainers());
|
||||||
|
// check container metrics is generated.
|
||||||
|
ContainerMetrics containerMetrics =
|
||||||
|
ContainerMetrics.forContainer(wc.cId, 1, 5000);
|
||||||
|
Assert.assertEquals(ContainerExitStatus.KILLED_BY_RESOURCEMANAGER,
|
||||||
|
containerMetrics.exitCode.value());
|
||||||
|
Assert.assertTrue(containerMetrics.startTime.value() > 0);
|
||||||
|
Assert.assertTrue(
|
||||||
|
containerMetrics.finishTime.value() > containerMetrics.startTime
|
||||||
|
.value());
|
||||||
} finally {
|
} finally {
|
||||||
if (wc != null) {
|
if (wc != null) {
|
||||||
wc.finished();
|
wc.finished();
|
||||||
|
|
Loading…
Reference in New Issue