From 32b3b8e2ef5c2be8659d9b72f2fef867879d060a Mon Sep 17 00:00:00 2001 From: Robert Kanter Date: Fri, 1 May 2015 16:39:21 -0700 Subject: [PATCH] YARN-3363. add localization and container launch time to ContainerMetrics at NM to show these timing information for each active container. (zxu via rkanter) (cherry picked from commit ac7d152901e29b1f444507fe4e421eb6e1402b5a) --- hadoop-yarn-project/CHANGES.txt | 4 +++ .../container/ContainerImpl.java | 27 +++++++++++-------- .../monitor/ContainerMetrics.java | 20 ++++++++++++++ .../ContainerStartMonitoringEvent.java | 15 ++++++++++- .../monitor/ContainersMonitorImpl.java | 9 +++++++ .../monitor/TestContainerMetrics.java | 10 +++++++ 6 files changed, 73 insertions(+), 12 deletions(-) diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index fe4eedcc7af..966d544754b 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -129,6 +129,10 @@ Release 2.8.0 - UNRELEASED YARN-3406. Display count of running containers in the RM's Web UI. (Ryu Kobayashi via ozawa) + YARN-3363. add localization and container launch time to ContainerMetrics + at NM to show these timing information for each active container. + (zxu via rkanter) + OPTIMIZATIONS YARN-3339. TestDockerContainerExecutor should pull a single image and not diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java index 68669aad61a..9997ca2ac88 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java @@ -96,6 +96,7 @@ public class ContainerImpl implements Container { private int exitCode = ContainerExitStatus.INVALID; private final StringBuilder diagnostics; private boolean wasLaunched; + private long containerLocalizationStartTime; private long containerLaunchStartTime; private static Clock clock = new SystemClock(); @@ -493,16 +494,21 @@ public class ContainerImpl implements Container { // resource usage. @SuppressWarnings("unchecked") // dispatcher not typed private void sendContainerMonitorStartEvent() { - long pmemBytes = getResource().getMemory() * 1024 * 1024L; - float pmemRatio = daemonConf.getFloat( - YarnConfiguration.NM_VMEM_PMEM_RATIO, - YarnConfiguration.DEFAULT_NM_VMEM_PMEM_RATIO); - long vmemBytes = (long) (pmemRatio * pmemBytes); - int cpuVcores = getResource().getVirtualCores(); + long launchDuration = clock.getTime() - containerLaunchStartTime; + metrics.addContainerLaunchDuration(launchDuration); - dispatcher.getEventHandler().handle( - new ContainerStartMonitoringEvent(containerId, - vmemBytes, pmemBytes, cpuVcores)); + long pmemBytes = getResource().getMemory() * 1024 * 1024L; + float pmemRatio = daemonConf.getFloat( + YarnConfiguration.NM_VMEM_PMEM_RATIO, + YarnConfiguration.DEFAULT_NM_VMEM_PMEM_RATIO); + long vmemBytes = (long) (pmemRatio * pmemBytes); + int cpuVcores = getResource().getVirtualCores(); + long localizationDuration = containerLaunchStartTime - + containerLocalizationStartTime; + dispatcher.getEventHandler().handle( + new ContainerStartMonitoringEvent(containerId, + vmemBytes, pmemBytes, cpuVcores, launchDuration, + localizationDuration)); } private void addDiagnostics(String... diags) { @@ -601,6 +607,7 @@ public class ContainerImpl implements Container { } } + container.containerLocalizationStartTime = clock.getTime(); // Send requests for public, private resources Map cntrRsrc = ctxt.getLocalResources(); if (!cntrRsrc.isEmpty()) { @@ -756,8 +763,6 @@ public class ContainerImpl implements Container { container.sendContainerMonitorStartEvent(); container.metrics.runningContainer(); container.wasLaunched = true; - long duration = clock.getTime() - container.containerLaunchStartTime; - container.metrics.addContainerLaunchDuration(duration); if (container.recoveredAsKilled) { LOG.info("Killing " + container.containerId diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainerMetrics.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainerMetrics.java index ffa72a415d5..c364143241c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainerMetrics.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainerMetrics.java @@ -28,6 +28,7 @@ import org.apache.hadoop.metrics2.annotation.Metrics; import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; import org.apache.hadoop.metrics2.lib.MetricsRegistry; import org.apache.hadoop.metrics2.lib.MutableGaugeInt; +import org.apache.hadoop.metrics2.lib.MutableGaugeLong; import org.apache.hadoop.metrics2.lib.MutableStat; import org.apache.hadoop.yarn.api.records.ContainerId; @@ -46,6 +47,9 @@ public class ContainerMetrics implements MetricsSource { public static final String VMEM_LIMIT_METRIC_NAME = "vMemLimitMBs"; public static final String VCORE_LIMIT_METRIC_NAME = "vCoreLimit"; public static final String PMEM_USAGE_METRIC_NAME = "pMemUsageMBs"; + public static final String LAUNCH_DURATION_METRIC_NAME = "launchDurationMs"; + public static final String LOCALIZATION_DURATION_METRIC_NAME = + "localizationDurationMs"; private static final String PHY_CPU_USAGE_METRIC_NAME = "pCpuUsagePercent"; // Use a multiplier of 1000 to avoid losing too much precision when @@ -74,6 +78,12 @@ public class ContainerMetrics implements MetricsSource { @Metric public MutableGaugeInt cpuVcoreLimit; + @Metric + public MutableGaugeLong launchDurationMs; + + @Metric + public MutableGaugeLong localizationDurationMs; + static final MetricsInfo RECORD_INFO = info("ContainerResource", "Resource limit and usage by container"); @@ -122,6 +132,10 @@ public class ContainerMetrics implements MetricsSource { VMEM_LIMIT_METRIC_NAME, "Virtual memory limit in MBs", 0); this.cpuVcoreLimit = registry.newGauge( VCORE_LIMIT_METRIC_NAME, "CPU limit in number of vcores", 0); + this.launchDurationMs = registry.newGauge( + LAUNCH_DURATION_METRIC_NAME, "Launch duration in MS", 0L); + this.localizationDurationMs = registry.newGauge( + LOCALIZATION_DURATION_METRIC_NAME, "Localization duration in MS", 0L); } ContainerMetrics tag(MetricsInfo info, ContainerId containerId) { @@ -213,6 +227,12 @@ public class ContainerMetrics implements MetricsSource { this.cpuVcoreLimit.set(cpuVcores); } + public void recordStateChangeDurations(long launchDuration, + long localizationDuration) { + this.launchDurationMs.set(launchDuration); + this.localizationDurationMs.set(localizationDuration); + } + private synchronized void scheduleTimerTaskIfRequired() { if (flushPeriodMs > 0) { // Lazily initialize timer diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainerStartMonitoringEvent.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainerStartMonitoringEvent.java index 56e2d8eed59..c09bebffa70 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainerStartMonitoringEvent.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainerStartMonitoringEvent.java @@ -25,13 +25,18 @@ public class ContainerStartMonitoringEvent extends ContainersMonitorEvent { private final long vmemLimit; private final long pmemLimit; private final int cpuVcores; + private final long launchDuration; + private final long localizationDuration; public ContainerStartMonitoringEvent(ContainerId containerId, - long vmemLimit, long pmemLimit, int cpuVcores) { + long vmemLimit, long pmemLimit, int cpuVcores, long launchDuration, + long localizationDuration) { super(containerId, ContainersMonitorEventType.START_MONITORING_CONTAINER); this.vmemLimit = vmemLimit; this.pmemLimit = pmemLimit; this.cpuVcores = cpuVcores; + this.launchDuration = launchDuration; + this.localizationDuration = localizationDuration; } public long getVmemLimit() { @@ -45,4 +50,12 @@ public class ContainerStartMonitoringEvent extends ContainersMonitorEvent { public int getCpuVcores() { return this.cpuVcores; } + + public long getLaunchDuration() { + return this.launchDuration; + } + + public long getLocalizationDuration() { + return this.localizationDuration; + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainersMonitorImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainersMonitorImpl.java index cce749e05ff..76bbda12f6a 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainersMonitorImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainersMonitorImpl.java @@ -617,6 +617,15 @@ public class ContainersMonitorImpl extends AbstractService implements case START_MONITORING_CONTAINER: ContainerStartMonitoringEvent startEvent = (ContainerStartMonitoringEvent) monitoringEvent; + + if (containerMetricsEnabled) { + ContainerMetrics usageMetrics = ContainerMetrics + .forContainer(containerId, containerMetricsPeriodMs); + usageMetrics.recordStateChangeDurations( + startEvent.getLaunchDuration(), + startEvent.getLocalizationDuration()); + } + synchronized (this.containersToBeAdded) { ProcessTreeInfo processTreeInfo = new ProcessTreeInfo(containerId, null, null, diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainerMetrics.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainerMetrics.java index c6286486083..bdf99943568 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainerMetrics.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainerMetrics.java @@ -87,10 +87,14 @@ public class TestContainerMetrics { int anyPmemLimit = 1024; int anyVmemLimit = 2048; int anyVcores = 10; + long anyLaunchDuration = 20L; + long anyLocalizationDuration = 1000L; String anyProcessId = "1234"; metrics.recordResourceLimit(anyVmemLimit, anyPmemLimit, anyVcores); metrics.recordProcessId(anyProcessId); + metrics.recordStateChangeDurations(anyLaunchDuration, + anyLocalizationDuration); Thread.sleep(110); metrics.getMetrics(collector, true); @@ -105,6 +109,12 @@ public class TestContainerMetrics { MetricsRecords.assertMetric(record, ContainerMetrics.VMEM_LIMIT_METRIC_NAME, anyVmemLimit); MetricsRecords.assertMetric(record, ContainerMetrics.VCORE_LIMIT_METRIC_NAME, anyVcores); + MetricsRecords.assertMetric(record, + ContainerMetrics.LAUNCH_DURATION_METRIC_NAME, anyLaunchDuration); + MetricsRecords.assertMetric(record, + ContainerMetrics.LOCALIZATION_DURATION_METRIC_NAME, + anyLocalizationDuration); + collector.clear(); } }