From 14864e9c7c879c15b5fa2d1776614ec83152918f Mon Sep 17 00:00:00 2001 From: Karthik Kambatla Date: Fri, 8 Aug 2014 14:17:54 +0000 Subject: [PATCH] YARN-2352. FairScheduler: Collect metrics on duration of critical methods that affect performance. (kasha) git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1616769 13f79535-47bb-0310-9956-ffa450edef68 --- .../metrics2/impl/MetricsCollectorImpl.java | 6 ++- .../hadoop/metrics2/lib/MutableStat.java | 8 ++++ hadoop-yarn-project/CHANGES.txt | 3 ++ .../dev-support/findbugs-exclude.xml | 7 ++++ .../scheduler/fair/FairScheduler.java | 37 +++++++++++++++---- .../scheduler/fair/TestFairScheduler.java | 11 ++++++ 6 files changed, 64 insertions(+), 8 deletions(-) diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/impl/MetricsCollectorImpl.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/impl/MetricsCollectorImpl.java index 5f536298021..be442edb1b8 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/impl/MetricsCollectorImpl.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/impl/MetricsCollectorImpl.java @@ -21,14 +21,18 @@ package org.apache.hadoop.metrics2.impl; import java.util.Iterator; import java.util.List; +import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.Lists; +import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.metrics2.MetricsInfo; import org.apache.hadoop.metrics2.MetricsCollector; import org.apache.hadoop.metrics2.MetricsFilter; import static org.apache.hadoop.metrics2.lib.Interns.*; -class MetricsCollectorImpl implements MetricsCollector, +@InterfaceAudience.Private +@VisibleForTesting +public class MetricsCollectorImpl implements MetricsCollector, Iterable { private final List rbs = Lists.newArrayList(); diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/lib/MutableStat.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/lib/MutableStat.java index b8ba435bf45..ba377570efb 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/lib/MutableStat.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/lib/MutableStat.java @@ -89,6 +89,14 @@ public class MutableStat extends MutableMetric { this(name, description, sampleName, valueName, false); } + /** + * Set whether to display the extended stats (stdev, min/max etc.) or not + * @param extended enable/disable displaying extended stats + */ + public synchronized void setExtended(boolean extended) { + this.extended = extended; + } + /** * Add a number of samples and their sum to the running stat * @param numSamples number of samples diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index a5b77545d8a..eb9079d9f60 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -94,6 +94,9 @@ Release 2.6.0 - UNRELEASED YARN-2288. Made persisted data in LevelDB timeline store be versioned. (Junping Du via zjshen) + YARN-2352. FairScheduler: Collect metrics on duration of critical methods that + affect performance. (kasha) + OPTIMIZATIONS BUG FIXES diff --git a/hadoop-yarn-project/hadoop-yarn/dev-support/findbugs-exclude.xml b/hadoop-yarn-project/hadoop-yarn/dev-support/findbugs-exclude.xml index 781d7a35922..6609a260130 100644 --- a/hadoop-yarn-project/hadoop-yarn/dev-support/findbugs-exclude.xml +++ b/hadoop-yarn-project/hadoop-yarn/dev-support/findbugs-exclude.xml @@ -200,6 +200,13 @@ + + + + + + + diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java index 4e1c244730a..8765ba04dc7 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java @@ -149,6 +149,7 @@ public class FairScheduler extends // Aggregate metrics FSQueueMetrics rootMetrics; + FSOpDurations fsOpDurations; // Time when we last updated preemption vars protected long lastPreemptionUpdateTime; @@ -256,8 +257,11 @@ public class FairScheduler extends while (!Thread.currentThread().isInterrupted()) { try { Thread.sleep(updateInterval); + long start = getClock().getTime(); update(); preemptTasksIfNecessary(); + long duration = getClock().getTime() - start; + fsOpDurations.addUpdateThreadRunDuration(duration); } catch (InterruptedException ie) { LOG.warn("Update thread interrupted. Exiting."); return; @@ -294,6 +298,7 @@ public class FairScheduler extends * required resources per job. */ protected synchronized void update() { + long start = getClock().getTime(); updatePreemptionVariables(); // Determine if any queues merit preemption FSQueue rootQueue = queueMgr.getRootQueue(); @@ -317,6 +322,9 @@ public class FairScheduler extends " Demand: " + rootQueue.getDemand()); } } + + long duration = getClock().getTime() - start; + fsOpDurations.addUpdateCallDuration(duration); } /** @@ -325,7 +333,7 @@ public class FairScheduler extends * for each type of task. */ private void updatePreemptionVariables() { - long now = clock.getTime(); + long now = getClock().getTime(); lastPreemptionUpdateTime = now; for (FSLeafQueue sched : queueMgr.getLeafQueues()) { if (!isStarvedForMinShare(sched)) { @@ -352,7 +360,8 @@ public class FairScheduler extends * defined as being below half its fair share. */ boolean isStarvedForFairShare(FSLeafQueue sched) { - Resource desiredFairShare = Resources.min(RESOURCE_CALCULATOR, clusterResource, + Resource desiredFairShare = Resources.min(RESOURCE_CALCULATOR, + clusterResource, Resources.multiply(sched.getFairShare(), .5), sched.getDemand()); return Resources.lessThan(RESOURCE_CALCULATOR, clusterResource, sched.getResourceUsage(), desiredFairShare); @@ -370,7 +379,7 @@ public class FairScheduler extends return; } - long curTime = clock.getTime(); + long curTime = getClock().getTime(); if (curTime - lastPreemptCheckTime < preemptionInterval) { return; } @@ -398,6 +407,7 @@ public class FairScheduler extends * We make sure that no queue is placed below its fair share in the process. */ protected void preemptResources(Resource toPreempt) { + long start = getClock().getTime(); if (Resources.equals(toPreempt, Resources.none())) { return; } @@ -448,6 +458,9 @@ public class FairScheduler extends } } } + + long duration = getClock().getTime() - start; + fsOpDurations.addPreemptCallDuration(duration); } protected void warnOrKillContainer(RMContainer container) { @@ -463,7 +476,7 @@ public class FairScheduler extends if (time != null) { // if we asked for preemption more than maxWaitTimeBeforeKill ms ago, // proceed with kill - if (time + waitTimeBeforeKill < clock.getTime()) { + if (time + waitTimeBeforeKill < getClock().getTime()) { ContainerStatus status = SchedulerUtils.createPreemptedContainerStatus( container.getContainerId(), SchedulerUtils.PREEMPTED_CONTAINER); @@ -474,11 +487,11 @@ public class FairScheduler extends completedContainer(container, status, RMContainerEventType.KILL); LOG.info("Killing container" + container + " (after waiting for premption for " + - (clock.getTime() - time) + "ms)"); + (getClock().getTime() - time) + "ms)"); } } else { // track the request in the FSSchedulerApp itself - app.addPreemption(container, clock.getTime()); + app.addPreemption(container, getClock().getTime()); } } @@ -659,7 +672,7 @@ public class FairScheduler extends rmContext); if (transferStateFromPreviousAttempt) { attempt.transferStateFromPreviousAttempt(application - .getCurrentAppAttempt()); + .getCurrentAppAttempt()); } application.setCurrentAppAttempt(attempt); @@ -960,6 +973,7 @@ public class FairScheduler extends * Process a heartbeat update from a node. */ private synchronized void nodeUpdate(RMNode nm) { + long start = getClock().getTime(); if (LOG.isDebugEnabled()) { LOG.debug("nodeUpdate: " + nm + " cluster capacity: " + clusterResource); } @@ -996,9 +1010,13 @@ public class FairScheduler extends } else { attemptScheduling(node); } + + long duration = getClock().getTime() - start; + fsOpDurations.addNodeUpdateDuration(duration); } void continuousSchedulingAttempt() throws InterruptedException { + long start = getClock().getTime(); List nodeIdList = new ArrayList(nodes.keySet()); // Sort the nodes by space available on them, so that we offer // containers on emptier nodes first, facilitating an even spread. This @@ -1021,6 +1039,9 @@ public class FairScheduler extends ": " + ex.toString(), ex); } } + + long duration = getClock().getTime() - start; + fsOpDurations.addContinuousSchedulingRunDuration(duration); } /** Sort nodes by available resource */ @@ -1244,6 +1265,8 @@ public class FairScheduler extends } rootMetrics = FSQueueMetrics.forQueue("root", null, true, conf); + fsOpDurations = FSOpDurations.getInstance(true); + // This stores per-application scheduling information this.applications = new ConcurrentHashMap>(); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java index 23c928c8f37..f3289ccea1c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java @@ -18,6 +18,7 @@ package org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair; +import org.apache.hadoop.metrics2.impl.MetricsCollectorImpl; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertNotEquals; @@ -3366,4 +3367,14 @@ public class TestFairScheduler extends FairSchedulerTestBase { assertNotEquals("One of the threads is still alive", 0, numRetries); } + + @Test + public void testPerfMetricsInited() { + scheduler.init(conf); + scheduler.start(); + MetricsCollectorImpl collector = new MetricsCollectorImpl(); + scheduler.fsOpDurations.getMetrics(collector, true); + assertEquals("Incorrect number of perf metrics", 1, + collector.getRecords().size()); + } }