YARN-2352. FairScheduler: Collect metrics on duration of critical methods that affect performance. (kasha)

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1616769 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Karthik Kambatla 2014-08-08 14:17:54 +00:00
parent d3a2fe2807
commit 14864e9c7c
6 changed files with 64 additions and 8 deletions

View File

@ -21,14 +21,18 @@ package org.apache.hadoop.metrics2.impl;
import java.util.Iterator; import java.util.Iterator;
import java.util.List; import java.util.List;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.metrics2.MetricsInfo; import org.apache.hadoop.metrics2.MetricsInfo;
import org.apache.hadoop.metrics2.MetricsCollector; import org.apache.hadoop.metrics2.MetricsCollector;
import org.apache.hadoop.metrics2.MetricsFilter; import org.apache.hadoop.metrics2.MetricsFilter;
import static org.apache.hadoop.metrics2.lib.Interns.*; import static org.apache.hadoop.metrics2.lib.Interns.*;
class MetricsCollectorImpl implements MetricsCollector, @InterfaceAudience.Private
@VisibleForTesting
public class MetricsCollectorImpl implements MetricsCollector,
Iterable<MetricsRecordBuilderImpl> { Iterable<MetricsRecordBuilderImpl> {
private final List<MetricsRecordBuilderImpl> rbs = Lists.newArrayList(); private final List<MetricsRecordBuilderImpl> rbs = Lists.newArrayList();

View File

@ -89,6 +89,14 @@ public class MutableStat extends MutableMetric {
this(name, description, sampleName, valueName, false); this(name, description, sampleName, valueName, false);
} }
/**
* Set whether to display the extended stats (stdev, min/max etc.) or not.
* The method is declared synchronized, so the flag is written while
* holding this object's monitor.
* @param extended true to enable, false to disable displaying extended stats
*/
public synchronized void setExtended(boolean extended) {
this.extended = extended;
}
/** /**
* Add a number of samples and their sum to the running stat * Add a number of samples and their sum to the running stat
* @param numSamples number of samples * @param numSamples number of samples

View File

@ -94,6 +94,9 @@ Release 2.6.0 - UNRELEASED
YARN-2288. Made persisted data in LevelDB timeline store be versioned. (Junping Du YARN-2288. Made persisted data in LevelDB timeline store be versioned. (Junping Du
via zjshen) via zjshen)
YARN-2352. FairScheduler: Collect metrics on duration of critical methods that
affect performance. (kasha)
OPTIMIZATIONS OPTIMIZATIONS
BUG FIXES BUG FIXES

View File

@ -200,6 +200,13 @@
<Field name="updateInterval" /> <Field name="updateInterval" />
<Bug pattern="IS2_INCONSISTENT_SYNC" /> <Bug pattern="IS2_INCONSISTENT_SYNC" />
</Match> </Match>
<!-- Inconsistent sync warning - fsOpDurations is only initialized once and never changed -->
<Match>
<Class name="org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler" />
<Field name="fsOpDurations" />
<Bug pattern="IS2_INCONSISTENT_SYNC" />
</Match>
<!-- Inconsistent sync warning - numRetries is only initialized once and never changed --> <!-- Inconsistent sync warning - numRetries is only initialized once and never changed -->
<Match> <Match>
<Class name="org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore" /> <Class name="org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore" />

View File

@ -149,6 +149,7 @@ public class FairScheduler extends
// Aggregate metrics // Aggregate metrics
FSQueueMetrics rootMetrics; FSQueueMetrics rootMetrics;
FSOpDurations fsOpDurations;
// Time when we last updated preemption vars // Time when we last updated preemption vars
protected long lastPreemptionUpdateTime; protected long lastPreemptionUpdateTime;
@ -256,8 +257,11 @@ public class FairScheduler extends
while (!Thread.currentThread().isInterrupted()) { while (!Thread.currentThread().isInterrupted()) {
try { try {
Thread.sleep(updateInterval); Thread.sleep(updateInterval);
long start = getClock().getTime();
update(); update();
preemptTasksIfNecessary(); preemptTasksIfNecessary();
long duration = getClock().getTime() - start;
fsOpDurations.addUpdateThreadRunDuration(duration);
} catch (InterruptedException ie) { } catch (InterruptedException ie) {
LOG.warn("Update thread interrupted. Exiting."); LOG.warn("Update thread interrupted. Exiting.");
return; return;
@ -294,6 +298,7 @@ public class FairScheduler extends
* required resources per job. * required resources per job.
*/ */
protected synchronized void update() { protected synchronized void update() {
long start = getClock().getTime();
updatePreemptionVariables(); // Determine if any queues merit preemption updatePreemptionVariables(); // Determine if any queues merit preemption
FSQueue rootQueue = queueMgr.getRootQueue(); FSQueue rootQueue = queueMgr.getRootQueue();
@ -317,6 +322,9 @@ public class FairScheduler extends
" Demand: " + rootQueue.getDemand()); " Demand: " + rootQueue.getDemand());
} }
} }
long duration = getClock().getTime() - start;
fsOpDurations.addUpdateCallDuration(duration);
} }
/** /**
@ -325,7 +333,7 @@ public class FairScheduler extends
* for each type of task. * for each type of task.
*/ */
private void updatePreemptionVariables() { private void updatePreemptionVariables() {
long now = clock.getTime(); long now = getClock().getTime();
lastPreemptionUpdateTime = now; lastPreemptionUpdateTime = now;
for (FSLeafQueue sched : queueMgr.getLeafQueues()) { for (FSLeafQueue sched : queueMgr.getLeafQueues()) {
if (!isStarvedForMinShare(sched)) { if (!isStarvedForMinShare(sched)) {
@ -352,7 +360,8 @@ public class FairScheduler extends
* defined as being below half its fair share. * defined as being below half its fair share.
*/ */
boolean isStarvedForFairShare(FSLeafQueue sched) { boolean isStarvedForFairShare(FSLeafQueue sched) {
Resource desiredFairShare = Resources.min(RESOURCE_CALCULATOR, clusterResource, Resource desiredFairShare = Resources.min(RESOURCE_CALCULATOR,
clusterResource,
Resources.multiply(sched.getFairShare(), .5), sched.getDemand()); Resources.multiply(sched.getFairShare(), .5), sched.getDemand());
return Resources.lessThan(RESOURCE_CALCULATOR, clusterResource, return Resources.lessThan(RESOURCE_CALCULATOR, clusterResource,
sched.getResourceUsage(), desiredFairShare); sched.getResourceUsage(), desiredFairShare);
@ -370,7 +379,7 @@ public class FairScheduler extends
return; return;
} }
long curTime = clock.getTime(); long curTime = getClock().getTime();
if (curTime - lastPreemptCheckTime < preemptionInterval) { if (curTime - lastPreemptCheckTime < preemptionInterval) {
return; return;
} }
@ -398,6 +407,7 @@ public class FairScheduler extends
* We make sure that no queue is placed below its fair share in the process. * We make sure that no queue is placed below its fair share in the process.
*/ */
protected void preemptResources(Resource toPreempt) { protected void preemptResources(Resource toPreempt) {
long start = getClock().getTime();
if (Resources.equals(toPreempt, Resources.none())) { if (Resources.equals(toPreempt, Resources.none())) {
return; return;
} }
@ -448,6 +458,9 @@ public class FairScheduler extends
} }
} }
} }
long duration = getClock().getTime() - start;
fsOpDurations.addPreemptCallDuration(duration);
} }
protected void warnOrKillContainer(RMContainer container) { protected void warnOrKillContainer(RMContainer container) {
@ -463,7 +476,7 @@ public class FairScheduler extends
if (time != null) { if (time != null) {
// if we asked for preemption more than maxWaitTimeBeforeKill ms ago, // if we asked for preemption more than maxWaitTimeBeforeKill ms ago,
// proceed with kill // proceed with kill
if (time + waitTimeBeforeKill < clock.getTime()) { if (time + waitTimeBeforeKill < getClock().getTime()) {
ContainerStatus status = ContainerStatus status =
SchedulerUtils.createPreemptedContainerStatus( SchedulerUtils.createPreemptedContainerStatus(
container.getContainerId(), SchedulerUtils.PREEMPTED_CONTAINER); container.getContainerId(), SchedulerUtils.PREEMPTED_CONTAINER);
@ -474,11 +487,11 @@ public class FairScheduler extends
completedContainer(container, status, RMContainerEventType.KILL); completedContainer(container, status, RMContainerEventType.KILL);
LOG.info("Killing container" + container + LOG.info("Killing container" + container +
" (after waiting for premption for " + " (after waiting for premption for " +
(clock.getTime() - time) + "ms)"); (getClock().getTime() - time) + "ms)");
} }
} else { } else {
// track the request in the FSSchedulerApp itself // track the request in the FSSchedulerApp itself
app.addPreemption(container, clock.getTime()); app.addPreemption(container, getClock().getTime());
} }
} }
@ -659,7 +672,7 @@ public class FairScheduler extends
rmContext); rmContext);
if (transferStateFromPreviousAttempt) { if (transferStateFromPreviousAttempt) {
attempt.transferStateFromPreviousAttempt(application attempt.transferStateFromPreviousAttempt(application
.getCurrentAppAttempt()); .getCurrentAppAttempt());
} }
application.setCurrentAppAttempt(attempt); application.setCurrentAppAttempt(attempt);
@ -960,6 +973,7 @@ public class FairScheduler extends
* Process a heartbeat update from a node. * Process a heartbeat update from a node.
*/ */
private synchronized void nodeUpdate(RMNode nm) { private synchronized void nodeUpdate(RMNode nm) {
long start = getClock().getTime();
if (LOG.isDebugEnabled()) { if (LOG.isDebugEnabled()) {
LOG.debug("nodeUpdate: " + nm + " cluster capacity: " + clusterResource); LOG.debug("nodeUpdate: " + nm + " cluster capacity: " + clusterResource);
} }
@ -996,9 +1010,13 @@ public class FairScheduler extends
} else { } else {
attemptScheduling(node); attemptScheduling(node);
} }
long duration = getClock().getTime() - start;
fsOpDurations.addNodeUpdateDuration(duration);
} }
void continuousSchedulingAttempt() throws InterruptedException { void continuousSchedulingAttempt() throws InterruptedException {
long start = getClock().getTime();
List<NodeId> nodeIdList = new ArrayList<NodeId>(nodes.keySet()); List<NodeId> nodeIdList = new ArrayList<NodeId>(nodes.keySet());
// Sort the nodes by space available on them, so that we offer // Sort the nodes by space available on them, so that we offer
// containers on emptier nodes first, facilitating an even spread. This // containers on emptier nodes first, facilitating an even spread. This
@ -1021,6 +1039,9 @@ public class FairScheduler extends
": " + ex.toString(), ex); ": " + ex.toString(), ex);
} }
} }
long duration = getClock().getTime() - start;
fsOpDurations.addContinuousSchedulingRunDuration(duration);
} }
/** Sort nodes by available resource */ /** Sort nodes by available resource */
@ -1244,6 +1265,8 @@ public class FairScheduler extends
} }
rootMetrics = FSQueueMetrics.forQueue("root", null, true, conf); rootMetrics = FSQueueMetrics.forQueue("root", null, true, conf);
fsOpDurations = FSOpDurations.getInstance(true);
// This stores per-application scheduling information // This stores per-application scheduling information
this.applications = this.applications =
new ConcurrentHashMap<ApplicationId,SchedulerApplication<FSSchedulerApp>>(); new ConcurrentHashMap<ApplicationId,SchedulerApplication<FSSchedulerApp>>();

View File

@ -18,6 +18,7 @@
package org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair; package org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair;
import org.apache.hadoop.metrics2.impl.MetricsCollectorImpl;
import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotEquals; import static org.junit.Assert.assertNotEquals;
@ -3366,4 +3367,14 @@ public class TestFairScheduler extends FairSchedulerTestBase {
assertNotEquals("One of the threads is still alive", 0, numRetries); assertNotEquals("One of the threads is still alive", 0, numRetries);
} }
/**
* Checks that the perf metrics exposed through scheduler.fsOpDurations
* can be collected after the scheduler has been initialized and started.
*/
@Test
public void testPerfMetricsInited() {
scheduler.init(conf);
scheduler.start();
// Snapshot the metrics into a fresh collector.
// NOTE(review): the second argument presumably requests all metrics rather
// than only changed ones - confirm against getMetrics.
MetricsCollectorImpl collector = new MetricsCollectorImpl();
scheduler.fsOpDurations.getMetrics(collector, true);
// Exactly one metrics record is expected from fsOpDurations.
assertEquals("Incorrect number of perf metrics", 1,
collector.getRecords().size());
}
} }