YARN-2352. FairScheduler: Collect metrics on duration of critical methods that affect performance. (kasha)
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1616769 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
d3a2fe2807
commit
14864e9c7c
|
@ -21,14 +21,18 @@ package org.apache.hadoop.metrics2.impl;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
import com.google.common.annotations.VisibleForTesting;
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
|
|
||||||
|
import org.apache.hadoop.classification.InterfaceAudience;
|
||||||
import org.apache.hadoop.metrics2.MetricsInfo;
|
import org.apache.hadoop.metrics2.MetricsInfo;
|
||||||
import org.apache.hadoop.metrics2.MetricsCollector;
|
import org.apache.hadoop.metrics2.MetricsCollector;
|
||||||
import org.apache.hadoop.metrics2.MetricsFilter;
|
import org.apache.hadoop.metrics2.MetricsFilter;
|
||||||
import static org.apache.hadoop.metrics2.lib.Interns.*;
|
import static org.apache.hadoop.metrics2.lib.Interns.*;
|
||||||
|
|
||||||
class MetricsCollectorImpl implements MetricsCollector,
|
@InterfaceAudience.Private
|
||||||
|
@VisibleForTesting
|
||||||
|
public class MetricsCollectorImpl implements MetricsCollector,
|
||||||
Iterable<MetricsRecordBuilderImpl> {
|
Iterable<MetricsRecordBuilderImpl> {
|
||||||
|
|
||||||
private final List<MetricsRecordBuilderImpl> rbs = Lists.newArrayList();
|
private final List<MetricsRecordBuilderImpl> rbs = Lists.newArrayList();
|
||||||
|
|
|
@ -89,6 +89,14 @@ public class MutableStat extends MutableMetric {
|
||||||
this(name, description, sampleName, valueName, false);
|
this(name, description, sampleName, valueName, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set whether to display the extended stats (stdev, min/max etc.) or not
|
||||||
|
* @param extended enable/disable displaying extended stats
|
||||||
|
*/
|
||||||
|
public synchronized void setExtended(boolean extended) {
|
||||||
|
this.extended = extended;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Add a number of samples and their sum to the running stat
|
* Add a number of samples and their sum to the running stat
|
||||||
* @param numSamples number of samples
|
* @param numSamples number of samples
|
||||||
|
|
|
@ -94,6 +94,9 @@ Release 2.6.0 - UNRELEASED
|
||||||
YARN-2288. Made persisted data in LevelDB timeline store be versioned. (Junping Du
|
YARN-2288. Made persisted data in LevelDB timeline store be versioned. (Junping Du
|
||||||
via zjshen)
|
via zjshen)
|
||||||
|
|
||||||
|
YARN-2352. FairScheduler: Collect metrics on duration of critical methods that
|
||||||
|
affect performance. (kasha)
|
||||||
|
|
||||||
OPTIMIZATIONS
|
OPTIMIZATIONS
|
||||||
|
|
||||||
BUG FIXES
|
BUG FIXES
|
||||||
|
|
|
@ -200,6 +200,13 @@
|
||||||
<Field name="updateInterval" />
|
<Field name="updateInterval" />
|
||||||
<Bug pattern="IS2_INCONSISTENT_SYNC" />
|
<Bug pattern="IS2_INCONSISTENT_SYNC" />
|
||||||
</Match>
|
</Match>
|
||||||
|
<!-- Inconsistent sync warning - fsOpDurations is only initialized once and never changed -->
|
||||||
|
<Match>
|
||||||
|
<Class name="org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler" />
|
||||||
|
<Field name="fsOpDurations" />
|
||||||
|
<Bug pattern="IS2_INCONSISTENT_SYNC" />
|
||||||
|
</Match>
|
||||||
|
|
||||||
<!-- Inconsistent sync warning - numRetries is only initialized once and never changed -->
|
<!-- Inconsistent sync warning - numRetries is only initialized once and never changed -->
|
||||||
<Match>
|
<Match>
|
||||||
<Class name="org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore" />
|
<Class name="org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore" />
|
||||||
|
|
|
@ -149,6 +149,7 @@ public class FairScheduler extends
|
||||||
|
|
||||||
// Aggregate metrics
|
// Aggregate metrics
|
||||||
FSQueueMetrics rootMetrics;
|
FSQueueMetrics rootMetrics;
|
||||||
|
FSOpDurations fsOpDurations;
|
||||||
|
|
||||||
// Time when we last updated preemption vars
|
// Time when we last updated preemption vars
|
||||||
protected long lastPreemptionUpdateTime;
|
protected long lastPreemptionUpdateTime;
|
||||||
|
@ -256,8 +257,11 @@ public class FairScheduler extends
|
||||||
while (!Thread.currentThread().isInterrupted()) {
|
while (!Thread.currentThread().isInterrupted()) {
|
||||||
try {
|
try {
|
||||||
Thread.sleep(updateInterval);
|
Thread.sleep(updateInterval);
|
||||||
|
long start = getClock().getTime();
|
||||||
update();
|
update();
|
||||||
preemptTasksIfNecessary();
|
preemptTasksIfNecessary();
|
||||||
|
long duration = getClock().getTime() - start;
|
||||||
|
fsOpDurations.addUpdateThreadRunDuration(duration);
|
||||||
} catch (InterruptedException ie) {
|
} catch (InterruptedException ie) {
|
||||||
LOG.warn("Update thread interrupted. Exiting.");
|
LOG.warn("Update thread interrupted. Exiting.");
|
||||||
return;
|
return;
|
||||||
|
@ -294,6 +298,7 @@ public class FairScheduler extends
|
||||||
* required resources per job.
|
* required resources per job.
|
||||||
*/
|
*/
|
||||||
protected synchronized void update() {
|
protected synchronized void update() {
|
||||||
|
long start = getClock().getTime();
|
||||||
updatePreemptionVariables(); // Determine if any queues merit preemption
|
updatePreemptionVariables(); // Determine if any queues merit preemption
|
||||||
|
|
||||||
FSQueue rootQueue = queueMgr.getRootQueue();
|
FSQueue rootQueue = queueMgr.getRootQueue();
|
||||||
|
@ -317,6 +322,9 @@ public class FairScheduler extends
|
||||||
" Demand: " + rootQueue.getDemand());
|
" Demand: " + rootQueue.getDemand());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
long duration = getClock().getTime() - start;
|
||||||
|
fsOpDurations.addUpdateCallDuration(duration);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -325,7 +333,7 @@ public class FairScheduler extends
|
||||||
* for each type of task.
|
* for each type of task.
|
||||||
*/
|
*/
|
||||||
private void updatePreemptionVariables() {
|
private void updatePreemptionVariables() {
|
||||||
long now = clock.getTime();
|
long now = getClock().getTime();
|
||||||
lastPreemptionUpdateTime = now;
|
lastPreemptionUpdateTime = now;
|
||||||
for (FSLeafQueue sched : queueMgr.getLeafQueues()) {
|
for (FSLeafQueue sched : queueMgr.getLeafQueues()) {
|
||||||
if (!isStarvedForMinShare(sched)) {
|
if (!isStarvedForMinShare(sched)) {
|
||||||
|
@ -352,7 +360,8 @@ public class FairScheduler extends
|
||||||
* defined as being below half its fair share.
|
* defined as being below half its fair share.
|
||||||
*/
|
*/
|
||||||
boolean isStarvedForFairShare(FSLeafQueue sched) {
|
boolean isStarvedForFairShare(FSLeafQueue sched) {
|
||||||
Resource desiredFairShare = Resources.min(RESOURCE_CALCULATOR, clusterResource,
|
Resource desiredFairShare = Resources.min(RESOURCE_CALCULATOR,
|
||||||
|
clusterResource,
|
||||||
Resources.multiply(sched.getFairShare(), .5), sched.getDemand());
|
Resources.multiply(sched.getFairShare(), .5), sched.getDemand());
|
||||||
return Resources.lessThan(RESOURCE_CALCULATOR, clusterResource,
|
return Resources.lessThan(RESOURCE_CALCULATOR, clusterResource,
|
||||||
sched.getResourceUsage(), desiredFairShare);
|
sched.getResourceUsage(), desiredFairShare);
|
||||||
|
@ -370,7 +379,7 @@ public class FairScheduler extends
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
long curTime = clock.getTime();
|
long curTime = getClock().getTime();
|
||||||
if (curTime - lastPreemptCheckTime < preemptionInterval) {
|
if (curTime - lastPreemptCheckTime < preemptionInterval) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -398,6 +407,7 @@ public class FairScheduler extends
|
||||||
* We make sure that no queue is placed below its fair share in the process.
|
* We make sure that no queue is placed below its fair share in the process.
|
||||||
*/
|
*/
|
||||||
protected void preemptResources(Resource toPreempt) {
|
protected void preemptResources(Resource toPreempt) {
|
||||||
|
long start = getClock().getTime();
|
||||||
if (Resources.equals(toPreempt, Resources.none())) {
|
if (Resources.equals(toPreempt, Resources.none())) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -448,6 +458,9 @@ public class FairScheduler extends
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
long duration = getClock().getTime() - start;
|
||||||
|
fsOpDurations.addPreemptCallDuration(duration);
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void warnOrKillContainer(RMContainer container) {
|
protected void warnOrKillContainer(RMContainer container) {
|
||||||
|
@ -463,7 +476,7 @@ public class FairScheduler extends
|
||||||
if (time != null) {
|
if (time != null) {
|
||||||
// if we asked for preemption more than maxWaitTimeBeforeKill ms ago,
|
// if we asked for preemption more than maxWaitTimeBeforeKill ms ago,
|
||||||
// proceed with kill
|
// proceed with kill
|
||||||
if (time + waitTimeBeforeKill < clock.getTime()) {
|
if (time + waitTimeBeforeKill < getClock().getTime()) {
|
||||||
ContainerStatus status =
|
ContainerStatus status =
|
||||||
SchedulerUtils.createPreemptedContainerStatus(
|
SchedulerUtils.createPreemptedContainerStatus(
|
||||||
container.getContainerId(), SchedulerUtils.PREEMPTED_CONTAINER);
|
container.getContainerId(), SchedulerUtils.PREEMPTED_CONTAINER);
|
||||||
|
@ -474,11 +487,11 @@ public class FairScheduler extends
|
||||||
completedContainer(container, status, RMContainerEventType.KILL);
|
completedContainer(container, status, RMContainerEventType.KILL);
|
||||||
LOG.info("Killing container" + container +
|
LOG.info("Killing container" + container +
|
||||||
" (after waiting for premption for " +
|
" (after waiting for premption for " +
|
||||||
(clock.getTime() - time) + "ms)");
|
(getClock().getTime() - time) + "ms)");
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// track the request in the FSSchedulerApp itself
|
// track the request in the FSSchedulerApp itself
|
||||||
app.addPreemption(container, clock.getTime());
|
app.addPreemption(container, getClock().getTime());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -659,7 +672,7 @@ public class FairScheduler extends
|
||||||
rmContext);
|
rmContext);
|
||||||
if (transferStateFromPreviousAttempt) {
|
if (transferStateFromPreviousAttempt) {
|
||||||
attempt.transferStateFromPreviousAttempt(application
|
attempt.transferStateFromPreviousAttempt(application
|
||||||
.getCurrentAppAttempt());
|
.getCurrentAppAttempt());
|
||||||
}
|
}
|
||||||
application.setCurrentAppAttempt(attempt);
|
application.setCurrentAppAttempt(attempt);
|
||||||
|
|
||||||
|
@ -960,6 +973,7 @@ public class FairScheduler extends
|
||||||
* Process a heartbeat update from a node.
|
* Process a heartbeat update from a node.
|
||||||
*/
|
*/
|
||||||
private synchronized void nodeUpdate(RMNode nm) {
|
private synchronized void nodeUpdate(RMNode nm) {
|
||||||
|
long start = getClock().getTime();
|
||||||
if (LOG.isDebugEnabled()) {
|
if (LOG.isDebugEnabled()) {
|
||||||
LOG.debug("nodeUpdate: " + nm + " cluster capacity: " + clusterResource);
|
LOG.debug("nodeUpdate: " + nm + " cluster capacity: " + clusterResource);
|
||||||
}
|
}
|
||||||
|
@ -996,9 +1010,13 @@ public class FairScheduler extends
|
||||||
} else {
|
} else {
|
||||||
attemptScheduling(node);
|
attemptScheduling(node);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
long duration = getClock().getTime() - start;
|
||||||
|
fsOpDurations.addNodeUpdateDuration(duration);
|
||||||
}
|
}
|
||||||
|
|
||||||
void continuousSchedulingAttempt() throws InterruptedException {
|
void continuousSchedulingAttempt() throws InterruptedException {
|
||||||
|
long start = getClock().getTime();
|
||||||
List<NodeId> nodeIdList = new ArrayList<NodeId>(nodes.keySet());
|
List<NodeId> nodeIdList = new ArrayList<NodeId>(nodes.keySet());
|
||||||
// Sort the nodes by space available on them, so that we offer
|
// Sort the nodes by space available on them, so that we offer
|
||||||
// containers on emptier nodes first, facilitating an even spread. This
|
// containers on emptier nodes first, facilitating an even spread. This
|
||||||
|
@ -1021,6 +1039,9 @@ public class FairScheduler extends
|
||||||
": " + ex.toString(), ex);
|
": " + ex.toString(), ex);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
long duration = getClock().getTime() - start;
|
||||||
|
fsOpDurations.addContinuousSchedulingRunDuration(duration);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Sort nodes by available resource */
|
/** Sort nodes by available resource */
|
||||||
|
@ -1244,6 +1265,8 @@ public class FairScheduler extends
|
||||||
}
|
}
|
||||||
|
|
||||||
rootMetrics = FSQueueMetrics.forQueue("root", null, true, conf);
|
rootMetrics = FSQueueMetrics.forQueue("root", null, true, conf);
|
||||||
|
fsOpDurations = FSOpDurations.getInstance(true);
|
||||||
|
|
||||||
// This stores per-application scheduling information
|
// This stores per-application scheduling information
|
||||||
this.applications =
|
this.applications =
|
||||||
new ConcurrentHashMap<ApplicationId,SchedulerApplication<FSSchedulerApp>>();
|
new ConcurrentHashMap<ApplicationId,SchedulerApplication<FSSchedulerApp>>();
|
||||||
|
|
|
@ -18,6 +18,7 @@
|
||||||
|
|
||||||
package org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair;
|
package org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair;
|
||||||
|
|
||||||
|
import org.apache.hadoop.metrics2.impl.MetricsCollectorImpl;
|
||||||
import static org.junit.Assert.assertEquals;
|
import static org.junit.Assert.assertEquals;
|
||||||
import static org.junit.Assert.assertFalse;
|
import static org.junit.Assert.assertFalse;
|
||||||
import static org.junit.Assert.assertNotEquals;
|
import static org.junit.Assert.assertNotEquals;
|
||||||
|
@ -3366,4 +3367,14 @@ public class TestFairScheduler extends FairSchedulerTestBase {
|
||||||
|
|
||||||
assertNotEquals("One of the threads is still alive", 0, numRetries);
|
assertNotEquals("One of the threads is still alive", 0, numRetries);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testPerfMetricsInited() {
|
||||||
|
scheduler.init(conf);
|
||||||
|
scheduler.start();
|
||||||
|
MetricsCollectorImpl collector = new MetricsCollectorImpl();
|
||||||
|
scheduler.fsOpDurations.getMetrics(collector, true);
|
||||||
|
assertEquals("Incorrect number of perf metrics", 1,
|
||||||
|
collector.getRecords().size());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue