YARN-3259. FairScheduler: Trigger fairShare updates on node events. (Anubhav Dhoot via kasha)
This commit is contained in:
parent
6786daab33
commit
75885852cc
|
@ -306,6 +306,9 @@ Release 2.8.0 - UNRELEASED
|
||||||
YARN-3547. FairScheduler: Apps that have no resource demand should not participate
|
YARN-3547. FairScheduler: Apps that have no resource demand should not participate
|
||||||
scheduling. (Xianyin Xin via kasha)
|
scheduling. (Xianyin Xin via kasha)
|
||||||
|
|
||||||
|
YARN-3259. FairScheduler: Trigger fairShare updates on node events.
|
||||||
|
(Anubhav Dhoot via kasha)
|
||||||
|
|
||||||
BUG FIXES
|
BUG FIXES
|
||||||
|
|
||||||
YARN-3197. Confusing log generated by CapacityScheduler. (Varun Saxena
|
YARN-3197. Confusing log generated by CapacityScheduler. (Varun Saxena
|
||||||
|
|
|
@ -17,6 +17,7 @@
|
||||||
*/
|
*/
|
||||||
package org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair;
|
package org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair;
|
||||||
|
|
||||||
|
import com.google.common.annotations.VisibleForTesting;
|
||||||
import org.apache.hadoop.classification.InterfaceAudience;
|
import org.apache.hadoop.classification.InterfaceAudience;
|
||||||
import org.apache.hadoop.classification.InterfaceStability;
|
import org.apache.hadoop.classification.InterfaceStability;
|
||||||
import org.apache.hadoop.metrics2.MetricsCollector;
|
import org.apache.hadoop.metrics2.MetricsCollector;
|
||||||
|
@ -116,4 +117,9 @@ public class FSOpDurations implements MetricsSource {
|
||||||
public void addPreemptCallDuration(long value) {
|
public void addPreemptCallDuration(long value) {
|
||||||
preemptCall.add(value);
|
preemptCall.add(value);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@VisibleForTesting
|
||||||
|
public boolean hasUpdateThreadRunChanged() {
|
||||||
|
return updateThreadRun.changed();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -103,8 +103,8 @@ import com.google.common.base.Preconditions;
|
||||||
* of the root queue in the typical fair scheduling fashion. Then, the children
|
* of the root queue in the typical fair scheduling fashion. Then, the children
|
||||||
* distribute the resources assigned to them to their children in the same
|
* distribute the resources assigned to them to their children in the same
|
||||||
* fashion. Applications may only be scheduled on leaf queues. Queues can be
|
* fashion. Applications may only be scheduled on leaf queues. Queues can be
|
||||||
* specified as children of other queues by placing them as sub-elements of their
|
* specified as children of other queues by placing them as sub-elements of
|
||||||
* parents in the fair scheduler configuration file.
|
* their parents in the fair scheduler configuration file.
|
||||||
*
|
*
|
||||||
* A queue's name starts with the names of its parents, with periods as
|
* A queue's name starts with the names of its parents, with periods as
|
||||||
* separators. So a queue named "queue1" under the root named, would be
|
* separators. So a queue named "queue1" under the root named, would be
|
||||||
|
@ -142,6 +142,8 @@ public class FairScheduler extends
|
||||||
@VisibleForTesting
|
@VisibleForTesting
|
||||||
Thread updateThread;
|
Thread updateThread;
|
||||||
|
|
||||||
|
private final Object updateThreadMonitor = new Object();
|
||||||
|
|
||||||
@VisibleForTesting
|
@VisibleForTesting
|
||||||
Thread schedulingThread;
|
Thread schedulingThread;
|
||||||
// timeout to join when we stop this service
|
// timeout to join when we stop this service
|
||||||
|
@ -246,6 +248,13 @@ public class FairScheduler extends
|
||||||
return queueMgr;
|
return queueMgr;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Allows UpdateThread to start processing without waiting till updateInterval
|
||||||
|
void triggerUpdate() {
|
||||||
|
synchronized (updateThreadMonitor) {
|
||||||
|
updateThreadMonitor.notify();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Thread which calls {@link FairScheduler#update()} every
|
* Thread which calls {@link FairScheduler#update()} every
|
||||||
* <code>updateInterval</code> milliseconds.
|
* <code>updateInterval</code> milliseconds.
|
||||||
|
@ -256,7 +265,9 @@ public class FairScheduler extends
|
||||||
public void run() {
|
public void run() {
|
||||||
while (!Thread.currentThread().isInterrupted()) {
|
while (!Thread.currentThread().isInterrupted()) {
|
||||||
try {
|
try {
|
||||||
Thread.sleep(updateInterval);
|
synchronized (updateThreadMonitor) {
|
||||||
|
updateThreadMonitor.wait(updateInterval);
|
||||||
|
}
|
||||||
long start = getClock().getTime();
|
long start = getClock().getTime();
|
||||||
update();
|
update();
|
||||||
preemptTasksIfNecessary();
|
preemptTasksIfNecessary();
|
||||||
|
@ -838,6 +849,8 @@ public class FairScheduler extends
|
||||||
updateRootQueueMetrics();
|
updateRootQueueMetrics();
|
||||||
updateMaximumAllocation(schedulerNode, true);
|
updateMaximumAllocation(schedulerNode, true);
|
||||||
|
|
||||||
|
triggerUpdate();
|
||||||
|
|
||||||
queueMgr.getRootQueue().setSteadyFairShare(clusterResource);
|
queueMgr.getRootQueue().setSteadyFairShare(clusterResource);
|
||||||
queueMgr.getRootQueue().recomputeSteadyShares();
|
queueMgr.getRootQueue().recomputeSteadyShares();
|
||||||
LOG.info("Added node " + node.getNodeAddress() +
|
LOG.info("Added node " + node.getNodeAddress() +
|
||||||
|
@ -853,6 +866,8 @@ public class FairScheduler extends
|
||||||
Resources.subtractFrom(clusterResource, rmNode.getTotalCapability());
|
Resources.subtractFrom(clusterResource, rmNode.getTotalCapability());
|
||||||
updateRootQueueMetrics();
|
updateRootQueueMetrics();
|
||||||
|
|
||||||
|
triggerUpdate();
|
||||||
|
|
||||||
// Remove running containers
|
// Remove running containers
|
||||||
List<RMContainer> runningContainers = node.getRunningContainers();
|
List<RMContainer> runningContainers = node.getRunningContainers();
|
||||||
for (RMContainer container : runningContainers) {
|
for (RMContainer container : runningContainers) {
|
||||||
|
|
|
@ -0,0 +1,135 @@
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair;
|
||||||
|
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.metrics2.AbstractMetric;
|
||||||
|
import org.apache.hadoop.metrics2.MetricsRecord;
|
||||||
|
import org.apache.hadoop.metrics2.impl.MetricsCollectorImpl;
|
||||||
|
import org.apache.hadoop.yarn.server.resourcemanager.MockNodes;
|
||||||
|
import org.apache.hadoop.yarn.server.resourcemanager.MockRM;
|
||||||
|
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
|
||||||
|
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeAddedSchedulerEvent;
|
||||||
|
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeRemovedSchedulerEvent;
|
||||||
|
import org.apache.hadoop.yarn.util.resource.Resources;
|
||||||
|
import org.junit.After;
|
||||||
|
import org.junit.Before;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import static org.junit.Assert.assertEquals;
|
||||||
|
import static org.junit.Assert.assertFalse;
|
||||||
|
import static org.junit.Assert.assertTrue;
|
||||||
|
|
||||||
|
public class TestSchedulingUpdate extends FairSchedulerTestBase {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Configuration createConfiguration() {
|
||||||
|
Configuration conf = super.createConfiguration();
|
||||||
|
|
||||||
|
// Make the update loop to never finish to ensure zero update calls
|
||||||
|
conf.setInt(
|
||||||
|
FairSchedulerConfiguration.UPDATE_INTERVAL_MS,
|
||||||
|
Integer.MAX_VALUE);
|
||||||
|
return conf;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Before
|
||||||
|
public void setup() {
|
||||||
|
conf = createConfiguration();
|
||||||
|
resourceManager = new MockRM(conf);
|
||||||
|
resourceManager.start();
|
||||||
|
|
||||||
|
scheduler = (FairScheduler) resourceManager.getResourceScheduler();
|
||||||
|
}
|
||||||
|
|
||||||
|
@After
|
||||||
|
public void teardown() {
|
||||||
|
if (resourceManager != null) {
|
||||||
|
resourceManager.stop();
|
||||||
|
resourceManager = null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test (timeout = 3000)
|
||||||
|
public void testSchedulingUpdateOnNodeJoinLeave() throws InterruptedException {
|
||||||
|
|
||||||
|
verifyNoCalls();
|
||||||
|
|
||||||
|
// Add one node
|
||||||
|
String host = "127.0.0.1";
|
||||||
|
final int memory = 4096;
|
||||||
|
final int cores = 4;
|
||||||
|
RMNode node1 = MockNodes.newNodeInfo(
|
||||||
|
1, Resources.createResource(memory, cores), 1, host);
|
||||||
|
NodeAddedSchedulerEvent nodeEvent1 = new NodeAddedSchedulerEvent(node1);
|
||||||
|
scheduler.handle(nodeEvent1);
|
||||||
|
|
||||||
|
long expectedCalls = 1;
|
||||||
|
verifyExpectedCalls(expectedCalls, memory, cores);
|
||||||
|
|
||||||
|
// Remove the node
|
||||||
|
NodeRemovedSchedulerEvent nodeEvent2 = new NodeRemovedSchedulerEvent(node1);
|
||||||
|
scheduler.handle(nodeEvent2);
|
||||||
|
|
||||||
|
expectedCalls = 2;
|
||||||
|
verifyExpectedCalls(expectedCalls, 0, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void verifyExpectedCalls(long expectedCalls, int memory, int vcores)
|
||||||
|
throws InterruptedException {
|
||||||
|
boolean verified = false;
|
||||||
|
int count = 0;
|
||||||
|
while (count < 100) {
|
||||||
|
if (scheduler.fsOpDurations.hasUpdateThreadRunChanged()) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
count++;
|
||||||
|
Thread.sleep(10);
|
||||||
|
}
|
||||||
|
assertTrue("Update Thread has not run based on its metrics",
|
||||||
|
scheduler.fsOpDurations.hasUpdateThreadRunChanged());
|
||||||
|
assertEquals("Root queue metrics memory does not have expected value",
|
||||||
|
memory, scheduler.getRootQueueMetrics().getAvailableMB());
|
||||||
|
assertEquals("Root queue metrics cpu does not have expected value",
|
||||||
|
vcores, scheduler.getRootQueueMetrics().getAvailableVirtualCores());
|
||||||
|
|
||||||
|
MetricsCollectorImpl collector = new MetricsCollectorImpl();
|
||||||
|
scheduler.fsOpDurations.getMetrics(collector, true);
|
||||||
|
MetricsRecord record = collector.getRecords().get(0);
|
||||||
|
for (AbstractMetric abstractMetric : record.metrics()) {
|
||||||
|
if (abstractMetric.name().contains("UpdateThreadRunNumOps")) {
|
||||||
|
assertEquals("Update Thread did not run expected number of times " +
|
||||||
|
"based on metric record count",
|
||||||
|
expectedCalls,
|
||||||
|
abstractMetric.value());
|
||||||
|
verified = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assertTrue("Did not find metric for UpdateThreadRunNumOps", verified);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void verifyNoCalls() {
|
||||||
|
assertFalse("Update thread should not have executed",
|
||||||
|
scheduler.fsOpDurations.hasUpdateThreadRunChanged());
|
||||||
|
assertEquals("Scheduler queue memory should not have been updated",
|
||||||
|
0, scheduler.getRootQueueMetrics().getAvailableMB());
|
||||||
|
assertEquals("Scheduler queue cpu should not have been updated",
|
||||||
|
0,scheduler.getRootQueueMetrics().getAvailableVirtualCores());
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue