YARN-655. Fair scheduler metrics should subtract allocated memory from available memory. (sandyr via tucu)

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-2@1480810 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Alejandro Abdelnur 2013-05-09 22:17:36 +00:00
parent 5d11e610a4
commit de8b9c94a4
4 changed files with 56 additions and 4 deletions

View File

@ -289,6 +289,9 @@ Release 2.0.5-beta - UNRELEASED
YARN-637. FS: maxAssign is not honored. (kkambatl via tucu)
YARN-655. Fair scheduler metrics should subtract allocated memory from
available memory. (sandyr via tucu)
Release 2.0.4-alpha - 2013-04-25
INCOMPATIBLE CHANGES

View File

@ -43,6 +43,7 @@ import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.server.resourcemanager.resource.Resources;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState;
import org.apache.hadoop.yarn.util.BuilderUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -425,6 +426,10 @@ public class QueueMetrics implements MetricsSource {
/**
 * Reads the current value of the {@code appsFailed} metric.
 *
 * @return the value reported by {@code appsFailed.value()}
 */
public int getAppsFailed() {
  final int failedCount = appsFailed.value();
  return failedCount;
}
/**
 * Wraps the current value of the {@code allocatedMB} metric in a
 * {@link Resource}.
 *
 * @return a new {@link Resource} built by {@code BuilderUtils.newResource}
 *         from the {@code allocatedMB} value, with 0 passed as the second
 *         argument
 */
public Resource getAllocatedResources() {
  final int allocatedMemory = allocatedMB.value();
  return BuilderUtils.newResource(allocatedMemory, 0);
}
public int getAllocatedMB() {
return allocatedMB.value();

View File

@ -225,10 +225,6 @@ public class FairScheduler implements ResourceScheduler {
// Recursively compute fair shares for all queues
// and update metrics
rootQueue.recomputeShares();
// Update recorded capacity of root queue (child queues are updated
// when fair share is calculated).
rootMetrics.setAvailableResourcesToQueue(clusterCapacity);
}
/**
@ -617,6 +613,7 @@ public class FairScheduler implements ResourceScheduler {
} else {
application.containerCompleted(rmContainer, containerStatus, event);
node.releaseContainer(container);
updateRootQueueMetrics();
}
LOG.info("Application " + applicationAttemptId +
@ -628,6 +625,7 @@ public class FairScheduler implements ResourceScheduler {
private synchronized void addNode(RMNode node) {
nodes.put(node.getNodeID(), new FSSchedulerNode(node));
Resources.addTo(clusterCapacity, node.getTotalCapability());
updateRootQueueMetrics();
LOG.info("Added node " + node.getNodeAddress() +
" cluster capacity: " + clusterCapacity);
@ -636,6 +634,7 @@ public class FairScheduler implements ResourceScheduler {
private synchronized void removeNode(RMNode rmNode) {
FSSchedulerNode node = nodes.get(rmNode.getNodeID());
Resources.subtractFrom(clusterCapacity, rmNode.getTotalCapability());
updateRootQueueMetrics();
// Remove running containers
List<RMContainer> runningContainers = node.getRunningContainers();
@ -840,6 +839,7 @@ public class FairScheduler implements ResourceScheduler {
if ((assignedContainers >= maxAssign) && (maxAssign > 0)) { break; }
}
}
updateRootQueueMetrics();
}
@Override
@ -861,6 +861,18 @@ public class FairScheduler implements ResourceScheduler {
}
return new SchedulerAppReport(applications.get(appAttemptId));
}
/**
 * Sets the root queue's available resources to the cluster capacity minus
 * whatever the root queue metrics currently report as allocated.
 *
 * Fair shares are only recalculated at the update interval, so subqueue
 * metrics may lag slightly. The root queue metrics, however, need to be
 * refreshed synchronously with allocations and completions so that the
 * cluster-level metrics stay consistent.
 */
private void updateRootQueueMetrics() {
  rootMetrics.setAvailableResourcesToQueue(Resources.subtract(clusterCapacity,
      rootMetrics.getAllocatedResources()));
}
@Override
public QueueMetrics getRootQueueMetrics() {

View File

@ -67,6 +67,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppState;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState;
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAddedSchedulerEvent;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppRemovedSchedulerEvent;
@ -127,6 +128,7 @@ public class TestFairScheduler {
public void tearDown() {
  // Clear the statically held queue metrics so they do not carry over
  // between tests (presumably a shared registry; verify in QueueMetrics).
  QueueMetrics.clearQueueMetrics();

  // Drop the references to the per-test fixtures.
  resourceManager = null;
  scheduler = null;
}
private Configuration createConfiguration() {
@ -336,6 +338,13 @@ public class TestFairScheduler {
assertEquals(1024, scheduler.getQueueManager().getQueue("queue1").
getResourceUsage().getMemory());
// verify metrics
QueueMetrics queue1Metrics = scheduler.getQueueManager().getQueue("queue1")
.getMetrics();
assertEquals(1024, queue1Metrics.getAllocatedMB());
assertEquals(1024, scheduler.getRootQueueMetrics().getAllocatedMB());
assertEquals(512, scheduler.getRootQueueMetrics().getAvailableMB());
}
@Test (timeout = 5000)
@ -1245,6 +1254,7 @@ public class TestFairScheduler {
scheduler.handle(updateEvent);
assertEquals(1, app.getLiveContainers().size());
assertEquals(0, scheduler.getRootQueueMetrics().getAvailableMB());
// Create request at higher priority
createSchedulingRequestExistingApplication(1024, 1, attId);
@ -1260,6 +1270,7 @@ public class TestFairScheduler {
// Complete container
scheduler.allocate(attId, new ArrayList<ResourceRequest>(),
Arrays.asList(containerId));
assertEquals(1024, scheduler.getRootQueueMetrics().getAvailableMB());
// Schedule at opening
scheduler.update();
@ -1271,6 +1282,7 @@ public class TestFairScheduler {
for (RMContainer liveContainer : liveContainers) {
Assert.assertEquals(2, liveContainer.getContainer().getPriority().getPriority());
}
assertEquals(0, scheduler.getRootQueueMetrics().getAvailableMB());
}
@Test
@ -1575,4 +1587,24 @@ public class TestFairScheduler {
assertEquals(1, app.getLiveContainers().size());
assertEquals(0, app.getReservedContainers().size());
}
/**
 * Checks that the root queue's available-MB metric follows node additions
 * and removals immediately, and that a subsequent scheduler update leaves
 * the value unchanged.
 */
@Test
public void testRemoveNodeUpdatesRootQueueMetrics() {
  // No node has been added yet, so nothing is available.
  assertEquals(0, scheduler.getRootQueueMetrics().getAvailableMB());

  // Adding a 1024 node is reflected right away.
  RMNode node = MockNodes.newNodeInfo(1, Resources.createResource(1024));
  scheduler.handle(new NodeAddedSchedulerEvent(node));
  assertEquals(1024, scheduler.getRootQueueMetrics().getAvailableMB());

  // update shouldn't change things
  scheduler.update();
  assertEquals(1024, scheduler.getRootQueueMetrics().getAvailableMB());

  // Removing the same node brings the metric back to zero.
  scheduler.handle(new NodeRemovedSchedulerEvent(node));
  assertEquals(0, scheduler.getRootQueueMetrics().getAvailableMB());

  // update shouldn't change things
  scheduler.update();
  assertEquals(0, scheduler.getRootQueueMetrics().getAvailableMB());
}
}