YARN-2712. TestWorkPreservingRMRestart: Augment FS tests with queue and headroom checks. (Tsuyoshi Ozawa via kasha)

This commit is contained in:
Karthik Kambatla 2014-10-30 00:29:07 -07:00
parent 0126cf16b7
commit 179cab81e0
3 changed files with 114 additions and 34 deletions

View File

@ -56,6 +56,9 @@ Release 2.7.0 - UNRELEASED
YARN-2742. FairSchedulerConfiguration should allow extra spaces
between value and unit. (Wei Yan via kasha)
YARN-2712. TestWorkPreservingRMRestart: Augment FS tests with
queue and headroom checks. (Tsuyoshi Ozawa via kasha)
OPTIMIZATIONS
BUG FIXES

View File

@ -305,6 +305,7 @@ public class FairScheduler extends
// Recursively compute fair shares for all queues
// and update metrics
rootQueue.recomputeShares();
updateRootQueueMetrics();
if (LOG.isDebugEnabled()) {
if (--updatesToSkipForDebug < 0) {

View File

@ -18,8 +18,10 @@
package org.apache.hadoop.yarn.server.resourcemanager;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.yarn.security.AMRMTokenIdentifier;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNull;
@ -47,6 +49,9 @@ import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.api.records.ResourceRequest;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FSParentQueue;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairSchedulerConfiguration;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.policies.DominantResourceFairnessPolicy;
import org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppState;
@ -65,6 +70,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.Capacity
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.LeafQueue;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.ParentQueue;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FSAppAttempt;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fifo.FifoScheduler;
import org.apache.hadoop.yarn.util.ControlledClock;
import org.apache.hadoop.yarn.util.SystemClock;
@ -148,6 +154,9 @@ public class TestWorkPreservingRMRestart {
MemoryRMStateStore memStore = new MemoryRMStateStore();
memStore.init(conf);
rm1 = new MockRM(conf, memStore);
if (schedulerClass.equals(FairScheduler.class)) {
initFairScheduler(rm1);
}
rm1.start();
MockNM nm1 =
new MockNM("127.0.0.1:1234", 8192, rm1.getResourceTrackerService());
@ -160,6 +169,9 @@ public class TestWorkPreservingRMRestart {
// Re-start RM
rm2 = new MockRM(conf, memStore);
if (schedulerClass.equals(FairScheduler.class)) {
initFairScheduler(rm2);
}
rm2.start();
nm1.setResourceTrackerService(rm2.getResourceTrackerService());
// recover app
@ -227,7 +239,9 @@ public class TestWorkPreservingRMRestart {
if (schedulerClass.equals(CapacityScheduler.class)) {
checkCSQueue(rm2, schedulerApp, nmResource, nmResource, usedResources, 2);
} else if (schedulerClass.equals(FifoScheduler.class)) {
checkFifoQueue(schedulerApp, usedResources, availableResources);
checkFifoQueue(rm2, schedulerApp, usedResources, availableResources);
} else if (schedulerClass.equals(FairScheduler.class)) {
checkFSQueue(rm2, schedulerApp, usedResources, availableResources);
}
// *********** check scheduler attempt state.********
@ -239,11 +253,6 @@ public class TestWorkPreservingRMRestart {
scheduler.getRMContainer(runningContainer.getContainerId())));
assertEquals(schedulerAttempt.getCurrentConsumption(), usedResources);
// Until YARN-1959 is resolved
if (scheduler.getClass() != FairScheduler.class) {
assertEquals(availableResources, schedulerAttempt.getHeadroom());
}
// *********** check appSchedulingInfo state ***********
assertEquals((1L << 40) + 1L, schedulerAttempt.getNewContainerId());
}
@ -253,23 +262,28 @@ public class TestWorkPreservingRMRestart {
Resource clusterResource, Resource queueResource, Resource usedResource,
int numContainers)
throws Exception {
checkCSLeafQueue(rm2, app, clusterResource, queueResource, usedResource,
numContainers);
checkCSLeafQueue(rm, app, clusterResource, queueResource, usedResource,
numContainers);
LeafQueue queue = (LeafQueue) app.getQueue();
Resource availableResources = Resources.subtract(queueResource, usedResource);
Resource availableResources =
Resources.subtract(queueResource, usedResource);
// ************ check app headroom ****************
SchedulerApplicationAttempt schedulerAttempt = app.getCurrentAppAttempt();
assertEquals(availableResources, schedulerAttempt.getHeadroom());
// ************* check Queue metrics ************
QueueMetrics queueMetrics = queue.getMetrics();
asserteMetrics(queueMetrics, 1, 0, 1, 0, 2, availableResources.getMemory(),
availableResources.getVirtualCores(), usedResource.getMemory(),
usedResource.getVirtualCores());
assertMetrics(queueMetrics, 1, 0, 1, 0, 2, availableResources.getMemory(),
availableResources.getVirtualCores(), usedResource.getMemory(),
usedResource.getVirtualCores());
// ************ check user metrics ***********
QueueMetrics userMetrics =
queueMetrics.getUserMetrics(app.getUser());
asserteMetrics(userMetrics, 1, 0, 1, 0, 2, availableResources.getMemory(),
availableResources.getVirtualCores(), usedResource.getMemory(),
usedResource.getVirtualCores());
assertMetrics(userMetrics, 1, 0, 1, 0, 2, availableResources.getMemory(),
availableResources.getVirtualCores(), usedResource.getMemory(),
usedResource.getVirtualCores());
}
private void checkCSLeafQueue(MockRM rm,
@ -297,9 +311,10 @@ public class TestWorkPreservingRMRestart {
.getTotalConsumedResources());
}
private void checkFifoQueue(SchedulerApplication schedulerApp,
Resource usedResources, Resource availableResources) throws Exception {
FifoScheduler scheduler = (FifoScheduler) rm2.getResourceScheduler();
private void checkFifoQueue(ResourceManager rm,
SchedulerApplication schedulerApp, Resource usedResources,
Resource availableResources) throws Exception {
FifoScheduler scheduler = (FifoScheduler) rm.getResourceScheduler();
// ************ check cluster used Resources ********
assertEquals(usedResources, scheduler.getUsedResource());
@ -310,9 +325,68 @@ public class TestWorkPreservingRMRestart {
// ************ check queue metrics ****************
QueueMetrics queueMetrics = scheduler.getRootQueueMetrics();
asserteMetrics(queueMetrics, 1, 0, 1, 0, 2, availableResources.getMemory(),
availableResources.getVirtualCores(), usedResources.getMemory(),
usedResources.getVirtualCores());
assertMetrics(queueMetrics, 1, 0, 1, 0, 2, availableResources.getMemory(),
availableResources.getVirtualCores(), usedResources.getMemory(),
usedResources.getVirtualCores());
}
/**
 * Verifies the FairScheduler state of the given RM: root queue policy and
 * resource usage, the application attempt's headroom, and the root queue
 * metrics.
 *
 * Before asserting, polls every 100 ms (at most 31 times) until the root
 * queue's fair share equals {@code Resource.newInstance(8192, 8)}; fails
 * the test if that never happens.
 *
 * @param rm                 resource manager whose scheduler is inspected;
 *                           its scheduler is cast to {@link FairScheduler}
 * @param schedulerApp       application whose current attempt is checked
 * @param usedResources      expected usage of the root queue
 * @param availableResources expected headroom / available resources
 * @throws Exception if the polling sleep is interrupted or a check throws
 */
private void checkFSQueue(ResourceManager rm,
    SchedulerApplication schedulerApp, Resource usedResources,
    Resource availableResources) throws Exception {
  // Wait for the root queue's fair share to reach the expected value.
  final Resource expectedFairShare = Resource.newInstance(8192, 8);
  for (int attempt = 0; ; attempt++) {
    Thread.sleep(100);
    // Look the scheduler up afresh on every poll, as the original did.
    FairScheduler polled = (FairScheduler) rm.getResourceScheduler();
    Resource rootFairShare =
        polled.getQueueManager().getRootQueue().getFairShare();
    if (expectedFairShare.equals(rootFairShare)) {
      break;
    }
    if (attempt >= 30) {
      Assert.fail("Apps are not scheduled within assumed timeout");
    }
  }

  FairScheduler fs = (FairScheduler) rm.getResourceScheduler();
  FSParentQueue rootQueue = fs.getQueueManager().getRootQueue();

  // ************ check cluster used Resources ********
  assertTrue(rootQueue.getPolicy() instanceof DominantResourceFairnessPolicy);
  assertEquals(usedResources, rootQueue.getResourceUsage());

  // ************ check app headroom ****************
  FSAppAttempt attempt =
      (FSAppAttempt) schedulerApp.getCurrentAppAttempt();
  assertEquals(availableResources, attempt.getHeadroom());

  // ************ check queue metrics ****************
  assertMetrics(fs.getRootQueueMetrics(), 1, 0, 1, 0, 2,
      availableResources.getMemory(),
      availableResources.getVirtualCores(),
      usedResources.getMemory(),
      usedResources.getVirtualCores());
}
/**
 * Writes a FairScheduler allocation file for this test and points the
 * shared {@code conf} at it via
 * {@link FairSchedulerConfiguration#ALLOCATION_FILE}.
 *
 * The file is created as {@code test-queues} under the directory named by
 * the {@code test.build.data} system property (default {@code /tmp}). It
 * sets the default queue scheduling policy to {@code fair} and configures
 * the {@code root} queue with the {@code drf} policy.
 *
 * @param rm resource manager under test; its scheduler must be a
 *           {@link FairScheduler}
 * @throws IOException if the allocation file cannot be opened or written
 */
private void initFairScheduler(ResourceManager rm) throws IOException {
  // The value is unused; the cast throws ClassCastException if the RM is
  // not running a FairScheduler.
  FairScheduler scheduler = (FairScheduler) rm.getResourceScheduler();

  String testDir =
      new File(
          System.getProperty("test.build.data", "/tmp")).getAbsolutePath();
  String allocFile = new File(testDir, "test-queues").getAbsolutePath();
  conf.set(FairSchedulerConfiguration.ALLOCATION_FILE, allocFile);

  PrintWriter out = new PrintWriter(new FileWriter(allocFile));
  try {
    out.println("<?xml version=\"1.0\"?>");
    out.println("<allocations>");
    out.println("<defaultQueueSchedulingPolicy>fair</defaultQueueSchedulingPolicy>");
    out.println("<queue name=\"root\">");
    out.println(" <schedulingPolicy>drf</schedulingPolicy>");
    out.println(" <weight>1.0</weight>");
    out.println(" <fairSharePreemptionTimeout>100</fairSharePreemptionTimeout>");
    out.println(" <minSharePreemptionTimeout>120</minSharePreemptionTimeout>");
    out.println(" <fairSharePreemptionThreshold>.5</fairSharePreemptionThreshold>");
    out.println("</queue>");
    out.println("</allocations>");
    // PrintWriter never throws on write; checkError() flushes and reports
    // any swallowed I/O failure so a bad allocation file is caught here.
    if (out.checkError()) {
      throw new IOException("Failed to write allocation file " + allocFile);
    }
  } finally {
    // Release the file handle even if writing fails.
    out.close();
  }
}
// create 3 container reports for AM
@ -462,9 +536,10 @@ public class TestWorkPreservingRMRestart {
checkCSLeafQueue(rm2, schedulerApp1_1, clusterResource, q1Resource,
q1UsedResource, 4);
QueueMetrics queue1Metrics = schedulerApp1_1.getQueue().getMetrics();
asserteMetrics(queue1Metrics, 2, 0, 2, 0, 4,
q1availableResources.getMemory(), q1availableResources.getVirtualCores(),
q1UsedResource.getMemory(), q1UsedResource.getVirtualCores());
assertMetrics(queue1Metrics, 2, 0, 2, 0, 4,
q1availableResources.getMemory(),
q1availableResources.getVirtualCores(), q1UsedResource.getMemory(),
q1UsedResource.getVirtualCores());
// assert queue B state.
SchedulerApplication schedulerApp2 =
@ -472,19 +547,20 @@ public class TestWorkPreservingRMRestart {
checkCSLeafQueue(rm2, schedulerApp2, clusterResource, q2Resource,
q2UsedResource, 2);
QueueMetrics queue2Metrics = schedulerApp2.getQueue().getMetrics();
asserteMetrics(queue2Metrics, 1, 0, 1, 0, 2,
q2availableResources.getMemory(), q2availableResources.getVirtualCores(),
q2UsedResource.getMemory(), q2UsedResource.getVirtualCores());
assertMetrics(queue2Metrics, 1, 0, 1, 0, 2,
q2availableResources.getMemory(),
q2availableResources.getVirtualCores(), q2UsedResource.getMemory(),
q2UsedResource.getVirtualCores());
// assert parent queue state.
LeafQueue leafQueue = (LeafQueue) schedulerApp2.getQueue();
ParentQueue parentQueue = (ParentQueue) leafQueue.getParent();
checkParentQueue(parentQueue, 6, totalUsedResource, (float) 6 / 16,
(float) 6 / 16);
asserteMetrics(parentQueue.getMetrics(), 3, 0, 3, 0, 6,
totalAvailableResource.getMemory(),
totalAvailableResource.getVirtualCores(), totalUsedResource.getMemory(),
totalUsedResource.getVirtualCores());
assertMetrics(parentQueue.getMetrics(), 3, 0, 3, 0, 6,
totalAvailableResource.getMemory(),
totalAvailableResource.getVirtualCores(), totalUsedResource.getMemory(),
totalUsedResource.getVirtualCores());
}
//Test that we receive a meaningful exit-causing exception if a queue
@ -818,7 +894,7 @@ public class TestWorkPreservingRMRestart {
}, 1000, 20000);
}
private void asserteMetrics(QueueMetrics qm, int appsSubmitted,
private void assertMetrics(QueueMetrics qm, int appsSubmitted,
int appsPending, int appsRunning, int appsCompleted,
int allocatedContainers, int availableMB, int availableVirtualCores,
int allocatedMB, int allocatedVirtualCores) {