From fa59f4e4907d2c37841d59656d79b3162e774310 Mon Sep 17 00:00:00 2001 From: Ray Chiang Date: Tue, 14 Mar 2017 14:45:13 -0700 Subject: [PATCH] YARN-6042. Dump scheduler and queue state information into FairScheduler DEBUG log. (Yufei Gu via rchiang) --- .../src/main/conf/log4j.properties | 9 +++ .../scheduler/fair/FSAppAttempt.java | 51 +++++++------ .../scheduler/fair/FSLeafQueue.java | 21 ++++++ .../scheduler/fair/FSParentQueue.java | 21 ++++++ .../scheduler/fair/FSQueue.java | 41 ++++++++++- .../scheduler/fair/FairScheduler.java | 28 +++++--- .../scheduler/fair/TestFairScheduler.java | 72 +++++++++++++++++++ 7 files changed, 209 insertions(+), 34 deletions(-) diff --git a/hadoop-common-project/hadoop-common/src/main/conf/log4j.properties b/hadoop-common-project/hadoop-common/src/main/conf/log4j.properties index 95afc6167e5..7c02b20cfbd 100644 --- a/hadoop-common-project/hadoop-common/src/main/conf/log4j.properties +++ b/hadoop-common-project/hadoop-common/src/main/conf/log4j.properties @@ -321,3 +321,12 @@ log4j.appender.EWMA=org.apache.hadoop.yarn.util.Log4jWarningErrorMetricsAppender log4j.appender.EWMA.cleanupInterval=${yarn.ewma.cleanupInterval} log4j.appender.EWMA.messageAgeLimitSeconds=${yarn.ewma.messageAgeLimitSeconds} log4j.appender.EWMA.maxUniqueMessages=${yarn.ewma.maxUniqueMessages} + +# Log file for the Fair Scheduler state dump (FairScheduler.statedump logger) +log4j.logger.org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler.statedump=DEBUG,FSLOGGER +log4j.appender.FSLOGGER=org.apache.log4j.RollingFileAppender +log4j.appender.FSLOGGER.File=${hadoop.log.dir}/fairscheduler-statedump.log +log4j.appender.FSLOGGER.layout=org.apache.log4j.PatternLayout +log4j.appender.FSLOGGER.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n +log4j.appender.FSLOGGER.MaxFileSize=${hadoop.log.maxfilesize} +log4j.appender.FSLOGGER.MaxBackupIndex=${hadoop.log.maxbackupindex} \ No newline at end of file diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSAppAttempt.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSAppAttempt.java index 60902a25a8c..2e083431cb0 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSAppAttempt.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSAppAttempt.java @@ -834,24 +834,26 @@ public class FSAppAttempt extends SchedulerApplicationAttempt return capability; } + if (LOG.isDebugEnabled()) { + LOG.debug("Resource request: " + capability + " exceeds the available" + + " resources of the node."); + } + // The desired container won't fit here, so reserve if (isReservable(capability) && reserve(request, node, reservedContainer, type, schedulerKey)) { - if (isWaitingForAMContainer()) { - updateAMDiagnosticMsg(capability, - " exceed the available resources of the node and the request is" - + " reserved"); + updateAMDiagnosticMsg(capability, " exceeds the available resources of " + + "the node and the request is reserved)"); + if (LOG.isDebugEnabled()) { + LOG.debug(getName() + "'s resource request is reserved."); } return FairScheduler.CONTAINER_RESERVED; } else { - if (isWaitingForAMContainer()) { - updateAMDiagnosticMsg(capability, - " exceed the available resources of the node and the request cannot" - + " be reserved"); - } + updateAMDiagnosticMsg(capability, " exceeds the available resources of " + + "the node and the request cannot be reserved)"); if (LOG.isDebugEnabled()) { - LOG.debug("Couldn't creating reservation for " + - getName() + ",at priority " + 
request.getPriority()); + LOG.debug("Couldn't create reservation for app: " + getName() + + ", at priority " + schedulerKey.getPriority()); } return Resources.none(); } @@ -1023,10 +1025,9 @@ public class FSAppAttempt extends SchedulerApplicationAttempt ret = false; } else if (!getQueue().fitsInMaxShare(anyRequest.getCapability())) { // The requested container must fit in queue maximum share - if (isWaitingForAMContainer()) { - updateAMDiagnosticMsg(anyRequest.getCapability(), - " exceeds current queue or its parents maximum resource allowed)."); - } + updateAMDiagnosticMsg(anyRequest.getCapability(), + " exceeds current queue or its parents maximum resource allowed)."); + ret = false; } @@ -1301,16 +1302,16 @@ public class FSAppAttempt extends SchedulerApplicationAttempt @Override public Resource assignContainer(FSSchedulerNode node) { if (isOverAMShareLimit()) { - if (isWaitingForAMContainer()) { - List ask = appSchedulingInfo.getAllResourceRequests(); - updateAMDiagnosticMsg(ask.get(0).getCapability(), " exceeds maximum " - + "AM resource allowed)."); + Resource amResourceRequest = appSchedulingInfo.getAllResourceRequests() + .get(0).getCapability(); + updateAMDiagnosticMsg(amResourceRequest, + " exceeds maximum AM resource allowed)."); + if (LOG.isDebugEnabled()) { + LOG.debug("AM resource request: " + amResourceRequest + + " exceeds maximum AM resource allowed, " + + getQueue().dumpState()); } - if (LOG.isDebugEnabled()) { - LOG.debug("Skipping allocation because maxAMShare limit would " + - "be exceeded"); - } return Resources.none(); } return assignContainer(node, false); @@ -1323,6 +1324,10 @@ public class FSAppAttempt extends SchedulerApplicationAttempt * @param reason the reason why AM doesn't get the resource */ private void updateAMDiagnosticMsg(Resource resource, String reason) { + if (!isWaitingForAMContainer()) { + return; + } + StringBuilder diagnosticMessageBldr = new StringBuilder(); diagnosticMessageBldr.append(" (Resource request: "); 
diagnosticMessageBldr.append(resource); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSLeafQueue.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSLeafQueue.java index 48847e59a8f..7785a5c3b25 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSLeafQueue.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSLeafQueue.java @@ -619,4 +619,25 @@ public class FSLeafQueue extends FSQueue { boolean isStarved() { return isStarvedForMinShare() || isStarvedForFairShare(); } + + @Override + protected void dumpStateInternal(StringBuilder sb) { + sb.append("{Name: " + getName() + + ", Weight: " + weights + + ", Policy: " + policy.getName() + + ", FairShare: " + getFairShare() + + ", SteadyFairShare: " + getSteadyFairShare() + + ", MaxShare: " + maxShare + + ", MinShare: " + minShare + + ", ResourceUsage: " + getResourceUsage() + + ", Demand: " + getDemand() + + ", Runnable: " + getNumRunnableApps() + + ", NumPendingApps: " + getNumPendingApps() + + ", NonRunnable: " + getNumNonRunnableApps() + + ", MaxAMShare: " + maxAMShare + + ", MaxAMResource: " + computeMaxAMResource() + + ", AMResourceUsage: " + getAmResourceUsage() + + ", LastTimeAtMinShare: " + lastTimeAtMinShare + + "}"); + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSParentQueue.java 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSParentQueue.java index 45fa212e7e1..6050ab5391b 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSParentQueue.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSParentQueue.java @@ -292,4 +292,25 @@ public class FSParentQueue extends FSQueue { // TODO Auto-generated method stub } + + @Override + protected void dumpStateInternal(StringBuilder sb) { + sb.append("{Name: " + getName() + + ", Weight: " + weights + + ", Policy: " + policy.getName() + + ", FairShare: " + getFairShare() + + ", SteadyFairShare: " + getSteadyFairShare() + + ", MaxShare: " + maxShare + + ", MinShare: " + minShare + + ", ResourceUsage: " + getResourceUsage() + + ", Demand: " + getDemand() + + ", MaxAMShare: " + maxAMShare + + ", Runnable: " + getNumRunnableApps() + + "}"); + + for(FSQueue child : getChildQueues()) { + sb.append(", "); + child.dumpStateInternal(sb); + } + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSQueue.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSQueue.java index b5592c55bc2..acf4d5c203b 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSQueue.java +++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSQueue.java @@ -393,11 +393,22 @@ public abstract class FSQueue implements Queue, Schedulable { * @return true if check passes (can assign) or false otherwise */ boolean assignContainerPreCheck(FSSchedulerNode node) { - if (!Resources.fitsIn(getResourceUsage(), maxShare) - || node.getReservedContainer() != null) { + if (node.getReservedContainer() != null) { + if (LOG.isDebugEnabled()) { + LOG.debug("Assigning container failed on node '" + node.getNodeName() + + "' because it has reserved containers."); + } return false; + } else if (!Resources.fitsIn(getResourceUsage(), maxShare)) { + if (LOG.isDebugEnabled()) { + LOG.debug("Assigning container failed on node '" + node.getNodeName() + + "' because queue resource usage is larger than MaxShare: " + + dumpState()); + } + return false; + } else { + return true; } - return true; } /** @@ -453,6 +464,11 @@ public abstract class FSQueue implements Queue, Schedulable { Resources.add(getResourceUsage(), additionalResource); if (!Resources.fitsIn(usagePlusAddition, getMaxShare())) { + if (LOG.isDebugEnabled()) { + LOG.debug("Resource usage plus resource request: " + usagePlusAddition + + " exceeds maximum resource allowed:" + getMaxShare() + + " in queue " + getName()); + } return false; } @@ -491,4 +507,23 @@ public abstract class FSQueue implements Queue, Schedulable { setPolicy(queuePolicy); return true; } + + /** + * Recursively dump states of all queues. + * + * @return a string which holds all queue states + */ + public String dumpState() { + StringBuilder sb = new StringBuilder(); + dumpStateInternal(sb); + return sb.toString(); + } + + + /** + * Recursively dump states of all queues. 
+ * + * @param sb the {@code StringBuilder} which holds queue states + */ + protected abstract void dumpStateInternal(StringBuilder sb); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java index 8d98116fa47..a7054cefdc8 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java @@ -140,7 +140,9 @@ public class FairScheduler extends private boolean usePortForNodeName; private static final Log LOG = LogFactory.getLog(FairScheduler.class); - + private static final Log STATE_DUMP_LOG = + LogFactory.getLog(FairScheduler.class.getName() + ".statedump"); + private static final ResourceCalculator RESOURCE_CALCULATOR = new DefaultResourceCalculator(); private static final ResourceCalculator DOMINANT_RESOURCE_CALCULATOR = @@ -152,7 +154,7 @@ public class FairScheduler extends // How often fair shares are re-calculated (ms) protected long updateInterval; - private final int UPDATE_DEBUG_FREQUENCY = 5; + private final int UPDATE_DEBUG_FREQUENCY = 25; private int updatesToSkipForDebug = UPDATE_DEBUG_FREQUENCY; @VisibleForTesting @@ -348,6 +350,21 @@ public class FairScheduler extends } } + /** + * Dump scheduler state including states of all queues. 
+ */ + private void dumpSchedulerState() { + FSQueue rootQueue = queueMgr.getRootQueue(); + Resource clusterResource = getClusterResource(); + LOG.debug("FairScheduler state: Cluster Capacity: " + clusterResource + + " Allocations: " + rootMetrics.getAllocatedResources() + + " Availability: " + Resource.newInstance( + rootMetrics.getAvailableMB(), rootMetrics.getAvailableVirtualCores()) + + " Demand: " + rootQueue.getDemand()); + + STATE_DUMP_LOG.debug(rootQueue.dumpState()); + } + /** * Recompute the internal variables used by the scheduler - per-job weights, * fair shares, deficits, minimum slot allocations, and amount of used and @@ -372,12 +389,7 @@ public class FairScheduler extends if (LOG.isDebugEnabled()) { if (--updatesToSkipForDebug < 0) { updatesToSkipForDebug = UPDATE_DEBUG_FREQUENCY; - LOG.debug("Cluster Capacity: " + clusterResource + - " Allocations: " + rootMetrics.getAllocatedResources() + - " Availability: " + Resource.newInstance( - rootMetrics.getAvailableMB(), - rootMetrics.getAvailableVirtualCores()) + - " Demand: " + rootQueue.getDemand()); + dumpSchedulerState(); } } } finally { diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java index 4b134da098e..670793d36e3 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java @@ -5148,4 +5148,76 @@ public class TestFairScheduler extends 
FairSchedulerTestBase { Resources.equals(aQueue.getDemand(), maxResource) && Resources.equals(bQueue.getDemand(), maxResource)); } + + @Test + public void testDumpState() throws IOException { + conf.set(FairSchedulerConfiguration.ALLOCATION_FILE, ALLOC_FILE); + + PrintWriter out = new PrintWriter(new FileWriter(ALLOC_FILE)); + out.println(""); + out.println(""); + out.println(""); + out.println(" "); + out.println(" 1"); + out.println(" "); + out.println(""); + out.println(""); + out.close(); + + ControlledClock clock = new ControlledClock(); + scheduler.setClock(clock); + + scheduler.init(conf); + scheduler.start(); + scheduler.reinitialize(conf, resourceManager.getRMContext()); + + FSLeafQueue child1 = + scheduler.getQueueManager().getLeafQueue("parent.child1", false); + Resource resource = Resource.newInstance(4 * GB, 4); + child1.setMaxShare(resource); + FSAppAttempt app = mock(FSAppAttempt.class); + Mockito.when(app.getDemand()).thenReturn(resource); + Mockito.when(app.getResourceUsage()).thenReturn(resource); + child1.addAppSchedulable(app); + child1.updateDemand(); + + String childQueueString = "{Name: root.parent.child1," + + " Weight: ," + + " Policy: fair," + + " FairShare: ," + + " SteadyFairShare: ," + + " MaxShare: ," + + " MinShare: ," + + " ResourceUsage: ," + + " Demand: ," + + " Runnable: 1," + + " NumPendingApps: 0," + + " NonRunnable: 0," + + " MaxAMShare: 0.5," + + " MaxAMResource: ," + + " AMResourceUsage: ," + + " LastTimeAtMinShare: " + clock.getTime() + + "}"; + + assertTrue(child1.dumpState().equals(childQueueString)); + FSParentQueue parent = + scheduler.getQueueManager().getParentQueue("parent", false); + parent.setMaxShare(resource); + parent.updateDemand(); + + String parentQueueString = "{Name: root.parent," + + " Weight: ," + + " Policy: fair," + + " FairShare: ," + + " SteadyFairShare: ," + + " MaxShare: ," + + " MinShare: ," + + " ResourceUsage: ," + + " Demand: ," + + " MaxAMShare: 0.5," + + " Runnable: 0}"; + + 
assertTrue(parent.dumpState().equals( + parentQueueString + ", " + childQueueString)); + } }