From 21788f9fd4681091b26df2a4249329528ead836d Mon Sep 17 00:00:00 2001 From: Eric E Payne Date: Mon, 10 Aug 2020 20:52:44 +0000 Subject: [PATCH] YARN-8459. Improve Capacity Scheduler logs to debug invalid states. Contributed by Wangda Tan and Jim Brennan. --- .../scheduler/capacity/CapacityScheduler.java | 35 ++++++++++++------ .../scheduler/capacity/ParentQueue.java | 36 ++++++++++++------- .../allocator/AbstractContainerAllocator.java | 13 ++++--- .../common/fica/FiCaSchedulerApp.java | 5 +++ 4 files changed, 61 insertions(+), 28 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java index 426aa9cdf38..7f41d8f305d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java @@ -1254,10 +1254,17 @@ public class CapacityScheduler extends */ private CSAssignment allocateContainerOnSingleNode(PlacementSet ps, FiCaSchedulerNode node, boolean withNodeHeartbeat) { + if (LOG.isDebugEnabled()) { + LOG.debug( + "Trying to schedule on node: " + node.getNodeName() + ", available: " + + node.getUnallocatedResource()); + } + // Backward compatible way to make sure previous behavior which allocation // driven by node heartbeat works. if (getNode(node.getNodeID()) != node) { - LOG.error("Trying to schedule on a removed node, please double check."); + LOG.error("Trying to schedule on a removed node, please double check, " + + "nodeId=" + node.getNodeID()); return null; } @@ -1271,14 +1278,19 @@ public class CapacityScheduler extends FiCaSchedulerApp reservedApplication = getCurrentAttemptForContainer( reservedContainer.getContainerId()); if (reservedApplication == null) { - LOG.error("Trying to schedule for a finished app, please double check."); + LOG.error( + "Trying to schedule for a finished app, please double check. nodeId=" + + node.getNodeID() + " container=" + reservedContainer + .getContainerId()); return null; } // Try to fulfill the reservation - LOG.info( - "Trying to fulfill reservation for application " + reservedApplication - .getApplicationId() + " on node: " + node.getNodeID()); + if (LOG.isDebugEnabled()) { + LOG.debug("Trying to fulfill reservation for application " + + reservedApplication.getApplicationId() + " on node: " + node + .getNodeID()); + } LeafQueue queue = ((LeafQueue) reservedApplication.getQueue()); assignment = queue.assignContainers(getClusterResource(), ps, @@ -1342,12 +1354,6 @@ public class CapacityScheduler extends return null; } - if (LOG.isDebugEnabled()) { - LOG.debug( - "Trying to schedule on node: " + node.getNodeName() + ", available: " - + node.getUnallocatedResource()); - } - return allocateOrReserveNewContainers(ps, withNodeHeartbeat); } @@ -2578,6 +2584,7 @@ public class CapacityScheduler extends LOG.debug("Try to commit allocation proposal=" + request); } + boolean isSuccess = false; if (attemptId != null) { FiCaSchedulerApp app = getApplicationAttempt(attemptId); // Required sanity check for attemptId - when async-scheduling enabled, @@ -2589,6 +2596,7 @@ public class CapacityScheduler extends CapacitySchedulerMetrics.getMetrics() .addCommitSuccess(commitSuccess); LOG.info("Allocation proposal accepted"); + isSuccess = true; } else{ long commitFailed = System.nanoTime() - commitStart; CapacitySchedulerMetrics.getMetrics() @@ -2596,6 +2604,11 @@ public class CapacityScheduler extends LOG.info("Failed to accept allocation proposal"); } + if (LOG.isDebugEnabled()) { + LOG.debug("Allocation proposal accepted=" + isSuccess + ", proposal=" + + request); + } + // Update unconfirmed allocated resource. if (updateUnconfirmedAllocatedResource) { app.decUnconfirmedRes(request.getTotalAllocatedResource()); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/ParentQueue.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/ParentQueue.java index 2de13cf25e1..496bbf42518 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/ParentQueue.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/ParentQueue.java @@ -85,6 +85,8 @@ public class ParentQueue extends AbstractCSQueue { private QueueOrderingPolicy queueOrderingPolicy; + private long lastSkipQueueDebugLoggingTimestamp = -1; + public ParentQueue(CapacitySchedulerContext cs, String queueName, CSQueue parent, CSQueue old) throws IOException { super(cs, queueName, parent, old); @@ -487,9 +489,14 @@ public class ParentQueue extends AbstractCSQueue { if (schedulingMode == SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY && !accessibleToPartition(ps.getPartition())) { if (LOG.isDebugEnabled()) { - LOG.debug("Skip this queue=" + getQueuePath() - + ", because it is not able to access partition=" + ps - .getPartition()); + long now = System.currentTimeMillis(); + // Do logging every 1 sec to avoid excessive logging. + if (now - this.lastSkipQueueDebugLoggingTimestamp > 1000) { + LOG.debug("Skip this queue=" + getQueuePath() + + ", because it is not able to access partition=" + ps + .getPartition()); + this.lastSkipQueueDebugLoggingTimestamp = now; + } } ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node, @@ -509,10 +516,15 @@ public class ParentQueue extends AbstractCSQueue { if (!super.hasPendingResourceRequest(ps.getPartition(), clusterResource, schedulingMode)) { if (LOG.isDebugEnabled()) { - LOG.debug("Skip this queue=" + getQueuePath() - + ", because it doesn't need more resource, schedulingMode=" - + schedulingMode.name() + " node-partition=" + ps - .getPartition()); + long now = System.currentTimeMillis(); + // Do logging every 1 sec to avoid excessive logging. + if (now - this.lastSkipQueueDebugLoggingTimestamp > 1000) { + LOG.debug("Skip this queue=" + getQueuePath() + + ", because it doesn't need more resource, schedulingMode=" + + schedulingMode.name() + " node-partition=" + ps + .getPartition()); + this.lastSkipQueueDebugLoggingTimestamp = now; + } } ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node, @@ -613,12 +625,12 @@ public class ParentQueue extends AbstractCSQueue { assignment.setIncreasedAllocation( assignedToChild.isIncreasedAllocation()); - LOG.info("assignedContainer" + " queue=" + getQueueName() - + " usedCapacity=" + getUsedCapacity() + " absoluteUsedCapacity=" - + getAbsoluteUsedCapacity() + " used=" + queueUsage.getUsed() - + " cluster=" + clusterResource); - if (LOG.isDebugEnabled()) { + LOG.debug("assignedContainer reserved=" + isReserved + " queue=" + + getQueueName() + " usedCapacity=" + getUsedCapacity() + + " absoluteUsedCapacity=" + getAbsoluteUsedCapacity() + " used=" + + queueUsage.getUsed() + " cluster=" + clusterResource); + LOG.debug( "ParentQ=" + getQueueName() + " assignedSoFarInThisIteration=" + assignment.getResource() + " usedCapacity=" diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/AbstractContainerAllocator.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/AbstractContainerAllocator.java index d3ab01e0390..d9a1ce52a29 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/AbstractContainerAllocator.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/AbstractContainerAllocator.java @@ -94,11 +94,14 @@ public abstract class AbstractContainerAllocator { assignment.setType(result.getContainerNodeType()); if (result.getAllocationState() == AllocationState.RESERVED) { - // This is a reserved container - LOG.info("Reserved container " + " application=" - + application.getApplicationId() + " resource=" + allocatedResource - + " queue=" + appInfo.getQueueName() - + " cluster=" + clusterResource); + if (LOG.isDebugEnabled()) { + // This is a reserved container + // Since re-reservation could happen again and again for already + // reserved containers. only do this in debug log. + LOG.debug("Reserved container " + " application=" + application + .getApplicationId() + " resource=" + allocatedResource + " queue=" + + appInfo.getQueueName() + " cluster=" + clusterResource); + } assignment.getAssignmentInformation().addReservationDetails( updatedContainer, application.getCSLeafQueue().getQueuePath()); assignment.getAssignmentInformation().incrReservations(); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/fica/FiCaSchedulerApp.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/fica/FiCaSchedulerApp.java index dc1a0a27294..35e0e9459e9 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/fica/FiCaSchedulerApp.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/fica/FiCaSchedulerApp.java @@ -591,6 +591,11 @@ public class FiCaSchedulerApp extends SchedulerApplicationAttempt { schedulerContainer.getRmContainer(), schedulerContainer.getRmContainer().getContainer(), reReservation); + + LOG.info("Reserved container=" + rmContainer.getContainerId() + + ", on node=" + schedulerContainer.getSchedulerNode() + + " with resource=" + rmContainer + .getAllocatedOrReservedResource()); } } } finally {