YARN-8459. Improve Capacity Scheduler logs to debug invalid states. Contributed by Wangda Tan.
This commit is contained in:
parent
344f324710
commit
51654a3962
|
@ -1234,8 +1234,10 @@ public class CapacityScheduler extends
|
||||||
updateDemandForQueue.getOrderingPolicy().demandUpdated(application);
|
updateDemandForQueue.getOrderingPolicy().demandUpdated(application);
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG.info("Allocation for application " + applicationAttemptId + " : " +
|
if (LOG.isDebugEnabled()) {
|
||||||
allocation + " with cluster resource : " + getClusterResource());
|
LOG.info("Allocation for application " + applicationAttemptId + " : "
|
||||||
|
+ allocation + " with cluster resource : " + getClusterResource());
|
||||||
|
}
|
||||||
return allocation;
|
return allocation;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1479,10 +1481,17 @@ public class CapacityScheduler extends
|
||||||
private CSAssignment allocateContainerOnSingleNode(
|
private CSAssignment allocateContainerOnSingleNode(
|
||||||
CandidateNodeSet<FiCaSchedulerNode> candidates, FiCaSchedulerNode node,
|
CandidateNodeSet<FiCaSchedulerNode> candidates, FiCaSchedulerNode node,
|
||||||
boolean withNodeHeartbeat) {
|
boolean withNodeHeartbeat) {
|
||||||
|
if (LOG.isDebugEnabled()) {
|
||||||
|
LOG.debug(
|
||||||
|
"Trying to schedule on node: " + node.getNodeName() + ", available: "
|
||||||
|
+ node.getUnallocatedResource());
|
||||||
|
}
|
||||||
|
|
||||||
// Backward compatible way to make sure previous behavior which allocation
|
// Backward compatible way to make sure previous behavior which allocation
|
||||||
// driven by node heartbeat works.
|
// driven by node heartbeat works.
|
||||||
if (getNode(node.getNodeID()) != node) {
|
if (getNode(node.getNodeID()) != node) {
|
||||||
LOG.error("Trying to schedule on a removed node, please double check.");
|
LOG.error("Trying to schedule on a removed node, please double check, "
|
||||||
|
+ "nodeId=" + node.getNodeID());
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1496,14 +1505,19 @@ public class CapacityScheduler extends
|
||||||
FiCaSchedulerApp reservedApplication = getCurrentAttemptForContainer(
|
FiCaSchedulerApp reservedApplication = getCurrentAttemptForContainer(
|
||||||
reservedContainer.getContainerId());
|
reservedContainer.getContainerId());
|
||||||
if (reservedApplication == null) {
|
if (reservedApplication == null) {
|
||||||
LOG.error("Trying to schedule for a finished app, please double check.");
|
LOG.error(
|
||||||
|
"Trying to schedule for a finished app, please double check. nodeId="
|
||||||
|
+ node.getNodeID() + " container=" + reservedContainer
|
||||||
|
.getContainerId());
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Try to fulfill the reservation
|
// Try to fulfill the reservation
|
||||||
LOG.info(
|
if (LOG.isDebugEnabled()) {
|
||||||
"Trying to fulfill reservation for application " + reservedApplication
|
LOG.debug("Trying to fulfill reservation for application "
|
||||||
.getApplicationId() + " on node: " + node.getNodeID());
|
+ reservedApplication.getApplicationId() + " on node: " + node
|
||||||
|
.getNodeID());
|
||||||
|
}
|
||||||
|
|
||||||
LeafQueue queue = ((LeafQueue) reservedApplication.getQueue());
|
LeafQueue queue = ((LeafQueue) reservedApplication.getQueue());
|
||||||
assignment = queue.assignContainers(getClusterResource(), candidates,
|
assignment = queue.assignContainers(getClusterResource(), candidates,
|
||||||
|
@ -1567,12 +1581,6 @@ public class CapacityScheduler extends
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (LOG.isDebugEnabled()) {
|
|
||||||
LOG.debug(
|
|
||||||
"Trying to schedule on node: " + node.getNodeName() + ", available: "
|
|
||||||
+ node.getUnallocatedResource());
|
|
||||||
}
|
|
||||||
|
|
||||||
return allocateOrReserveNewContainers(candidates, withNodeHeartbeat);
|
return allocateOrReserveNewContainers(candidates, withNodeHeartbeat);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2888,6 +2896,11 @@ public class CapacityScheduler extends
|
||||||
LOG.info("Failed to accept allocation proposal");
|
LOG.info("Failed to accept allocation proposal");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (LOG.isDebugEnabled()) {
|
||||||
|
LOG.debug("Allocation proposal accepted=" + isSuccess + ", proposal="
|
||||||
|
+ request);
|
||||||
|
}
|
||||||
|
|
||||||
// Update unconfirmed allocated resource.
|
// Update unconfirmed allocated resource.
|
||||||
if (updateUnconfirmedAllocatedResource) {
|
if (updateUnconfirmedAllocatedResource) {
|
||||||
app.decUnconfirmedRes(request.getTotalAllocatedResource());
|
app.decUnconfirmedRes(request.getTotalAllocatedResource());
|
||||||
|
|
|
@ -90,6 +90,8 @@ public class ParentQueue extends AbstractCSQueue {
|
||||||
|
|
||||||
private QueueOrderingPolicy queueOrderingPolicy;
|
private QueueOrderingPolicy queueOrderingPolicy;
|
||||||
|
|
||||||
|
private long lastSkipQueueDebugLoggingTimestamp = -1;
|
||||||
|
|
||||||
public ParentQueue(CapacitySchedulerContext cs,
|
public ParentQueue(CapacitySchedulerContext cs,
|
||||||
String queueName, CSQueue parent, CSQueue old) throws IOException {
|
String queueName, CSQueue parent, CSQueue old) throws IOException {
|
||||||
super(cs, queueName, parent, old);
|
super(cs, queueName, parent, old);
|
||||||
|
@ -539,9 +541,14 @@ public class ParentQueue extends AbstractCSQueue {
|
||||||
if (schedulingMode == SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY
|
if (schedulingMode == SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY
|
||||||
&& !accessibleToPartition(candidates.getPartition())) {
|
&& !accessibleToPartition(candidates.getPartition())) {
|
||||||
if (LOG.isDebugEnabled()) {
|
if (LOG.isDebugEnabled()) {
|
||||||
LOG.debug("Skip this queue=" + getQueuePath()
|
long now = System.currentTimeMillis();
|
||||||
+ ", because it is not able to access partition=" + candidates
|
// Do logging every 1 sec to avoid excessive logging.
|
||||||
.getPartition());
|
if (now - this.lastSkipQueueDebugLoggingTimestamp > 1000) {
|
||||||
|
LOG.debug("Skip this queue=" + getQueuePath()
|
||||||
|
+ ", because it is not able to access partition=" + candidates
|
||||||
|
.getPartition());
|
||||||
|
this.lastSkipQueueDebugLoggingTimestamp = now;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node,
|
ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node,
|
||||||
|
@ -561,10 +568,15 @@ public class ParentQueue extends AbstractCSQueue {
|
||||||
if (!super.hasPendingResourceRequest(candidates.getPartition(),
|
if (!super.hasPendingResourceRequest(candidates.getPartition(),
|
||||||
clusterResource, schedulingMode)) {
|
clusterResource, schedulingMode)) {
|
||||||
if (LOG.isDebugEnabled()) {
|
if (LOG.isDebugEnabled()) {
|
||||||
LOG.debug("Skip this queue=" + getQueuePath()
|
long now = System.currentTimeMillis();
|
||||||
+ ", because it doesn't need more resource, schedulingMode="
|
// Do logging every 1 sec to avoid excessive logging.
|
||||||
+ schedulingMode.name() + " node-partition=" + candidates
|
if (now - this.lastSkipQueueDebugLoggingTimestamp > 1000) {
|
||||||
.getPartition());
|
LOG.debug("Skip this queue=" + getQueuePath()
|
||||||
|
+ ", because it doesn't need more resource, schedulingMode="
|
||||||
|
+ schedulingMode.name() + " node-partition=" + candidates
|
||||||
|
.getPartition());
|
||||||
|
this.lastSkipQueueDebugLoggingTimestamp = now;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node,
|
ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node,
|
||||||
|
@ -666,12 +678,12 @@ public class ParentQueue extends AbstractCSQueue {
|
||||||
assignment.setIncreasedAllocation(
|
assignment.setIncreasedAllocation(
|
||||||
assignedToChild.isIncreasedAllocation());
|
assignedToChild.isIncreasedAllocation());
|
||||||
|
|
||||||
LOG.info("assignedContainer" + " queue=" + getQueueName()
|
|
||||||
+ " usedCapacity=" + getUsedCapacity() + " absoluteUsedCapacity="
|
|
||||||
+ getAbsoluteUsedCapacity() + " used=" + queueUsage.getUsed()
|
|
||||||
+ " cluster=" + clusterResource);
|
|
||||||
|
|
||||||
if (LOG.isDebugEnabled()) {
|
if (LOG.isDebugEnabled()) {
|
||||||
|
LOG.debug("assignedContainer reserved=" + isReserved + " queue="
|
||||||
|
+ getQueueName() + " usedCapacity=" + getUsedCapacity()
|
||||||
|
+ " absoluteUsedCapacity=" + getAbsoluteUsedCapacity() + " used="
|
||||||
|
+ queueUsage.getUsed() + " cluster=" + clusterResource);
|
||||||
|
|
||||||
LOG.debug(
|
LOG.debug(
|
||||||
"ParentQ=" + getQueueName() + " assignedSoFarInThisIteration="
|
"ParentQ=" + getQueueName() + " assignedSoFarInThisIteration="
|
||||||
+ assignment.getResource() + " usedCapacity="
|
+ assignment.getResource() + " usedCapacity="
|
||||||
|
|
|
@ -93,11 +93,14 @@ public abstract class AbstractContainerAllocator {
|
||||||
assignment.setType(result.getContainerNodeType());
|
assignment.setType(result.getContainerNodeType());
|
||||||
|
|
||||||
if (result.getAllocationState() == AllocationState.RESERVED) {
|
if (result.getAllocationState() == AllocationState.RESERVED) {
|
||||||
// This is a reserved container
|
if (LOG.isDebugEnabled()) {
|
||||||
LOG.info("Reserved container " + " application="
|
// This is a reserved container
|
||||||
+ application.getApplicationId() + " resource=" + allocatedResource
|
// Since re-reservation could happen again and again for already
|
||||||
+ " queue=" + appInfo.getQueueName()
|
// reserved containers. only do this in debug log.
|
||||||
+ " cluster=" + clusterResource);
|
LOG.debug("Reserved container " + " application=" + application
|
||||||
|
.getApplicationId() + " resource=" + allocatedResource + " queue="
|
||||||
|
+ appInfo.getQueueName() + " cluster=" + clusterResource);
|
||||||
|
}
|
||||||
assignment.getAssignmentInformation().addReservationDetails(
|
assignment.getAssignmentInformation().addReservationDetails(
|
||||||
updatedContainer, application.getCSLeafQueue().getQueuePath());
|
updatedContainer, application.getCSLeafQueue().getQueuePath());
|
||||||
assignment.getAssignmentInformation().incrReservations();
|
assignment.getAssignmentInformation().incrReservations();
|
||||||
|
|
|
@ -607,6 +607,11 @@ public class FiCaSchedulerApp extends SchedulerApplicationAttempt {
|
||||||
schedulerContainer.getRmContainer(),
|
schedulerContainer.getRmContainer(),
|
||||||
schedulerContainer.getRmContainer().getContainer(),
|
schedulerContainer.getRmContainer().getContainer(),
|
||||||
reReservation);
|
reReservation);
|
||||||
|
|
||||||
|
LOG.info("Reserved container=" + rmContainer.getContainerId()
|
||||||
|
+ ", on node=" + schedulerContainer.getSchedulerNode()
|
||||||
|
+ " with resource=" + rmContainer
|
||||||
|
.getAllocatedOrReservedResource());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} finally {
|
} finally {
|
||||||
|
|
Loading…
Reference in New Issue