YARN-11414. ClusterMetricsInfo shows wrong availableMB when node labels enabled

This commit is contained in:
Ashutosh Gupta 2023-01-11 23:50:41 +00:00
parent f26d8bc9bd
commit fba44051d4
1 changed file with 40 additions and 19 deletions

View File

@ -27,6 +27,8 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.ParentQueue; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.ParentQueue;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FSParentQueue;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler;
@XmlRootElement(name = "clusterMetrics") @XmlRootElement(name = "clusterMetrics")
@XmlAccessorType(XmlAccessType.FIELD) @XmlAccessorType(XmlAccessType.FIELD)
@ -103,45 +105,64 @@ public class ClusterMetricsInfo {
this.appsRunning = metrics.getAppsRunning(); this.appsRunning = metrics.getAppsRunning();
this.appsFailed = metrics.getAppsFailed(); this.appsFailed = metrics.getAppsFailed();
this.appsKilled = metrics.getAppsKilled(); this.appsKilled = metrics.getAppsKilled();
this.reservedMB = metrics.getReservedMB();
this.availableMB = metrics.getAvailableMB();
this.allocatedMB = metrics.getAllocatedMB();
this.pendingMB = metrics.getPendingMB(); this.pendingMB = metrics.getPendingMB();
this.reservedVirtualCores = metrics.getReservedVirtualCores();
this.availableVirtualCores = metrics.getAvailableVirtualCores();
this.allocatedVirtualCores = metrics.getAllocatedVirtualCores();
this.pendingVirtualCores = metrics.getPendingVirtualCores(); this.pendingVirtualCores = metrics.getPendingVirtualCores();
this.containersAllocated = metrics.getAllocatedContainers(); this.containersAllocated = metrics.getAllocatedContainers();
this.containersPending = metrics.getPendingContainers(); this.containersPending = metrics.getPendingContainers();
this.containersReserved = metrics.getReservedContainers(); this.containersReserved = metrics.getReservedContainers();
this.totalMB = new ResourceInfo(rs.getClusterResource()).getMemorySize();
this.totalVirtualCores = new ResourceInfo(rs.getClusterResource()).getvCores();
if (rs instanceof CapacityScheduler) { if (rs instanceof CapacityScheduler) {
CapacityScheduler cs = (CapacityScheduler) rs; CapacityScheduler cs = (CapacityScheduler) rs;
this.totalMB = availableMB + allocatedMB + reservedMB;
this.totalVirtualCores =
availableVirtualCores + allocatedVirtualCores + reservedVirtualCores;
// TODO, add support of other schedulers to get total used resources // TODO, add support of other schedulers to get total used resources
// across partition. // across partition.
if (cs.getRootQueue() != null if (cs.getRootQueue() != null && cs.getRootQueue().getQueueResourceUsage() != null
&& cs.getRootQueue().getQueueResourceUsage() != null
&& cs.getRootQueue().getQueueResourceUsage().getAllUsed() != null) { && cs.getRootQueue().getQueueResourceUsage().getAllUsed() != null) {
totalUsedResourcesAcrossPartition = new ResourceInfo( totalUsedResourcesAcrossPartition =
cs.getRootQueue().getQueueResourceUsage().getAllUsed()); new ResourceInfo(cs.getRootQueue().getQueueResourceUsage().getAllUsed());
totalClusterResourcesAcrossPartition = new ResourceInfo( totalClusterResourcesAcrossPartition = new ResourceInfo(cs.getClusterResource());
cs.getClusterResource()); totalReservedResourcesAcrossPartition =
totalReservedResourcesAcrossPartition = new ResourceInfo( new ResourceInfo(cs.getRootQueue().getQueueResourceUsage().getAllReserved());
cs.getRootQueue().getQueueResourceUsage().getAllReserved());
totalAllocatedContainersAcrossPartition = totalAllocatedContainersAcrossPartition =
((ParentQueue) cs.getRootQueue()).getNumContainers(); ((ParentQueue) cs.getRootQueue()).getNumContainers();
crossPartitionMetricsAvailable = true; crossPartitionMetricsAvailable = true;
this.allocatedMB = totalUsedResourcesAcrossPartition.getMemorySize();
this.allocatedVirtualCores = totalUsedResourcesAcrossPartition.getvCores();
this.reservedMB = totalReservedResourcesAcrossPartition.getMemorySize();
this.reservedVirtualCores = totalReservedResourcesAcrossPartition.getvCores();
} }
this.availableMB = this.totalMB - this.allocatedMB;
this.availableVirtualCores = this.totalVirtualCores - this.allocatedVirtualCores;
} else if (rs instanceof FairScheduler) {
FairScheduler fs = (FairScheduler) rs;
if (fs.getQueueManager().getRootQueue() != null) {
FSParentQueue rootQueue = fs.getQueueManager().getRootQueue();
this.allocatedMB = rootQueue.getResourceUsage().getMemorySize();
this.allocatedVirtualCores = rootQueue.getResourceUsage().getVirtualCores();
this.reservedMB = rootQueue.getReservedResource().getMemorySize();
this.reservedVirtualCores = rootQueue.getReservedResource().getVirtualCores();
}
this.availableMB = this.totalMB - this.allocatedMB;
this.availableVirtualCores = this.totalVirtualCores - this.allocatedVirtualCores;
} else { } else {
this.reservedMB = metrics.getReservedMB();
this.availableMB = metrics.getAvailableMB();
this.allocatedMB = metrics.getAllocatedMB();
this.reservedVirtualCores = metrics.getReservedVirtualCores();
this.availableVirtualCores = metrics.getAvailableVirtualCores();
this.allocatedVirtualCores = metrics.getAllocatedVirtualCores();
this.totalMB = availableMB + allocatedMB; this.totalMB = availableMB + allocatedMB;
this.totalVirtualCores = availableVirtualCores + allocatedVirtualCores; this.totalVirtualCores = availableVirtualCores + allocatedVirtualCores;
} }
long baseMem = this.totalMB; long baseMem = this.totalMB;
this.utilizedMBPercent = baseMem <= 0 ? 0 : this.utilizedMBPercent = baseMem <= 0 ? 0 :
(int) (clusterMetrics.getUtilizedMB() * 100 / baseMem); (int) (clusterMetrics.getUtilizedMB() * 100 / baseMem);