YARN-3272. Surface container locality info in RM web UI (Jian He via wangda)

This commit is contained in:
Wangda Tan 2015-03-03 11:49:01 -08:00
parent 1004473aa6
commit e17e5ba9d7
8 changed files with 163 additions and 40 deletions

View File

@ -348,6 +348,9 @@ Release 2.7.0 - UNRELEASED
YARN-3281. Added RMStateStore to StateMachine visualization list. YARN-3281. Added RMStateStore to StateMachine visualization list.
(Chengbing Liu via jianhe) (Chengbing Liu via jianhe)
YARN-3272. Surface container locality info in RM web UI.
(Jian He via wangda)
OPTIMIZATIONS OPTIMIZATIONS
YARN-2990. FairScheduler's delay-scheduling always waits for node-local and YARN-2990. FairScheduler's delay-scheduling always waits for node-local and

View File

@ -62,6 +62,13 @@
<Class name="~org\.apache\.hadoop\.yarn\.server\.resourcemanager\.rmapp\.attempt\.RMAppAttemptImpl.*" /> <Class name="~org\.apache\.hadoop\.yarn\.server\.resourcemanager\.rmapp\.attempt\.RMAppAttemptImpl.*" />
<Bug pattern="BC_UNCONFIRMED_CAST" /> <Bug pattern="BC_UNCONFIRMED_CAST" />
</Match> </Match>
<!--
  RMAppAttemptMetrics exclusions. Each warning gets its own Match element:
  all children of a single Match are AND-ed together, so listing two Method
  (or two Bug) elements in one Match can never match any bug instance.
-->
<!-- getLocalityStatistics hands out the internal locality matrix. -->
<Match>
<Class name="~org\.apache\.hadoop\.yarn\.server\.resourcemanager\.rmapp\.attempt\.RMAppAttemptMetrics" />
<Method name="getLocalityStatistics" />
<Bug pattern="EI_EXPOSE_REP" />
</Match>
<!-- incNumAllocatedContainers increments a volatile counter. -->
<Match>
<Class name="~org\.apache\.hadoop\.yarn\.server\.resourcemanager\.rmapp\.attempt\.RMAppAttemptMetrics" />
<Method name="incNumAllocatedContainers"/>
<Bug pattern="VO_VOLATILE_INCREMENT" />
</Match>
<Match> <Match>
<Class name="org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl$AppRejectedTransition" /> <Class name="org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl$AppRejectedTransition" />
<Bug pattern="BC_UNCONFIRMED_CAST" /> <Bug pattern="BC_UNCONFIRMED_CAST" />

View File

@ -32,6 +32,7 @@
import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.server.resourcemanager.RMContext; import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.NodeType;
import org.apache.hadoop.yarn.util.resource.Resources; import org.apache.hadoop.yarn.util.resource.Resources;
public class RMAppAttemptMetrics { public class RMAppAttemptMetrics {
@ -49,6 +50,10 @@ public class RMAppAttemptMetrics {
private AtomicLong finishedVcoreSeconds = new AtomicLong(0); private AtomicLong finishedVcoreSeconds = new AtomicLong(0);
private RMContext rmContext; private RMContext rmContext;
private int[][] localityStatistics =
new int[NodeType.values().length][NodeType.values().length];
private volatile int totalAllocatedContainers;
public RMAppAttemptMetrics(ApplicationAttemptId attemptId, public RMAppAttemptMetrics(ApplicationAttemptId attemptId,
RMContext rmContext) { RMContext rmContext) {
this.attemptId = attemptId; this.attemptId = attemptId;
@ -126,4 +131,18 @@ public void updateAggregateAppResourceUsage(long finishedMemorySeconds,
this.finishedMemorySeconds.addAndGet(finishedMemorySeconds); this.finishedMemorySeconds.addAndGet(finishedMemorySeconds);
this.finishedVcoreSeconds.addAndGet(finishedVcoreSeconds); this.finishedVcoreSeconds.addAndGet(finishedVcoreSeconds);
} }
/**
 * Records one more allocated container for this attempt.
 *
 * Bumps the locality counter addressed by
 * {@code [containerType.index][requestType.index]} and then the overall
 * allocated-container total.
 *
 * NOTE(review): neither increment is atomic; this presumably relies on the
 * caller serializing allocations - confirm against the scheduler.
 *
 * @param containerType locality of the container that was allocated
 * @param requestType locality of the resource request it is counted against
 */
public void incNumAllocatedContainers(NodeType containerType,
    NodeType requestType) {
  final int containerIdx = containerType.index;
  final int requestIdx = requestType.index;
  localityStatistics[containerIdx][requestIdx] += 1;
  totalAllocatedContainers += 1;
}
/**
 * Returns a snapshot of the container-locality counters.
 *
 * The matrix is indexed as {@code [containerType.index][requestType.index]}.
 * A deep copy is returned so that callers cannot modify the counters kept
 * by this object through the returned array.
 *
 * @return a newly allocated copy of the locality statistics matrix
 */
public int[][] getLocalityStatistics() {
  int[][] snapshot = new int[localityStatistics.length][];
  for (int i = 0; i < snapshot.length; i++) {
    // clone() on an int[] copies the values, so rows are not shared either.
    snapshot[i] = localityStatistics[i].clone();
  }
  return snapshot;
}
/**
 * Returns the current value of the allocated-container counter.
 *
 * @return the total number of allocated containers recorded so far
 */
public int getTotalAllocatedContainers() {
  return totalAllocatedContainers;
}
} }

View File

@ -22,7 +22,10 @@
* Resource classification. * Resource classification.
*/ */
public enum NodeType { public enum NodeType {
NODE_LOCAL, NODE_LOCAL(0), RACK_LOCAL(1), OFF_SWITCH(2);
RACK_LOCAL, public int index;
OFF_SWITCH
private NodeType(int index) {
this.index = index;
}
} }

View File

@ -46,6 +46,7 @@
import org.apache.hadoop.yarn.api.records.ResourceRequest; import org.apache.hadoop.yarn.api.records.ResourceRequest;
import org.apache.hadoop.yarn.server.resourcemanager.RMContext; import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.AggregateAppResourceUsage; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.AggregateAppResourceUsage;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState;
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEvent;
@ -78,7 +79,7 @@ public class SchedulerApplicationAttempt {
private long lastVcoreSeconds = 0; private long lastVcoreSeconds = 0;
protected final AppSchedulingInfo appSchedulingInfo; protected final AppSchedulingInfo appSchedulingInfo;
protected ApplicationAttemptId attemptId;
protected Map<ContainerId, RMContainer> liveContainers = protected Map<ContainerId, RMContainer> liveContainers =
new HashMap<ContainerId, RMContainer>(); new HashMap<ContainerId, RMContainer>();
protected final Map<Priority, Map<NodeId, RMContainer>> reservedContainers = protected final Map<Priority, Map<NodeId, RMContainer>> reservedContainers =
@ -132,6 +133,7 @@ public SchedulerApplicationAttempt(ApplicationAttemptId applicationAttemptId,
activeUsersManager, rmContext.getEpoch()); activeUsersManager, rmContext.getEpoch());
this.queue = queue; this.queue = queue;
this.pendingRelease = new HashSet<ContainerId>(); this.pendingRelease = new HashSet<ContainerId>();
this.attemptId = applicationAttemptId;
if (rmContext.getRMApps() != null && if (rmContext.getRMApps() != null &&
rmContext.getRMApps() rmContext.getRMApps()
.containsKey(applicationAttemptId.getApplicationId())) { .containsKey(applicationAttemptId.getApplicationId())) {
@ -619,4 +621,15 @@ public synchronized void recoverContainer(RMContainer rmContainer) {
// schedulingOpportunities // schedulingOpportunities
// lastScheduledContainer // lastScheduledContainer
} }
/**
 * Forwards a container allocation to the locality metrics of the
 * application's current attempt.
 *
 * The update is skipped when the RM context has no application map, when
 * the application of this attempt is not in that map, or when the
 * application has no current attempt.
 *
 * @param containerType locality of the container that was allocated
 * @param requestType locality of the resource request it is counted against
 */
public void incNumAllocatedContainers(NodeType containerType,
    NodeType requestType) {
  // The constructor tolerates a missing application map, so do the same here.
  if (rmContext.getRMApps() == null) {
    return;
  }
  // Fully qualified to avoid depending on an import that may not exist here.
  org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp app =
      rmContext.getRMApps().get(attemptId.getApplicationId());
  if (app == null) {
    // Application is not registered: there is nothing to record against.
    return;
  }
  RMAppAttempt attempt = app.getCurrentAppAttempt();
  if (attempt != null) {
    attempt.getRMAppAttemptMetrics().incNumAllocatedContainers(containerType,
        requestType);
  }
}
} }

View File

@ -32,6 +32,7 @@
import java.util.TreeSet; import java.util.TreeSet;
import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.mutable.MutableObject;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience.Private; import org.apache.hadoop.classification.InterfaceAudience.Private;
@ -1242,15 +1243,25 @@ private CSAssignment assignContainersOnNode(Resource clusterResource,
RMContainer reservedContainer, boolean needToUnreserve) { RMContainer reservedContainer, boolean needToUnreserve) {
Resource assigned = Resources.none(); Resource assigned = Resources.none();
NodeType requestType = null;
MutableObject allocatedContainer = new MutableObject();
// Data-local // Data-local
ResourceRequest nodeLocalResourceRequest = ResourceRequest nodeLocalResourceRequest =
application.getResourceRequest(priority, node.getNodeName()); application.getResourceRequest(priority, node.getNodeName());
if (nodeLocalResourceRequest != null) { if (nodeLocalResourceRequest != null) {
requestType = NodeType.NODE_LOCAL;
assigned = assigned =
assignNodeLocalContainers(clusterResource, nodeLocalResourceRequest, assignNodeLocalContainers(clusterResource, nodeLocalResourceRequest,
node, application, priority, reservedContainer, needToUnreserve); node, application, priority, reservedContainer, needToUnreserve,
allocatedContainer);
if (Resources.greaterThan(resourceCalculator, clusterResource, if (Resources.greaterThan(resourceCalculator, clusterResource,
assigned, Resources.none())) { assigned, Resources.none())) {
//update locality statistics
if (allocatedContainer.getValue() != null) {
application.incNumAllocatedContainers(NodeType.NODE_LOCAL,
requestType);
}
return new CSAssignment(assigned, NodeType.NODE_LOCAL); return new CSAssignment(assigned, NodeType.NODE_LOCAL);
} }
} }
@ -1263,11 +1274,22 @@ private CSAssignment assignContainersOnNode(Resource clusterResource,
return SKIP_ASSIGNMENT; return SKIP_ASSIGNMENT;
} }
if (requestType != NodeType.NODE_LOCAL) {
requestType = NodeType.RACK_LOCAL;
}
assigned = assigned =
assignRackLocalContainers(clusterResource, rackLocalResourceRequest, assignRackLocalContainers(clusterResource, rackLocalResourceRequest,
node, application, priority, reservedContainer, needToUnreserve); node, application, priority, reservedContainer, needToUnreserve,
allocatedContainer);
if (Resources.greaterThan(resourceCalculator, clusterResource, if (Resources.greaterThan(resourceCalculator, clusterResource,
assigned, Resources.none())) { assigned, Resources.none())) {
//update locality statistics
if (allocatedContainer.getValue() != null) {
application.incNumAllocatedContainers(NodeType.RACK_LOCAL,
requestType);
}
return new CSAssignment(assigned, NodeType.RACK_LOCAL); return new CSAssignment(assigned, NodeType.RACK_LOCAL);
} }
} }
@ -1279,11 +1301,21 @@ private CSAssignment assignContainersOnNode(Resource clusterResource,
if (!offSwitchResourceRequest.getRelaxLocality()) { if (!offSwitchResourceRequest.getRelaxLocality()) {
return SKIP_ASSIGNMENT; return SKIP_ASSIGNMENT;
} }
if (requestType != NodeType.NODE_LOCAL
&& requestType != NodeType.RACK_LOCAL) {
requestType = NodeType.OFF_SWITCH;
}
return new CSAssignment(assignOffSwitchContainers(clusterResource, assigned =
offSwitchResourceRequest, node, application, priority, assignOffSwitchContainers(clusterResource, offSwitchResourceRequest,
reservedContainer, needToUnreserve), node, application, priority, reservedContainer, needToUnreserve,
NodeType.OFF_SWITCH); allocatedContainer);
// update locality statistics
if (allocatedContainer.getValue() != null) {
application.incNumAllocatedContainers(NodeType.OFF_SWITCH, requestType);
}
return new CSAssignment(assigned, NodeType.OFF_SWITCH);
} }
return SKIP_ASSIGNMENT; return SKIP_ASSIGNMENT;
@ -1370,40 +1402,43 @@ protected boolean checkLimitsToReserve(Resource clusterResource,
private Resource assignNodeLocalContainers(Resource clusterResource, private Resource assignNodeLocalContainers(Resource clusterResource,
ResourceRequest nodeLocalResourceRequest, FiCaSchedulerNode node, ResourceRequest nodeLocalResourceRequest, FiCaSchedulerNode node,
FiCaSchedulerApp application, Priority priority, FiCaSchedulerApp application, Priority priority,
RMContainer reservedContainer, boolean needToUnreserve) { RMContainer reservedContainer, boolean needToUnreserve,
MutableObject allocatedContainer) {
if (canAssign(application, priority, node, NodeType.NODE_LOCAL, if (canAssign(application, priority, node, NodeType.NODE_LOCAL,
reservedContainer)) { reservedContainer)) {
return assignContainer(clusterResource, node, application, priority, return assignContainer(clusterResource, node, application, priority,
nodeLocalResourceRequest, NodeType.NODE_LOCAL, reservedContainer, nodeLocalResourceRequest, NodeType.NODE_LOCAL, reservedContainer,
needToUnreserve); needToUnreserve, allocatedContainer);
} }
return Resources.none(); return Resources.none();
} }
private Resource assignRackLocalContainers(Resource clusterResource, private Resource assignRackLocalContainers(
ResourceRequest rackLocalResourceRequest, FiCaSchedulerNode node, Resource clusterResource, ResourceRequest rackLocalResourceRequest,
FiCaSchedulerApp application, Priority priority, FiCaSchedulerNode node, FiCaSchedulerApp application, Priority priority,
RMContainer reservedContainer, boolean needToUnreserve) { RMContainer reservedContainer, boolean needToUnreserve,
MutableObject allocatedContainer) {
if (canAssign(application, priority, node, NodeType.RACK_LOCAL, if (canAssign(application, priority, node, NodeType.RACK_LOCAL,
reservedContainer)) { reservedContainer)) {
return assignContainer(clusterResource, node, application, priority, return assignContainer(clusterResource, node, application, priority,
rackLocalResourceRequest, NodeType.RACK_LOCAL, reservedContainer, rackLocalResourceRequest, NodeType.RACK_LOCAL, reservedContainer,
needToUnreserve); needToUnreserve, allocatedContainer);
} }
return Resources.none(); return Resources.none();
} }
private Resource assignOffSwitchContainers(Resource clusterResource, private Resource assignOffSwitchContainers(
ResourceRequest offSwitchResourceRequest, FiCaSchedulerNode node, Resource clusterResource, ResourceRequest offSwitchResourceRequest,
FiCaSchedulerApp application, Priority priority, FiCaSchedulerNode node, FiCaSchedulerApp application, Priority priority,
RMContainer reservedContainer, boolean needToUnreserve) { RMContainer reservedContainer, boolean needToUnreserve,
MutableObject allocatedContainer) {
if (canAssign(application, priority, node, NodeType.OFF_SWITCH, if (canAssign(application, priority, node, NodeType.OFF_SWITCH,
reservedContainer)) { reservedContainer)) {
return assignContainer(clusterResource, node, application, priority, return assignContainer(clusterResource, node, application, priority,
offSwitchResourceRequest, NodeType.OFF_SWITCH, reservedContainer, offSwitchResourceRequest, NodeType.OFF_SWITCH, reservedContainer,
needToUnreserve); needToUnreserve, allocatedContainer);
} }
return Resources.none(); return Resources.none();
@ -1487,7 +1522,7 @@ Container createContainer(FiCaSchedulerApp application, FiCaSchedulerNode node,
private Resource assignContainer(Resource clusterResource, FiCaSchedulerNode node, private Resource assignContainer(Resource clusterResource, FiCaSchedulerNode node,
FiCaSchedulerApp application, Priority priority, FiCaSchedulerApp application, Priority priority,
ResourceRequest request, NodeType type, RMContainer rmContainer, ResourceRequest request, NodeType type, RMContainer rmContainer,
boolean needToUnreserve) { boolean needToUnreserve, MutableObject createdContainer) {
if (LOG.isDebugEnabled()) { if (LOG.isDebugEnabled()) {
LOG.debug("assignContainers: node=" + node.getNodeName() LOG.debug("assignContainers: node=" + node.getNodeName()
+ " application=" + application.getApplicationId() + " application=" + application.getApplicationId()
@ -1592,7 +1627,7 @@ private Resource assignContainer(Resource clusterResource, FiCaSchedulerNode nod
" container=" + container + " container=" + container +
" queue=" + this + " queue=" + this +
" clusterResource=" + clusterResource); " clusterResource=" + clusterResource);
createdContainer.setValue(allocatedContainer);
return container.getResource(); return container.getResource();
} else { } else {
// if we are allowed to allocate but this node doesn't have space, reserve it or // if we are allowed to allocate but this node doesn't have space, reserve it or

View File

@ -204,9 +204,46 @@ protected void render(Block html) {
table._(); table._();
div._(); div._();
createContainerLocalityTable(html, attemptMetrics);
createResourceRequestsTable(html, app); createResourceRequestsTable(html, app);
} }
/**
 * Renders the "container locality" section of the page.
 *
 * Emits a heading with the total number of allocated containers, a short
 * legend, and a table with one row per container locality and one column
 * per request locality. Cells whose column index is greater than the row
 * index are left blank. Nothing is rendered when no metrics are supplied.
 *
 * @param html block the section is appended to
 * @param attemptMetrics metrics of the attempt to display; may be null
 */
private void createContainerLocalityTable(Block html,
    RMAppAttemptMetrics attemptMetrics) {
  if (attemptMetrics == null) {
    return;
  }

  DIV<Hamlet> div = html.div(_INFO_WRAP);
  String totalHeading =
      "Total Allocated Containers: "
          + attemptMetrics.getTotalAllocatedContainers();
  String legend = "Each table cell"
      + " represents the number of NodeLocal/RackLocal/OffSwitch containers"
      + " satisfied by NodeLocal/RackLocal/OffSwitch resource requests.";
  TABLE<DIV<Hamlet>> table =
      div.h3(totalHeading).h3(legend).table("#containerLocality");

  // Header row: an empty corner cell followed by one column per request type.
  table.tr()
      .th(_TH, "")
      .th(_TH, "Node Local Request")
      .th(_TH, "Rack Local Request")
      .th(_TH, "Off Switch Request")
      ._();

  String[] rowLabels = {
      "Num Node Local Containers (satisfied by)",
      "Num Rack Local Containers (satisfied by)",
      "Num Off Switch Containers (satisfied by)" };

  int[][] stats = attemptMetrics.getLocalityStatistics();
  for (int i = 0; i < stats.length; i++) {
    int[] row = stats[i];
    // Rows alternate odd/even styling, starting with odd for the first row.
    table.tr(i % 2 == 0 ? _ODD : _EVEN)
        .td(rowLabels[i])
        .td(String.valueOf(row[0]))
        .td(i >= 1 ? String.valueOf(row[1]) : "")
        .td(i >= 2 ? String.valueOf(row[2]) : "")
        ._();
  }

  table._();
  div._();
}
private void createResourceRequestsTable(Block html, AppInfo app) { private void createResourceRequestsTable(Block html, AppInfo app) {
TBODY<TABLE<Hamlet>> tbody = TBODY<TABLE<Hamlet>> tbody =
html.table("#ResourceRequests").thead().tr() html.table("#ResourceRequests").thead().tr()

View File

@ -52,6 +52,7 @@
import org.apache.hadoop.yarn.server.resourcemanager.RMContext; import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
import org.apache.hadoop.yarn.server.resourcemanager.ahs.RMApplicationHistoryWriter; import org.apache.hadoop.yarn.server.resourcemanager.ahs.RMApplicationHistoryWriter;
import org.apache.hadoop.yarn.server.resourcemanager.metrics.SystemMetricsPublisher; import org.apache.hadoop.yarn.server.resourcemanager.metrics.SystemMetricsPublisher;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.ContainerAllocationExpirer; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.ContainerAllocationExpirer;
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEventType; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEventType;
@ -218,6 +219,7 @@ public void testReservation() throws Exception {
.getMockApplicationAttemptId(0, 0); .getMockApplicationAttemptId(0, 0);
FiCaSchedulerApp app_0 = new FiCaSchedulerApp(appAttemptId_0, user_0, a, FiCaSchedulerApp app_0 = new FiCaSchedulerApp(appAttemptId_0, user_0, a,
mock(ActiveUsersManager.class), spyRMContext); mock(ActiveUsersManager.class), spyRMContext);
rmContext.getRMApps().put(app_0.getApplicationId(), mock(RMApp.class));
a.submitApplicationAttempt(app_0, user_0); a.submitApplicationAttempt(app_0, user_0);
@ -373,6 +375,7 @@ public void testReservationNoContinueLook() throws Exception {
.getMockApplicationAttemptId(0, 0); .getMockApplicationAttemptId(0, 0);
FiCaSchedulerApp app_0 = new FiCaSchedulerApp(appAttemptId_0, user_0, a, FiCaSchedulerApp app_0 = new FiCaSchedulerApp(appAttemptId_0, user_0, a,
mock(ActiveUsersManager.class), spyRMContext); mock(ActiveUsersManager.class), spyRMContext);
rmContext.getRMApps().put(app_0.getApplicationId(), mock(RMApp.class));
a.submitApplicationAttempt(app_0, user_0); a.submitApplicationAttempt(app_0, user_0);
@ -524,6 +527,7 @@ public void testAssignContainersNeedToUnreserve() throws Exception {
.getMockApplicationAttemptId(0, 0); .getMockApplicationAttemptId(0, 0);
FiCaSchedulerApp app_0 = new FiCaSchedulerApp(appAttemptId_0, user_0, a, FiCaSchedulerApp app_0 = new FiCaSchedulerApp(appAttemptId_0, user_0, a,
mock(ActiveUsersManager.class), spyRMContext); mock(ActiveUsersManager.class), spyRMContext);
rmContext.getRMApps().put(app_0.getApplicationId(), mock(RMApp.class));
a.submitApplicationAttempt(app_0, user_0); a.submitApplicationAttempt(app_0, user_0);
@ -765,6 +769,7 @@ public void testAssignToQueue() throws Exception {
.getMockApplicationAttemptId(0, 0); .getMockApplicationAttemptId(0, 0);
FiCaSchedulerApp app_0 = new FiCaSchedulerApp(appAttemptId_0, user_0, a, FiCaSchedulerApp app_0 = new FiCaSchedulerApp(appAttemptId_0, user_0, a,
mock(ActiveUsersManager.class), spyRMContext); mock(ActiveUsersManager.class), spyRMContext);
rmContext.getRMApps().put(app_0.getApplicationId(), mock(RMApp.class));
a.submitApplicationAttempt(app_0, user_0); a.submitApplicationAttempt(app_0, user_0);
@ -943,7 +948,7 @@ public void testAssignToUser() throws Exception {
.getMockApplicationAttemptId(0, 0); .getMockApplicationAttemptId(0, 0);
FiCaSchedulerApp app_0 = new FiCaSchedulerApp(appAttemptId_0, user_0, a, FiCaSchedulerApp app_0 = new FiCaSchedulerApp(appAttemptId_0, user_0, a,
mock(ActiveUsersManager.class), spyRMContext); mock(ActiveUsersManager.class), spyRMContext);
rmContext.getRMApps().put(app_0.getApplicationId(), mock(RMApp.class));
a.submitApplicationAttempt(app_0, user_0); a.submitApplicationAttempt(app_0, user_0);
final ApplicationAttemptId appAttemptId_1 = TestUtils final ApplicationAttemptId appAttemptId_1 = TestUtils
@ -1073,6 +1078,7 @@ public void testReservationsNoneAvailable() throws Exception {
.getMockApplicationAttemptId(0, 0); .getMockApplicationAttemptId(0, 0);
FiCaSchedulerApp app_0 = new FiCaSchedulerApp(appAttemptId_0, user_0, a, FiCaSchedulerApp app_0 = new FiCaSchedulerApp(appAttemptId_0, user_0, a,
mock(ActiveUsersManager.class), spyRMContext); mock(ActiveUsersManager.class), spyRMContext);
rmContext.getRMApps().put(app_0.getApplicationId(), mock(RMApp.class));
a.submitApplicationAttempt(app_0, user_0); a.submitApplicationAttempt(app_0, user_0);