YARN-4712. CPU Usage Metric is not captured properly in YARN-2928. (Naganarasimha G R via Varun Saxena)
This commit is contained in:
parent
9bdd455dce
commit
6f6cc647d6
|
@ -22,7 +22,6 @@ import java.util.Map;
|
||||||
import java.util.Map.Entry;
|
import java.util.Map.Entry;
|
||||||
import java.util.concurrent.ConcurrentHashMap;
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
|
|
||||||
import com.google.common.annotations.VisibleForTesting;
|
|
||||||
import org.apache.commons.logging.Log;
|
import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
import org.apache.hadoop.classification.InterfaceAudience.Private;
|
import org.apache.hadoop.classification.InterfaceAudience.Private;
|
||||||
|
@ -32,11 +31,10 @@ import org.apache.hadoop.util.StringUtils.TraditionalBinaryPrefix;
|
||||||
import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
|
import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
|
||||||
import org.apache.hadoop.yarn.api.records.ContainerId;
|
import org.apache.hadoop.yarn.api.records.ContainerId;
|
||||||
import org.apache.hadoop.yarn.api.records.Resource;
|
import org.apache.hadoop.yarn.api.records.Resource;
|
||||||
|
import org.apache.hadoop.yarn.api.records.ResourceUtilization;
|
||||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||||
import org.apache.hadoop.yarn.event.AsyncDispatcher;
|
import org.apache.hadoop.yarn.event.AsyncDispatcher;
|
||||||
import org.apache.hadoop.yarn.event.Dispatcher;
|
import org.apache.hadoop.yarn.event.Dispatcher;
|
||||||
import org.apache.hadoop.yarn.api.records.ResourceUtilization;
|
|
||||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor;
|
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.Context;
|
import org.apache.hadoop.yarn.server.nodemanager.Context;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
|
||||||
|
@ -47,6 +45,7 @@ import org.apache.hadoop.yarn.server.nodemanager.util.NodeManagerHardwareUtils;
|
||||||
import org.apache.hadoop.yarn.util.ResourceCalculatorPlugin;
|
import org.apache.hadoop.yarn.util.ResourceCalculatorPlugin;
|
||||||
import org.apache.hadoop.yarn.util.ResourceCalculatorProcessTree;
|
import org.apache.hadoop.yarn.util.ResourceCalculatorProcessTree;
|
||||||
|
|
||||||
|
import com.google.common.annotations.VisibleForTesting;
|
||||||
import com.google.common.base.Preconditions;
|
import com.google.common.base.Preconditions;
|
||||||
|
|
||||||
public class ContainersMonitorImpl extends AbstractService implements
|
public class ContainersMonitorImpl extends AbstractService implements
|
||||||
|
@ -576,8 +575,8 @@ public class ContainersMonitorImpl extends AbstractService implements
|
||||||
NMTimelinePublisher nmMetricsPublisher =
|
NMTimelinePublisher nmMetricsPublisher =
|
||||||
container.getNMTimelinePublisher();
|
container.getNMTimelinePublisher();
|
||||||
if (nmMetricsPublisher != null) {
|
if (nmMetricsPublisher != null) {
|
||||||
nmMetricsPublisher.reportContainerResourceUsage(container, pId,
|
nmMetricsPublisher.reportContainerResourceUsage(container,
|
||||||
currentPmemUsage, cpuUsageTotalCoresPercentage);
|
currentPmemUsage, cpuUsagePercentPerCore);
|
||||||
}
|
}
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
// Log the exception and proceed to the next container.
|
// Log the exception and proceed to the next container.
|
||||||
|
|
|
@ -113,29 +113,28 @@ public class NMTimelinePublisher extends CompositeService {
|
||||||
}
|
}
|
||||||
|
|
||||||
@SuppressWarnings("unchecked")
|
@SuppressWarnings("unchecked")
|
||||||
public void reportContainerResourceUsage(Container container, String pId,
|
public void reportContainerResourceUsage(Container container, Long pmemUsage,
|
||||||
Long pmemUsage, Float cpuUsageTotalCoresPercentage) {
|
Float cpuUsagePercentPerCore) {
|
||||||
if (pmemUsage != ResourceCalculatorProcessTree.UNAVAILABLE ||
|
if (pmemUsage != ResourceCalculatorProcessTree.UNAVAILABLE ||
|
||||||
cpuUsageTotalCoresPercentage !=
|
cpuUsagePercentPerCore != ResourceCalculatorProcessTree.UNAVAILABLE) {
|
||||||
ResourceCalculatorProcessTree.UNAVAILABLE) {
|
|
||||||
ContainerEntity entity =
|
ContainerEntity entity =
|
||||||
createContainerEntity(container.getContainerId());
|
createContainerEntity(container.getContainerId());
|
||||||
long currentTimeMillis = System.currentTimeMillis();
|
long currentTimeMillis = System.currentTimeMillis();
|
||||||
if (pmemUsage != ResourceCalculatorProcessTree.UNAVAILABLE) {
|
if (pmemUsage != ResourceCalculatorProcessTree.UNAVAILABLE) {
|
||||||
TimelineMetric memoryMetric = new TimelineMetric();
|
TimelineMetric memoryMetric = new TimelineMetric();
|
||||||
memoryMetric.setId(ContainerMetric.MEMORY.toString() + pId);
|
memoryMetric.setId(ContainerMetric.MEMORY.toString());
|
||||||
memoryMetric.addValue(currentTimeMillis, pmemUsage);
|
memoryMetric.addValue(currentTimeMillis, pmemUsage);
|
||||||
entity.addMetric(memoryMetric);
|
entity.addMetric(memoryMetric);
|
||||||
}
|
}
|
||||||
if (cpuUsageTotalCoresPercentage !=
|
if (cpuUsagePercentPerCore != ResourceCalculatorProcessTree.UNAVAILABLE) {
|
||||||
ResourceCalculatorProcessTree.UNAVAILABLE) {
|
|
||||||
TimelineMetric cpuMetric = new TimelineMetric();
|
TimelineMetric cpuMetric = new TimelineMetric();
|
||||||
cpuMetric.setId(ContainerMetric.CPU.toString() + pId);
|
cpuMetric.setId(ContainerMetric.CPU.toString());
|
||||||
cpuMetric.addValue(currentTimeMillis, cpuUsageTotalCoresPercentage);
|
cpuMetric.addValue(currentTimeMillis,
|
||||||
|
Math.round(cpuUsagePercentPerCore));
|
||||||
entity.addMetric(cpuMetric);
|
entity.addMetric(cpuMetric);
|
||||||
}
|
}
|
||||||
dispatcher.getEventHandler().handle(
|
dispatcher.getEventHandler()
|
||||||
new TimelinePublishEvent(entity, container.getContainerId()
|
.handle(new TimelinePublishEvent(entity, container.getContainerId()
|
||||||
.getApplicationAttemptId().getApplicationId()));
|
.getApplicationAttemptId().getApplicationId()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,157 @@
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.server.nodemanager.timelineservice;
|
||||||
|
|
||||||
|
import static org.junit.Assert.assertEquals;
|
||||||
|
import static org.junit.Assert.assertNotNull;
|
||||||
|
import static org.mockito.Matchers.any;
|
||||||
|
import static org.mockito.Mockito.mock;
|
||||||
|
import static org.mockito.Mockito.when;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.Map.Entry;
|
||||||
|
import java.util.concurrent.ConcurrentMap;
|
||||||
|
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
|
||||||
|
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
||||||
|
import org.apache.hadoop.yarn.api.records.ContainerId;
|
||||||
|
import org.apache.hadoop.yarn.api.records.NodeId;
|
||||||
|
import org.apache.hadoop.yarn.api.records.timelineservice.TimelineEntity;
|
||||||
|
import org.apache.hadoop.yarn.api.records.timelineservice.TimelineMetric;
|
||||||
|
import org.apache.hadoop.yarn.client.api.impl.TimelineClientImpl;
|
||||||
|
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||||
|
import org.apache.hadoop.yarn.server.nodemanager.Context;
|
||||||
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
|
||||||
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
|
||||||
|
import org.apache.hadoop.yarn.util.ResourceCalculatorProcessTree;
|
||||||
|
import org.junit.Assert;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
public class TestNMTimelinePublisher {
|
||||||
|
private static final String MEMORY_ID = "MEMORY";
|
||||||
|
private static final String CPU_ID = "CPU";
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testContainerResourceUsage() {
|
||||||
|
Context context = mock(Context.class);
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
ConcurrentMap<ApplicationId, Application> map = mock(ConcurrentMap.class);
|
||||||
|
Application aApp = mock(Application.class);
|
||||||
|
when(map.get(any(ApplicationId.class))).thenReturn(aApp);
|
||||||
|
DummyTimelineClient timelineClient = new DummyTimelineClient();
|
||||||
|
when(aApp.getTimelineClient()).thenReturn(timelineClient);
|
||||||
|
when(context.getApplications()).thenReturn(map);
|
||||||
|
when(context.getNodeId()).thenReturn(NodeId.newInstance("localhost", 0));
|
||||||
|
when(context.getHttpPort()).thenReturn(0);
|
||||||
|
NMTimelinePublisher publisher = new NMTimelinePublisher(context);
|
||||||
|
publisher.init(new Configuration());
|
||||||
|
publisher.start();
|
||||||
|
Container aContainer = mock(Container.class);
|
||||||
|
when(aContainer.getContainerId()).thenReturn(ContainerId.newContainerId(
|
||||||
|
ApplicationAttemptId.newInstance(ApplicationId.newInstance(0, 1), 1),
|
||||||
|
0L));
|
||||||
|
publisher.reportContainerResourceUsage(aContainer, 1024L, 8F);
|
||||||
|
verifyPublishedResourceUsageMetrics(timelineClient, 1024L, 8);
|
||||||
|
timelineClient.reset();
|
||||||
|
|
||||||
|
publisher.reportContainerResourceUsage(aContainer, 1024L, 0.8F);
|
||||||
|
verifyPublishedResourceUsageMetrics(timelineClient, 1024L, 1);
|
||||||
|
timelineClient.reset();
|
||||||
|
|
||||||
|
publisher.reportContainerResourceUsage(aContainer, 1024L, 0.49F);
|
||||||
|
verifyPublishedResourceUsageMetrics(timelineClient, 1024L, 0);
|
||||||
|
timelineClient.reset();
|
||||||
|
|
||||||
|
publisher.reportContainerResourceUsage(aContainer, 1024L,
|
||||||
|
(float) ResourceCalculatorProcessTree.UNAVAILABLE);
|
||||||
|
verifyPublishedResourceUsageMetrics(timelineClient, 1024L,
|
||||||
|
ResourceCalculatorProcessTree.UNAVAILABLE);
|
||||||
|
publisher.stop();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void verifyPublishedResourceUsageMetrics(
|
||||||
|
DummyTimelineClient timelineClient, long memoryUsage, int cpuUsage) {
|
||||||
|
TimelineEntity[] entities = null;
|
||||||
|
for (int i = 0; i < 10; i++) {
|
||||||
|
entities = timelineClient.getLastPublishedEntities();
|
||||||
|
if (entities != null) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
Thread.sleep(150L);
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
Thread.currentThread().interrupt();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
int numberOfResourceMetrics = 0;
|
||||||
|
numberOfResourceMetrics +=
|
||||||
|
(memoryUsage == ResourceCalculatorProcessTree.UNAVAILABLE) ? 0 : 1;
|
||||||
|
numberOfResourceMetrics +=
|
||||||
|
(cpuUsage == ResourceCalculatorProcessTree.UNAVAILABLE) ? 0 : 1;
|
||||||
|
assertNotNull("entities are expected to be published", entities);
|
||||||
|
assertEquals("Expected number of metrics notpublished",
|
||||||
|
numberOfResourceMetrics, entities[0].getMetrics().size());
|
||||||
|
Iterator<TimelineMetric> metrics = entities[0].getMetrics().iterator();
|
||||||
|
while (metrics.hasNext()) {
|
||||||
|
TimelineMetric metric = metrics.next();
|
||||||
|
Iterator<Entry<Long, Number>> entrySet;
|
||||||
|
switch (metric.getId()) {
|
||||||
|
case CPU_ID:
|
||||||
|
if (cpuUsage == ResourceCalculatorProcessTree.UNAVAILABLE) {
|
||||||
|
Assert.fail("Not Expecting CPU Metric to be published");
|
||||||
|
}
|
||||||
|
entrySet = metric.getValues().entrySet().iterator();
|
||||||
|
assertEquals("CPU usage metric not matching", cpuUsage,
|
||||||
|
entrySet.next().getValue());
|
||||||
|
break;
|
||||||
|
case MEMORY_ID:
|
||||||
|
if (memoryUsage == ResourceCalculatorProcessTree.UNAVAILABLE) {
|
||||||
|
Assert.fail("Not Expecting Memory Metric to be published");
|
||||||
|
}
|
||||||
|
entrySet = metric.getValues().entrySet().iterator();
|
||||||
|
assertEquals("Memory usage metric not matching", memoryUsage,
|
||||||
|
entrySet.next().getValue());
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
Assert.fail("Invalid Resource Usage metric");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
protected static class DummyTimelineClient extends TimelineClientImpl {
|
||||||
|
private TimelineEntity[] lastPublishedEntities;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void putEntities(TimelineEntity... entities)
|
||||||
|
throws IOException, YarnException {
|
||||||
|
this.lastPublishedEntities = entities;
|
||||||
|
}
|
||||||
|
|
||||||
|
public TimelineEntity[] getLastPublishedEntities() {
|
||||||
|
return lastPublishedEntities;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void reset() {
|
||||||
|
lastPublishedEntities = null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue