YARN-6735. Have a way to turn off container metrics from NMs. Contributed by Abhishek Modi.

This commit is contained in:
Rohith Sharma K S 2019-02-05 13:47:56 +05:30
parent ba38db4f5b
commit e3ec18b0c4
5 changed files with 169 additions and 129 deletions

View File

@ -1226,6 +1226,16 @@ public static boolean isAclEnabled(Configuration conf) {
public static final String DEFAULT_NM_COLLECTOR_SERVICE_ADDRESS = public static final String DEFAULT_NM_COLLECTOR_SERVICE_ADDRESS =
"0.0.0.0:" + DEFAULT_NM_COLLECTOR_SERVICE_PORT; "0.0.0.0:" + DEFAULT_NM_COLLECTOR_SERVICE_PORT;
/**
* The setting that controls whether YARN container events (and container
* resource usage metrics) are published to the timeline service by the NM.
* This configuration setting is for ATS V2.
*/
public static final String NM_PUBLISH_CONTAINER_EVENTS_ENABLED = NM_PREFIX
+ "emit-container-events";
public static final boolean DEFAULT_NM_PUBLISH_CONTAINER_EVENTS_ENABLED =
true;
/** Interval in between cache cleanups.*/ /** Interval in between cache cleanups.*/
public static final String NM_LOCALIZER_CACHE_CLEANUP_INTERVAL_MS = public static final String NM_LOCALIZER_CACHE_CLEANUP_INTERVAL_MS =
NM_PREFIX + "localizer.cache.cleanup.interval-ms"; NM_PREFIX + "localizer.cache.cleanup.interval-ms";

View File

@ -1190,6 +1190,14 @@
<value>${yarn.nodemanager.hostname}:8048</value> <value>${yarn.nodemanager.hostname}:8048</value>
</property> </property>
<property>
<description>The setting that controls whether YARN container events (and
container resource usage metrics) are published to the timeline service by
the NM. This configuration setting is for ATS V2.</description>
<name>yarn.nodemanager.emit-container-events</name>
<value>true</value>
</property>
<property> <property>
<description>Interval in between cache cleanups.</description> <description>Interval in between cache cleanups.</description>
<name>yarn.nodemanager.localizer.cache.cleanup.interval-ms</name> <name>yarn.nodemanager.localizer.cache.cleanup.interval-ms</name>

View File

@ -24,6 +24,7 @@
import java.util.Map; import java.util.Map;
import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentHashMap;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.webapp.util.WebAppUtils; import org.apache.hadoop.yarn.webapp.util.WebAppUtils;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -88,6 +89,8 @@ public class NMTimelinePublisher extends CompositeService {
private final Map<ApplicationId, TimelineV2Client> appToClientMap; private final Map<ApplicationId, TimelineV2Client> appToClientMap;
private boolean publishNMContainerEvents = true;
public NMTimelinePublisher(Context context) { public NMTimelinePublisher(Context context) {
super(NMTimelinePublisher.class.getName()); super(NMTimelinePublisher.class.getName());
this.context = context; this.context = context;
@ -110,6 +113,10 @@ protected void serviceInit(Configuration conf) throws Exception {
if (webAppURLWithoutScheme.contains(":")) { if (webAppURLWithoutScheme.contains(":")) {
httpPort = webAppURLWithoutScheme.split(":")[1]; httpPort = webAppURLWithoutScheme.split(":")[1];
} }
publishNMContainerEvents = conf.getBoolean(
YarnConfiguration.NM_PUBLISH_CONTAINER_EVENTS_ENABLED,
YarnConfiguration.DEFAULT_NM_PUBLISH_CONTAINER_EVENTS_ENABLED);
super.serviceInit(conf); super.serviceInit(conf);
} }
@ -155,8 +162,10 @@ protected void handleNMTimelineEvent(NMTimelineEvent event) {
public void reportContainerResourceUsage(Container container, Long pmemUsage, public void reportContainerResourceUsage(Container container, Long pmemUsage,
Float cpuUsagePercentPerCore) { Float cpuUsagePercentPerCore) {
if (pmemUsage != ResourceCalculatorProcessTree.UNAVAILABLE || if (publishNMContainerEvents) {
cpuUsagePercentPerCore != ResourceCalculatorProcessTree.UNAVAILABLE) { if (pmemUsage != ResourceCalculatorProcessTree.UNAVAILABLE
|| cpuUsagePercentPerCore !=
ResourceCalculatorProcessTree.UNAVAILABLE) {
ContainerEntity entity = ContainerEntity entity =
createContainerEntity(container.getContainerId()); createContainerEntity(container.getContainerId());
long currentTimeMillis = System.currentTimeMillis(); long currentTimeMillis = System.currentTimeMillis();
@ -167,7 +176,8 @@ public void reportContainerResourceUsage(Container container, Long pmemUsage,
memoryMetric.addValue(currentTimeMillis, pmemUsage); memoryMetric.addValue(currentTimeMillis, pmemUsage);
entity.addMetric(memoryMetric); entity.addMetric(memoryMetric);
} }
if (cpuUsagePercentPerCore != ResourceCalculatorProcessTree.UNAVAILABLE) { if (cpuUsagePercentPerCore !=
ResourceCalculatorProcessTree.UNAVAILABLE) {
TimelineMetric cpuMetric = new TimelineMetric(); TimelineMetric cpuMetric = new TimelineMetric();
cpuMetric.setId(ContainerMetric.CPU.toString()); cpuMetric.setId(ContainerMetric.CPU.toString());
// TODO: support average // TODO: support average
@ -178,31 +188,35 @@ public void reportContainerResourceUsage(Container container, Long pmemUsage,
} }
entity.setIdPrefix(TimelineServiceHelper. entity.setIdPrefix(TimelineServiceHelper.
invertLong(container.getContainerStartTime())); invertLong(container.getContainerStartTime()));
ApplicationId appId = container.getContainerId().getApplicationAttemptId() ApplicationId appId = container.getContainerId().
.getApplicationId(); getApplicationAttemptId().getApplicationId();
try { try {
// no need to put it as part of publisher as timeline client already has // no need to put it as part of publisher as timeline client
// Queuing concept // already has Queuing concept
TimelineV2Client timelineClient = getTimelineClient(appId); TimelineV2Client timelineClient = getTimelineClient(appId);
if (timelineClient != null) { if (timelineClient != null) {
timelineClient.putEntitiesAsync(entity); timelineClient.putEntitiesAsync(entity);
} else { } else {
LOG.error("Seems like client has been removed before the container" LOG.error("Seems like client has been removed before the container"
+ " metric could be published for " + container.getContainerId()); + " metric could be published for " +
container.getContainerId());
} }
} catch (IOException e) { } catch (IOException e) {
LOG.error("Failed to publish Container metrics for container " LOG.error(
+ container.getContainerId()); "Failed to publish Container metrics for container " +
container.getContainerId());
if (LOG.isDebugEnabled()) { if (LOG.isDebugEnabled()) {
LOG.debug("Failed to publish Container metrics for container " LOG.debug("Failed to publish Container metrics for container " +
+ container.getContainerId(), e); container.getContainerId(), e);
} }
} catch (YarnException e) { } catch (YarnException e) {
LOG.error("Failed to publish Container metrics for container " LOG.error(
+ container.getContainerId(), e.getMessage()); "Failed to publish Container metrics for container " +
container.getContainerId(), e.getMessage());
if (LOG.isDebugEnabled()) { if (LOG.isDebugEnabled()) {
LOG.debug("Failed to publish Container metrics for container " LOG.debug("Failed to publish Container metrics for container " +
+ container.getContainerId(), e); container.getContainerId(), e);
}
} }
} }
} }
@ -210,6 +224,7 @@ public void reportContainerResourceUsage(Container container, Long pmemUsage,
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
private void publishContainerCreatedEvent(ContainerEvent event) { private void publishContainerCreatedEvent(ContainerEvent event) {
if (publishNMContainerEvents) {
ContainerId containerId = event.getContainerID(); ContainerId containerId = event.getContainerID();
ContainerEntity entity = createContainerEntity(containerId); ContainerEntity entity = createContainerEntity(containerId);
Container container = context.getContainers().get(containerId); Container container = context.getContainers().get(containerId);
@ -242,10 +257,12 @@ private void publishContainerCreatedEvent(ContainerEvent event) {
dispatcher.getEventHandler().handle(new TimelinePublishEvent(entity, dispatcher.getEventHandler().handle(new TimelinePublishEvent(entity,
containerId.getApplicationAttemptId().getApplicationId())); containerId.getApplicationAttemptId().getApplicationId()));
} }
}
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
private void publishContainerFinishedEvent(ContainerStatus containerStatus, private void publishContainerFinishedEvent(ContainerStatus containerStatus,
long containerFinishTime, long containerStartTime) { long containerFinishTime, long containerStartTime) {
if (publishNMContainerEvents) {
ContainerId containerId = containerStatus.getContainerId(); ContainerId containerId = containerStatus.getContainerId();
TimelineEntity entity = createContainerEntity(containerId); TimelineEntity entity = createContainerEntity(containerId);
@ -269,9 +286,11 @@ private void publishContainerFinishedEvent(ContainerStatus containerStatus,
dispatcher.getEventHandler().handle(new TimelinePublishEvent(entity, dispatcher.getEventHandler().handle(new TimelinePublishEvent(entity,
containerId.getApplicationAttemptId().getApplicationId())); containerId.getApplicationAttemptId().getApplicationId()));
} }
}
private void publishContainerLocalizationEvent( private void publishContainerLocalizationEvent(
ContainerLocalizationEvent event, String eventType) { ContainerLocalizationEvent event, String eventType) {
if (publishNMContainerEvents) {
Container container = event.getContainer(); Container container = event.getContainer();
ContainerId containerId = container.getContainerId(); ContainerId containerId = container.getContainerId();
TimelineEntity entity = createContainerEntity(containerId); TimelineEntity entity = createContainerEntity(containerId);
@ -283,8 +302,8 @@ private void publishContainerLocalizationEvent(
entity.setIdPrefix(TimelineServiceHelper. entity.setIdPrefix(TimelineServiceHelper.
invertLong(container.getContainerStartTime())); invertLong(container.getContainerStartTime()));
ApplicationId appId = ApplicationId appId = container.getContainerId().
container.getContainerId().getApplicationAttemptId().getApplicationId(); getApplicationAttemptId().getApplicationId();
try { try {
// no need to put it as part of publisher as timeline client already has // no need to put it as part of publisher as timeline client already has
// Queuing concept // Queuing concept
@ -292,8 +311,8 @@ private void publishContainerLocalizationEvent(
if (timelineClient != null) { if (timelineClient != null) {
timelineClient.putEntitiesAsync(entity); timelineClient.putEntitiesAsync(entity);
} else { } else {
LOG.error("Seems like client has been removed before the event could be" LOG.error("Seems like client has been removed before the event"
+ " published for " + container.getContainerId()); + " could be published for " + container.getContainerId());
} }
} catch (IOException e) { } catch (IOException e) {
LOG.error("Failed to publish Container metrics for container " LOG.error("Failed to publish Container metrics for container "
@ -311,6 +330,7 @@ private void publishContainerLocalizationEvent(
} }
} }
} }
}
private static ContainerEntity createContainerEntity( private static ContainerEntity createContainerEntity(
ContainerId containerId) { ContainerId containerId) {

View File

@ -67,6 +67,8 @@ public class TestNMTimelinePublisher {
conf.setFloat(YarnConfiguration.TIMELINE_SERVICE_VERSION, 2.0f); conf.setFloat(YarnConfiguration.TIMELINE_SERVICE_VERSION, 2.0f);
conf.setLong(YarnConfiguration.ATS_APP_COLLECTOR_LINGER_PERIOD_IN_MS, conf.setLong(YarnConfiguration.ATS_APP_COLLECTOR_LINGER_PERIOD_IN_MS,
3000L); 3000L);
conf.setBoolean(YarnConfiguration.NM_PUBLISH_CONTAINER_EVENTS_ENABLED,
true);
timelineClient = new DummyTimelineClient(null); timelineClient = new DummyTimelineClient(null);
Context context = createMockContext(); Context context = createMockContext();
dispatcher = new DrainDispatcher(); dispatcher = new DrainDispatcher();

View File

@ -142,7 +142,7 @@ New configuration parameters that are introduced with v.2 are marked bold.
| **`yarn.timeline-service.timeline-client.number-of-async-entities-to-merge`** | Time line V2 client tries to merge these many number of async entities (if available) and then call the REST ATS V2 API to submit. Defaults to `10`. | | **`yarn.timeline-service.timeline-client.number-of-async-entities-to-merge`** | Time line V2 client tries to merge these many number of async entities (if available) and then call the REST ATS V2 API to submit. Defaults to `10`. |
| **`yarn.timeline-service.hbase.coprocessor.app-final-value-retention-milliseconds`** | The setting that controls how long the final value of a metric of a completed app is retained before merging into the flow sum. Defaults to `259200000` (3 days). This should be set in the HBase cluster. | | **`yarn.timeline-service.hbase.coprocessor.app-final-value-retention-milliseconds`** | The setting that controls how long the final value of a metric of a completed app is retained before merging into the flow sum. Defaults to `259200000` (3 days). This should be set in the HBase cluster. |
| **`yarn.rm.system-metrics-publisher.emit-container-events`** | The setting that controls whether yarn container metrics is published to the timeline server or not by RM. This configuration setting is for ATS V2. Defaults to `false`. | | **`yarn.rm.system-metrics-publisher.emit-container-events`** | The setting that controls whether yarn container metrics is published to the timeline server or not by RM. This configuration setting is for ATS V2. Defaults to `false`. |
| **`yarn.nodemanager.emit-container-events`** | The setting that controls whether YARN container events and container resource usage metrics are published to the timeline service by the NM. This configuration setting is for ATS V2. Defaults to `true`. |
#### Security Configuration #### Security Configuration