From cc16683cefe2611cf4de7819496aa54854f5394c Mon Sep 17 00:00:00 2001 From: Sangjin Lee Date: Mon, 11 Jan 2016 10:09:34 -0800 Subject: [PATCH] YARN-3995. Some of the NM events are not getting published due race condition when AM container finishes in NM (Naganarasimha G R via sjlee) --- .../hadoop/yarn/conf/YarnConfiguration.java | 5 ++++ .../src/main/resources/yarn-default.xml | 7 ++++++ .../PerNodeTimelineCollectorsAuxService.java | 25 +++++++++++++------ ...stPerNodeTimelineCollectorsAuxService.java | 11 ++++---- 4 files changed, 35 insertions(+), 13 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index c43c8733e19..616e7c3901d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -1981,6 +1981,11 @@ public class YarnConfiguration extends Configuration { public static final int DEFAULT_TIMELINE_SERVICE_WRITER_FLUSH_INTERVAL_SECONDS = 60; + public static final String ATS_APP_COLLECTOR_LINGER_PERIOD_IN_MS = + TIMELINE_SERVICE_PREFIX + "app-collector.linger-period.ms"; + + public static final int DEFAULT_ATS_APP_COLLECTOR_LINGER_PERIOD_IN_MS = 1000; + // mark app-history related configs @Private as application history is going // to be integrated into the timeline service @Private diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index a33c131c918..2d3de01571c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -2213,6 +2213,13 @@ 60 + + Time period till which the application collector will be alive + in NM, after the application master container finishes. + yarn.timeline-service.app-collector.linger-period.ms + 1000 + + diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice/src/main/java/org/apache/hadoop/yarn/server/timelineservice/collector/PerNodeTimelineCollectorsAuxService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice/src/main/java/org/apache/hadoop/yarn/server/timelineservice/collector/PerNodeTimelineCollectorsAuxService.java index 0319e34a5e7..b73853003b2 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice/src/main/java/org/apache/hadoop/yarn/server/timelineservice/collector/PerNodeTimelineCollectorsAuxService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice/src/main/java/org/apache/hadoop/yarn/server/timelineservice/collector/PerNodeTimelineCollectorsAuxService.java @@ -19,6 +19,9 @@ package org.apache.hadoop.yarn.server.timelineservice.collector; import java.nio.ByteBuffer; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -54,6 +57,8 @@ public class PerNodeTimelineCollectorsAuxService extends AuxiliaryService { private static final int SHUTDOWN_HOOK_PRIORITY = 30; private final NodeTimelineCollectorManager collectorManager; + private long collectorLingerPeriod; + private ScheduledExecutorService scheduler; public PerNodeTimelineCollectorsAuxService() { this(new NodeTimelineCollectorManager()); @@ -70,6 +75,10 @@ public class PerNodeTimelineCollectorsAuxService extends AuxiliaryService { if (!YarnConfiguration.timelineServiceV2Enabled(conf)) { throw new YarnException("Timeline service v2 is not enabled"); } + collectorLingerPeriod = + conf.getLong(YarnConfiguration.ATS_APP_COLLECTOR_LINGER_PERIOD_IN_MS, + YarnConfiguration.DEFAULT_ATS_APP_COLLECTOR_LINGER_PERIOD_IN_MS); + scheduler = Executors.newSingleThreadScheduledExecutor(); collectorManager.init(conf); super.serviceInit(conf); } @@ -82,6 +91,12 @@ public class PerNodeTimelineCollectorsAuxService extends AuxiliaryService { @Override protected void serviceStop() throws Exception { + scheduler.shutdown(); + if (!scheduler.awaitTermination(collectorLingerPeriod, + TimeUnit.MILLISECONDS)) { + LOG.warn( + "Scheduler terminated before removing the application collectors"); + } collectorManager.stop(); super.serviceStop(); } @@ -141,17 +156,11 @@ public class PerNodeTimelineCollectorsAuxService extends AuxiliaryService { if (context.getContainerType() == ContainerType.APPLICATION_MASTER) { final ApplicationId appId = context.getContainerId().getApplicationAttemptId().getApplicationId(); - new Thread(new Runnable() { + scheduler.schedule(new Runnable() { public void run() { - try { - // TODO Temporary Fix until solution for YARN-3995 is finalized. - Thread.sleep(1000l); - } catch (InterruptedException e) { - e.printStackTrace(); - } removeApplication(appId); } - }).start(); + }, collectorLingerPeriod, TimeUnit.MILLISECONDS); } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice/src/test/java/org/apache/hadoop/yarn/server/timelineservice/collector/TestPerNodeTimelineCollectorsAuxService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice/src/test/java/org/apache/hadoop/yarn/server/timelineservice/collector/TestPerNodeTimelineCollectorsAuxService.java index 4fdf47e062b..f2775d5657c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice/src/test/java/org/apache/hadoop/yarn/server/timelineservice/collector/TestPerNodeTimelineCollectorsAuxService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice/src/test/java/org/apache/hadoop/yarn/server/timelineservice/collector/TestPerNodeTimelineCollectorsAuxService.java @@ -22,12 +22,14 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; -import static org.mockito.Mockito.any; +import static org.mockito.Matchers.any; import static org.mockito.Mockito.doReturn; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.spy; import static org.mockito.Mockito.when; +import java.io.IOException; + import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.util.ExitUtil; import org.apache.hadoop.util.Shell; @@ -45,8 +47,6 @@ import org.apache.hadoop.yarn.server.api.protocolrecords.GetTimelineCollectorCon import org.junit.After; import org.junit.Test; -import java.io.IOException; - public class TestPerNodeTimelineCollectorsAuxService { private ApplicationAttemptId appAttemptId; private PerNodeTimelineCollectorsAuxService auxService; @@ -103,8 +103,9 @@ public class TestPerNodeTimelineCollectorsAuxService { when(context.getContainerType()).thenReturn( ContainerType.APPLICATION_MASTER); auxService.stopContainer(context); - - // TODO Temporary Fix until solution for YARN-3995 is finalized + // auxService should have the app's collector and need to remove only after + // a configured period + assertTrue(auxService.hasApplication(appAttemptId.getApplicationId())); for (int i = 0; i < 4; i++) { Thread.sleep(500l); if (!auxService.hasApplication(appAttemptId.getApplicationId())) {