YARN-3995. Some of the NM events are not getting published due to a race condition when the AM container finishes in NM (Naganarasimha G R via sjlee)
This commit is contained in:
parent
829cceebc0
commit
cc16683cef
|
@ -1981,6 +1981,11 @@ public class YarnConfiguration extends Configuration {
|
|||
public static final int
|
||||
DEFAULT_TIMELINE_SERVICE_WRITER_FLUSH_INTERVAL_SECONDS = 60;
|
||||
|
||||
public static final String ATS_APP_COLLECTOR_LINGER_PERIOD_IN_MS =
|
||||
TIMELINE_SERVICE_PREFIX + "app-collector.linger-period.ms";
|
||||
|
||||
public static final int DEFAULT_ATS_APP_COLLECTOR_LINGER_PERIOD_IN_MS = 1000;
|
||||
|
||||
// mark app-history related configs @Private as application history is going
|
||||
// to be integrated into the timeline service
|
||||
@Private
|
||||
|
|
|
@ -2213,6 +2213,13 @@
|
|||
<value>60</value>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<description>Time period till which the application collector will be alive
|
||||
in NM, after the application master container finishes.</description>
|
||||
<name>yarn.timeline-service.app-collector.linger-period.ms</name>
|
||||
<value>1000</value>
|
||||
</property>
|
||||
|
||||
<!-- Shared Cache Configuration -->
|
||||
|
||||
<property>
|
||||
|
|
|
@ -19,6 +19,9 @@
|
|||
package org.apache.hadoop.yarn.server.timelineservice.collector;
|
||||
|
||||
import java.nio.ByteBuffer;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.ScheduledExecutorService;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
@ -54,6 +57,8 @@ public class PerNodeTimelineCollectorsAuxService extends AuxiliaryService {
|
|||
private static final int SHUTDOWN_HOOK_PRIORITY = 30;
|
||||
|
||||
private final NodeTimelineCollectorManager collectorManager;
|
||||
private long collectorLingerPeriod;
|
||||
private ScheduledExecutorService scheduler;
|
||||
|
||||
public PerNodeTimelineCollectorsAuxService() {
|
||||
this(new NodeTimelineCollectorManager());
|
||||
|
@ -70,6 +75,10 @@ public class PerNodeTimelineCollectorsAuxService extends AuxiliaryService {
|
|||
if (!YarnConfiguration.timelineServiceV2Enabled(conf)) {
|
||||
throw new YarnException("Timeline service v2 is not enabled");
|
||||
}
|
||||
collectorLingerPeriod =
|
||||
conf.getLong(YarnConfiguration.ATS_APP_COLLECTOR_LINGER_PERIOD_IN_MS,
|
||||
YarnConfiguration.DEFAULT_ATS_APP_COLLECTOR_LINGER_PERIOD_IN_MS);
|
||||
scheduler = Executors.newSingleThreadScheduledExecutor();
|
||||
collectorManager.init(conf);
|
||||
super.serviceInit(conf);
|
||||
}
|
||||
|
@ -82,6 +91,12 @@ public class PerNodeTimelineCollectorsAuxService extends AuxiliaryService {
|
|||
|
||||
@Override
|
||||
protected void serviceStop() throws Exception {
|
||||
scheduler.shutdown();
|
||||
if (!scheduler.awaitTermination(collectorLingerPeriod,
|
||||
TimeUnit.MILLISECONDS)) {
|
||||
LOG.warn(
|
||||
"Scheduler terminated before removing the application collectors");
|
||||
}
|
||||
collectorManager.stop();
|
||||
super.serviceStop();
|
||||
}
|
||||
|
@ -141,17 +156,11 @@ public class PerNodeTimelineCollectorsAuxService extends AuxiliaryService {
|
|||
if (context.getContainerType() == ContainerType.APPLICATION_MASTER) {
|
||||
final ApplicationId appId =
|
||||
context.getContainerId().getApplicationAttemptId().getApplicationId();
|
||||
new Thread(new Runnable() {
|
||||
scheduler.schedule(new Runnable() {
|
||||
public void run() {
|
||||
try {
|
||||
// TODO Temporary Fix until solution for YARN-3995 is finalized.
|
||||
Thread.sleep(1000l);
|
||||
} catch (InterruptedException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
removeApplication(appId);
|
||||
}
|
||||
}).start();
|
||||
}, collectorLingerPeriod, TimeUnit.MILLISECONDS);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -22,12 +22,14 @@ import static org.junit.Assert.assertEquals;
|
|||
import static org.junit.Assert.assertFalse;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
import static org.junit.Assert.fail;
|
||||
import static org.mockito.Mockito.any;
|
||||
import static org.mockito.Matchers.any;
|
||||
import static org.mockito.Mockito.doReturn;
|
||||
import static org.mockito.Mockito.mock;
|
||||
import static org.mockito.Mockito.spy;
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.util.ExitUtil;
|
||||
import org.apache.hadoop.util.Shell;
|
||||
|
@ -45,8 +47,6 @@ import org.apache.hadoop.yarn.server.api.protocolrecords.GetTimelineCollectorCon
|
|||
import org.junit.After;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
public class TestPerNodeTimelineCollectorsAuxService {
|
||||
private ApplicationAttemptId appAttemptId;
|
||||
private PerNodeTimelineCollectorsAuxService auxService;
|
||||
|
@ -103,8 +103,9 @@ public class TestPerNodeTimelineCollectorsAuxService {
|
|||
when(context.getContainerType()).thenReturn(
|
||||
ContainerType.APPLICATION_MASTER);
|
||||
auxService.stopContainer(context);
|
||||
|
||||
// TODO Temporary Fix until solution for YARN-3995 is finalized
|
||||
// auxService should have the app's collector and need to remove only after
|
||||
// a configured period
|
||||
assertTrue(auxService.hasApplication(appAttemptId.getApplicationId()));
|
||||
for (int i = 0; i < 4; i++) {
|
||||
Thread.sleep(500l);
|
||||
if (!auxService.hasApplication(appAttemptId.getApplicationId())) {
|
||||
|
|
Loading…
Reference in New Issue