YARN-4711. NM is going down with NPE's due to single thread processing of events by Timeline client (Naganarasimha G R via sjlee)

This commit is contained in:
Sangjin Lee 2016-03-28 15:50:03 -07:00
parent 6f6cc647d6
commit 84c35ac6c4
12 changed files with 279 additions and 179 deletions

View File

@ -117,7 +117,14 @@
<!-- Object cast is based on the event type --> <!-- Object cast is based on the event type -->
<Match> <Match>
<Class name="org.apache.hadoop.yarn.server.nodemanager.timelineservice.NMTimelinePublisher$ApplicationEventHandler" /> <Class name="org.apache.hadoop.yarn.server.nodemanager.timelineservice.NMTimelinePublisher" />
<Method name="publishApplicationEvent" />
<Bug pattern="BC_UNCONFIRMED_CAST" />
</Match>
<Match>
<Class name="org.apache.hadoop.yarn.server.nodemanager.timelineservice.NMTimelinePublisher" />
<Method name="publishLocalizationEvent" />
<Bug pattern="BC_UNCONFIRMED_CAST" /> <Bug pattern="BC_UNCONFIRMED_CAST" />
</Match> </Match>

View File

@ -17,15 +17,6 @@
*/ */
package org.apache.hadoop.yarn.api.records.timelineservice; package org.apache.hadoop.yarn.api.records.timelineservice;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.yarn.util.TimelineServiceHelper;
import org.codehaus.jackson.annotate.JsonSetter;
import javax.xml.bind.annotation.XmlAccessType;
import javax.xml.bind.annotation.XmlAccessorType;
import javax.xml.bind.annotation.XmlElement;
import javax.xml.bind.annotation.XmlRootElement;
import java.util.HashMap; import java.util.HashMap;
import java.util.HashSet; import java.util.HashSet;
import java.util.Map; import java.util.Map;
@ -33,6 +24,16 @@ import java.util.NavigableSet;
import java.util.Set; import java.util.Set;
import java.util.TreeSet; import java.util.TreeSet;
import javax.xml.bind.annotation.XmlAccessType;
import javax.xml.bind.annotation.XmlAccessorType;
import javax.xml.bind.annotation.XmlElement;
import javax.xml.bind.annotation.XmlRootElement;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.yarn.util.TimelineServiceHelper;
import org.codehaus.jackson.annotate.JsonSetter;
/** /**
* The basic timeline entity data structure for timeline service v2. Timeline * The basic timeline entity data structure for timeline service v2. Timeline
* entity objects are not thread safe and should not be accessed concurrently. * entity objects are not thread safe and should not be accessed concurrently.
@ -564,6 +565,10 @@ public class TimelineEntity implements Comparable<TimelineEntity> {
} }
public String toString() { public String toString() {
if (real == null) {
return identifier.toString(); return identifier.toString();
} else {
return real.toString();
}
} }
} }

View File

@ -429,9 +429,8 @@ public class TimelineClientImpl extends TimelineClient {
URI uri = constructResURI(getConfig(), timelineServiceAddress, true); URI uri = constructResURI(getConfig(), timelineServiceAddress, true);
putObjects(uri, path, params, obj); putObjects(uri, path, params, obj);
needRetry = false; needRetry = false;
} catch (Exception e) { } catch (IOException e) {
// TODO only handle exception for timelineServiceAddress being updated. // handle exception for timelineServiceAddress being updated.
// skip retry for other exceptions.
checkRetryWithSleep(retries, e); checkRetryWithSleep(retries, e);
retries--; retries--;
} }
@ -458,29 +457,27 @@ public class TimelineClientImpl extends TimelineClient {
* @param retries * @param retries
* @param e * @param e
*/ */
private void checkRetryWithSleep(int retries, Exception e) throws private void checkRetryWithSleep(int retries, IOException e)
YarnException, IOException { throws YarnException, IOException {
if (retries > 0) { if (retries > 0) {
try { try {
Thread.sleep(this.serviceRetryInterval); Thread.sleep(this.serviceRetryInterval);
} catch (InterruptedException ex) { } catch (InterruptedException ex) {
Thread.currentThread().interrupt(); Thread.currentThread().interrupt();
throw new YarnException("Interrupted while retrying to connect to ATS");
} }
} else { } else {
LOG.error("TimelineClient has reached to max retry times :" + StringBuilder msg =
this.maxServiceRetries + " for service address: " + new StringBuilder("TimelineClient has reached to max retry times : ");
timelineServiceAddress); msg.append(this.maxServiceRetries);
if (e instanceof YarnException) { msg.append(" for service address: ");
throw (YarnException)e; msg.append(timelineServiceAddress);
} else if (e instanceof IOException) { LOG.error(msg.toString());
throw (IOException)e; throw new IOException(msg.toString(), e);
} else {
throw new YarnException(e);
}
} }
} }
private void putObjects( protected void putObjects(
URI base, String path, MultivaluedMap<String, String> params, Object obj) URI base, String path, MultivaluedMap<String, String> params, Object obj)
throws IOException, YarnException { throws IOException, YarnException {
ClientResponse resp; ClientResponse resp;
@ -636,17 +633,19 @@ public class TimelineClientImpl extends TimelineClient {
/** /**
* Poll TimelineServiceAddress for maximum of retries times if it is null. * Poll TimelineServiceAddress for maximum of retries times if it is null.
*
* @param retries * @param retries
* @return the left retry times * @return the left retry times
* @throws IOException
*/ */
private int pollTimelineServiceAddress(int retries) { private int pollTimelineServiceAddress(int retries) throws YarnException {
while (timelineServiceAddress == null && retries > 0) { while (timelineServiceAddress == null && retries > 0) {
try { try {
Thread.sleep(this.serviceRetryInterval); Thread.sleep(this.serviceRetryInterval);
} catch (InterruptedException e) { } catch (InterruptedException e) {
Thread.currentThread().interrupt(); Thread.currentThread().interrupt();
throw new YarnException("Interrupted while trying to connect ATS");
} }
// timelineServiceAddress = getTimelineServiceAddress();
retries--; retries--;
} }
return retries; return retries;

View File

@ -19,6 +19,7 @@
package org.apache.hadoop.yarn.client.api.impl; package org.apache.hadoop.yarn.client.api.impl;
import java.io.IOException; import java.io.IOException;
import java.net.URI;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
@ -34,22 +35,32 @@ import org.apache.hadoop.yarn.exceptions.YarnException;
import org.junit.After; import org.junit.After;
import org.junit.Assert; import org.junit.Assert;
import org.junit.Before; import org.junit.Before;
import org.junit.Rule;
import org.junit.Test; import org.junit.Test;
import org.junit.rules.TestName;
public class TestTimelineClientV2Impl { public class TestTimelineClientV2Impl {
private static final Log LOG = private static final Log LOG =
LogFactory.getLog(TestTimelineClientV2Impl.class); LogFactory.getLog(TestTimelineClientV2Impl.class);
private TestV2TimelineClient client; private TestV2TimelineClient client;
private static long TIME_TO_SLEEP = 150; private static long TIME_TO_SLEEP = 150;
private static final String EXCEPTION_MSG = "Exception in the content";
@Before @Before
public void setup() { public void setup() {
YarnConfiguration conf = new YarnConfiguration(); conf = new YarnConfiguration();
conf.setBoolean(YarnConfiguration.TIMELINE_SERVICE_ENABLED, true); conf.setBoolean(YarnConfiguration.TIMELINE_SERVICE_ENABLED, true);
conf.setFloat(YarnConfiguration.TIMELINE_SERVICE_VERSION, 1.0f); conf.setFloat(YarnConfiguration.TIMELINE_SERVICE_VERSION, 1.0f);
conf.setInt(YarnConfiguration.NUMBER_OF_ASYNC_ENTITIES_TO_MERGE, 3); conf.setInt(YarnConfiguration.NUMBER_OF_ASYNC_ENTITIES_TO_MERGE, 3);
if (!currTestName.getMethodName()
.contains("testRetryOnConnectionFailure")) {
client = createTimelineClient(conf); client = createTimelineClient(conf);
} }
}
@Rule
public TestName currTestName = new TestName();
private YarnConfiguration conf;
private TestV2TimelineClient createTimelineClient(YarnConfiguration conf) { private TestV2TimelineClient createTimelineClient(YarnConfiguration conf) {
ApplicationId id = ApplicationId.newInstance(0, 0); ApplicationId id = ApplicationId.newInstance(0, 0);
@ -59,9 +70,34 @@ public class TestTimelineClientV2Impl {
return client; return client;
} }
private class TestV2TimelineClient extends TimelineClientImpl { private class TestV2TimelineClientForExceptionHandling
extends TimelineClientImpl {
public TestV2TimelineClientForExceptionHandling(ApplicationId id) {
super(id);
}
protected boolean throwYarnException;
public void setThrowYarnException(boolean throwYarnException) {
this.throwYarnException = throwYarnException;
}
@Override
protected void putObjects(URI base, String path,
MultivaluedMap<String, String> params, Object obj)
throws IOException, YarnException {
if (throwYarnException) {
throw new YarnException(EXCEPTION_MSG);
} else {
throw new IOException(
"Failed to get the response from the timeline server.");
}
}
}
private class TestV2TimelineClient
extends TestV2TimelineClientForExceptionHandling {
private boolean sleepBeforeReturn; private boolean sleepBeforeReturn;
private boolean throwException;
private List<TimelineEntities> publishedEntities; private List<TimelineEntities> publishedEntities;
@ -75,10 +111,6 @@ public class TestTimelineClientV2Impl {
this.sleepBeforeReturn = sleepBeforeReturn; this.sleepBeforeReturn = sleepBeforeReturn;
} }
public void setThrowException(boolean throwException) {
this.throwException = throwException;
}
public int getNumOfTimelineEntitiesPublished() { public int getNumOfTimelineEntitiesPublished() {
return publishedEntities.size(); return publishedEntities.size();
} }
@ -91,7 +123,7 @@ public class TestTimelineClientV2Impl {
protected void putObjects(String path, protected void putObjects(String path,
MultivaluedMap<String, String> params, Object obj) MultivaluedMap<String, String> params, Object obj)
throws IOException, YarnException { throws IOException, YarnException {
if (throwException) { if (throwYarnException) {
throw new YarnException("ActualException"); throw new YarnException("ActualException");
} }
publishedEntities.add((TimelineEntities) obj); publishedEntities.add((TimelineEntities) obj);
@ -105,6 +137,45 @@ public class TestTimelineClientV2Impl {
} }
} }
@Test
public void testExceptionMultipleRetry() {
TestV2TimelineClientForExceptionHandling client =
new TestV2TimelineClientForExceptionHandling(
ApplicationId.newInstance(0, 0));
int maxRetries = 2;
conf.setInt(YarnConfiguration.TIMELINE_SERVICE_CLIENT_MAX_RETRIES,
maxRetries);
client.init(conf);
client.start();
client.setTimelineServiceAddress("localhost:12345");
try {
client.putEntities(new TimelineEntity());
} catch (IOException e) {
Assert.fail("YARN exception is expected");
} catch (YarnException e) {
Throwable cause = e.getCause();
Assert.assertTrue("IOException is expected",
cause instanceof IOException);
Assert.assertTrue("YARN exception is expected",
cause.getMessage().contains(
"TimelineClient has reached to max retry times : " + maxRetries));
}
client.setThrowYarnException(true);
try {
client.putEntities(new TimelineEntity());
} catch (IOException e) {
Assert.fail("YARN exception is expected");
} catch (YarnException e) {
Throwable cause = e.getCause();
Assert.assertTrue("YARN exception is expected",
cause instanceof YarnException);
Assert.assertTrue("YARN exception is expected",
cause.getMessage().contains(EXCEPTION_MSG));
}
client.stop();
}
@Test @Test
public void testPostEntities() throws Exception { public void testPostEntities() throws Exception {
try { try {
@ -189,7 +260,7 @@ public class TestTimelineClientV2Impl {
@Test @Test
public void testExceptionCalls() throws Exception { public void testExceptionCalls() throws Exception {
client.setThrowException(true); client.setThrowYarnException(true);
try { try {
client.putEntitiesAsync(generateEntity("1")); client.putEntitiesAsync(generateEntity("1"));
} catch (YarnException e) { } catch (YarnException e) {

View File

@ -69,4 +69,12 @@ public class ContainerMetricsConstants {
public static final String ALLOCATED_HOST_HTTP_ADDRESS_ENTITY_INFO = public static final String ALLOCATED_HOST_HTTP_ADDRESS_ENTITY_INFO =
"YARN_CONTAINER_ALLOCATED_HOST_HTTP_ADDRESS"; "YARN_CONTAINER_ALLOCATED_HOST_HTTP_ADDRESS";
// Event of this type will be emitted by NM.
public static final String LOCALIZATION_START_EVENT_TYPE =
"YARN_NM_CONTAINER_LOCALIZATION_STARTED";
// Event of this type will be emitted by NM.
public static final String LOCALIZATION_FINISHED_EVENT_TYPE =
"YARN_NM_CONTAINER_LOCALIZATION_FINISHED";
} }

View File

@ -55,7 +55,6 @@ import org.apache.hadoop.yarn.api.records.NodeId;
import org.apache.hadoop.yarn.api.records.NodeLabel; import org.apache.hadoop.yarn.api.records.NodeLabel;
import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.api.records.ResourceUtilization; import org.apache.hadoop.yarn.api.records.ResourceUtilization;
import org.apache.hadoop.yarn.client.api.TimelineClient;
import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.event.Dispatcher; import org.apache.hadoop.yarn.event.Dispatcher;
import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.exceptions.YarnException;
@ -89,6 +88,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Cont
import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitor; import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitor;
import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
import org.apache.hadoop.yarn.server.nodemanager.nodelabels.NodeLabelsProvider; import org.apache.hadoop.yarn.server.nodemanager.nodelabels.NodeLabelsProvider;
import org.apache.hadoop.yarn.server.nodemanager.timelineservice.NMTimelinePublisher;
import org.apache.hadoop.yarn.server.nodemanager.util.NodeManagerHardwareUtils; import org.apache.hadoop.yarn.server.nodemanager.util.NodeManagerHardwareUtils;
import org.apache.hadoop.yarn.server.utils.BuilderUtils; import org.apache.hadoop.yarn.server.utils.BuilderUtils;
import org.apache.hadoop.yarn.util.resource.Resources; import org.apache.hadoop.yarn.util.resource.Resources;
@ -983,9 +983,11 @@ public class NodeStatusUpdaterImpl extends AbstractService implements
LOG.debug("Sync a new collector address: " + collectorAddr + LOG.debug("Sync a new collector address: " + collectorAddr +
" for application: " + appId + " from RM."); " for application: " + appId + " from RM.");
} }
TimelineClient client = application.getTimelineClient(); NMTimelinePublisher nmTimelinePublisher =
if (client != null) { context.getNMTimelinePublisher();
client.setTimelineServiceAddress(collectorAddr); if (nmTimelinePublisher != null) {
nmTimelinePublisher.setTimelineServiceAddress(
application.getAppId(), collectorAddr);
} }
} }
} }

View File

@ -29,7 +29,6 @@ import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.ipc.Server; import org.apache.hadoop.ipc.Server;
import org.apache.hadoop.service.CompositeService; import org.apache.hadoop.service.CompositeService;
import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.client.api.TimelineClient;
import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.ipc.YarnRPC; import org.apache.hadoop.yarn.ipc.YarnRPC;
@ -42,6 +41,7 @@ import org.apache.hadoop.yarn.server.api.records.AppCollectorsMap;
import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.NodeManager;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
import org.apache.hadoop.yarn.server.nodemanager.timelineservice.NMTimelinePublisher;
/** /**
* Service that handles collector information. It is used only if the timeline * Service that handles collector information. It is used only if the timeline
@ -116,10 +116,10 @@ public class NMCollectorService extends CompositeService implements
String collectorAddr = collector.getCollectorAddr(); String collectorAddr = collector.getCollectorAddr();
newCollectorsMap.put(appId, collectorAddr); newCollectorsMap.put(appId, collectorAddr);
// set registered collector address to TimelineClient. // set registered collector address to TimelineClient.
TimelineClient client = NMTimelinePublisher nmTimelinePublisher =
context.getApplications().get(appId).getTimelineClient(); context.getNMTimelinePublisher();
if (client != null) { if (nmTimelinePublisher != null) {
client.setTimelineServiceAddress(collectorAddr); nmTimelinePublisher.setTimelineServiceAddress(appId, collectorAddr);
} }
} }
((NodeManager.NMContext)context).addRegisteredCollectors( ((NodeManager.NMContext)context).addRegisteredCollectors(

View File

@ -22,7 +22,6 @@ import java.util.Map;
import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.client.api.TimelineClient;
import org.apache.hadoop.yarn.event.EventHandler; import org.apache.hadoop.yarn.event.EventHandler;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
@ -41,7 +40,4 @@ public interface Application extends EventHandler<ApplicationEvent> {
String getFlowVersion(); String getFlowVersion();
long getFlowRunId(); long getFlowRunId();
TimelineClient getTimelineClient();
} }

View File

@ -58,6 +58,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.logaggregation
import org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler.event.LogHandlerAppFinishedEvent; import org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler.event.LogHandlerAppFinishedEvent;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler.event.LogHandlerAppStartedEvent; import org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler.event.LogHandlerAppStartedEvent;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService;
import org.apache.hadoop.yarn.server.nodemanager.timelineservice.NMTimelinePublisher;
import org.apache.hadoop.yarn.server.security.ApplicationACLsManager; import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
import org.apache.hadoop.yarn.state.InvalidStateTransitionException; import org.apache.hadoop.yarn.state.InvalidStateTransitionException;
import org.apache.hadoop.yarn.state.MultipleArcTransition; import org.apache.hadoop.yarn.state.MultipleArcTransition;
@ -83,7 +84,6 @@ public class ApplicationImpl implements Application {
private final ReadLock readLock; private final ReadLock readLock;
private final WriteLock writeLock; private final WriteLock writeLock;
private final Context context; private final Context context;
private TimelineClient timelineClient;
private static final Log LOG = LogFactory.getLog(ApplicationImpl.class); private static final Log LOG = LogFactory.getLog(ApplicationImpl.class);
@ -143,7 +143,7 @@ public class ApplicationImpl implements Application {
} }
this.flowContext = flowContext; this.flowContext = flowContext;
if (YarnConfiguration.systemMetricsPublisherEnabled(conf)) { if (YarnConfiguration.systemMetricsPublisherEnabled(conf)) {
createAndStartTimelineClient(conf); context.getNMTimelinePublisher().createTimelineClient(appId);
} }
} }
} }
@ -175,13 +175,6 @@ public class ApplicationImpl implements Application {
} }
} }
private void createAndStartTimelineClient(Configuration conf) {
// create and start timeline client
this.timelineClient = TimelineClient.createTimelineClient(appId);
timelineClient.init(conf);
timelineClient.start();
}
@Override @Override
public String getUser() { public String getUser() {
return user.toString(); return user.toString();
@ -192,11 +185,6 @@ public class ApplicationImpl implements Application {
return appId; return appId;
} }
@Override
public TimelineClient getTimelineClient() {
return timelineClient;
}
@Override @Override
public ApplicationState getApplicationState() { public ApplicationState getApplicationState() {
this.readLock.lock(); this.readLock.lock();
@ -575,9 +563,10 @@ public class ApplicationImpl implements Application {
registeredCollectors.remove(app.getAppId()); registeredCollectors.remove(app.getAppId());
} }
// stop timelineClient when application get finished. // stop timelineClient when application get finished.
TimelineClient timelineClient = app.getTimelineClient(); NMTimelinePublisher nmTimelinePublisher =
if (timelineClient != null) { app.context.getNMTimelinePublisher();
timelineClient.stop(); if (nmTimelinePublisher != null) {
nmTimelinePublisher.stopTimelineClient(app.getAppId());
} }
} }
} }

View File

@ -18,8 +18,10 @@
package org.apache.hadoop.yarn.server.nodemanager.timelineservice; package org.apache.hadoop.yarn.server.nodemanager.timelineservice;
import java.io.IOException;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
@ -29,7 +31,6 @@ import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerStatus; import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.api.records.NodeId; import org.apache.hadoop.yarn.api.records.NodeId;
import org.apache.hadoop.yarn.api.records.Priority;
import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.api.records.timelineservice.ContainerEntity; import org.apache.hadoop.yarn.api.records.timelineservice.ContainerEntity;
import org.apache.hadoop.yarn.api.records.timelineservice.TimelineEntity; import org.apache.hadoop.yarn.api.records.timelineservice.TimelineEntity;
@ -41,16 +42,15 @@ import org.apache.hadoop.yarn.client.api.TimelineClient;
import org.apache.hadoop.yarn.event.AsyncDispatcher; import org.apache.hadoop.yarn.event.AsyncDispatcher;
import org.apache.hadoop.yarn.event.Dispatcher; import org.apache.hadoop.yarn.event.Dispatcher;
import org.apache.hadoop.yarn.event.EventHandler; import org.apache.hadoop.yarn.event.EventHandler;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.server.metrics.ContainerMetricsConstants; import org.apache.hadoop.yarn.server.metrics.ContainerMetricsConstants;
import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationContainerFinishedEvent; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationContainerFinishedEvent;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationEvent; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationEvent;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationEventType;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEvent; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEvent;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEventType; import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.ContainerLocalizationEvent;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.LocalizationEvent; import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.LocalizationEvent;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.LocalizationEventType;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitorImpl.ContainerMetric; import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitorImpl.ContainerMetric;
import org.apache.hadoop.yarn.util.ResourceCalculatorProcessTree; import org.apache.hadoop.yarn.util.ResourceCalculatorProcessTree;
import org.apache.hadoop.yarn.util.timeline.TimelineUtils; import org.apache.hadoop.yarn.util.timeline.TimelineUtils;
@ -72,9 +72,12 @@ public class NMTimelinePublisher extends CompositeService {
private String httpAddress; private String httpAddress;
protected final Map<ApplicationId, TimelineClient> appToClientMap;
public NMTimelinePublisher(Context context) { public NMTimelinePublisher(Context context) {
super(NMTimelinePublisher.class.getName()); super(NMTimelinePublisher.class.getName());
this.context = context; this.context = context;
appToClientMap = new ConcurrentHashMap<>();
} }
@Override @Override
@ -82,12 +85,6 @@ public class NMTimelinePublisher extends CompositeService {
dispatcher = new AsyncDispatcher(); dispatcher = new AsyncDispatcher();
dispatcher.register(NMTimelineEventType.class, dispatcher.register(NMTimelineEventType.class,
new ForwardingEventHandler()); new ForwardingEventHandler());
dispatcher
.register(ContainerEventType.class, new ContainerEventHandler());
dispatcher.register(ApplicationEventType.class,
new ApplicationEventHandler());
dispatcher.register(LocalizationEventType.class,
new LocalizationEventDispatcher());
addIfService(dispatcher); addIfService(dispatcher);
super.serviceInit(conf); super.serviceInit(conf);
} }
@ -112,7 +109,6 @@ public class NMTimelinePublisher extends CompositeService {
} }
} }
@SuppressWarnings("unchecked")
public void reportContainerResourceUsage(Container container, Long pmemUsage, public void reportContainerResourceUsage(Container container, Long pmemUsage,
Float cpuUsagePercentPerCore) { Float cpuUsagePercentPerCore) {
if (pmemUsage != ResourceCalculatorProcessTree.UNAVAILABLE || if (pmemUsage != ResourceCalculatorProcessTree.UNAVAILABLE ||
@ -133,15 +129,32 @@ public class NMTimelinePublisher extends CompositeService {
Math.round(cpuUsagePercentPerCore)); Math.round(cpuUsagePercentPerCore));
entity.addMetric(cpuMetric); entity.addMetric(cpuMetric);
} }
dispatcher.getEventHandler() ApplicationId appId = container.getContainerId().getApplicationAttemptId()
.handle(new TimelinePublishEvent(entity, container.getContainerId() .getApplicationId();
.getApplicationAttemptId().getApplicationId())); try {
// no need to put it as part of publisher as timeline client already has
// Queuing concept
TimelineClient timelineClient = getTimelineClient(appId);
if (timelineClient != null) {
timelineClient.putEntitiesAsync(entity);
} else {
LOG.error("Seems like client has been removed before the container"
+ " metric could be published for " + container.getContainerId());
}
} catch (IOException | YarnException e) {
LOG.error("Failed to publish Container metrics for container "
+ container.getContainerId(), e);
}
} }
} }
private void publishContainerCreatedEvent(ContainerEntity entity, @SuppressWarnings("unchecked")
ContainerId containerId, Resource resource, Priority priority, private void publishContainerCreatedEvent(ContainerEvent event) {
long timestamp) { ContainerId containerId = event.getContainerID();
ContainerEntity entity = createContainerEntity(containerId);
Container container = context.getContainers().get(containerId);
Resource resource = container.getResource();
Map<String, Object> entityInfo = new HashMap<String, Object>(); Map<String, Object> entityInfo = new HashMap<String, Object>();
entityInfo.put(ContainerMetricsConstants.ALLOCATED_MEMORY_ENTITY_INFO, entityInfo.put(ContainerMetricsConstants.ALLOCATED_MEMORY_ENTITY_INFO,
resource.getMemory()); resource.getMemory());
@ -152,7 +165,7 @@ public class NMTimelinePublisher extends CompositeService {
entityInfo.put(ContainerMetricsConstants.ALLOCATED_PORT_ENTITY_INFO, entityInfo.put(ContainerMetricsConstants.ALLOCATED_PORT_ENTITY_INFO,
nodeId.getPort()); nodeId.getPort());
entityInfo.put(ContainerMetricsConstants.ALLOCATED_PRIORITY_ENTITY_INFO, entityInfo.put(ContainerMetricsConstants.ALLOCATED_PRIORITY_ENTITY_INFO,
priority.toString()); container.getPriority().toString());
entityInfo.put( entityInfo.put(
ContainerMetricsConstants.ALLOCATED_HOST_HTTP_ADDRESS_ENTITY_INFO, ContainerMetricsConstants.ALLOCATED_HOST_HTTP_ADDRESS_ENTITY_INFO,
httpAddress); httpAddress);
@ -160,13 +173,15 @@ public class NMTimelinePublisher extends CompositeService {
TimelineEvent tEvent = new TimelineEvent(); TimelineEvent tEvent = new TimelineEvent();
tEvent.setId(ContainerMetricsConstants.CREATED_EVENT_TYPE); tEvent.setId(ContainerMetricsConstants.CREATED_EVENT_TYPE);
tEvent.setTimestamp(timestamp); tEvent.setTimestamp(event.getTimestamp());
entity.addEvent(tEvent); entity.addEvent(tEvent);
entity.setCreatedTime(timestamp); entity.setCreatedTime(event.getTimestamp());
putEntity(entity, containerId.getApplicationAttemptId().getApplicationId()); dispatcher.getEventHandler().handle(new TimelinePublishEvent(entity,
containerId.getApplicationAttemptId().getApplicationId()));
} }
@SuppressWarnings("unchecked")
private void publishContainerFinishedEvent(ContainerStatus containerStatus, private void publishContainerFinishedEvent(ContainerStatus containerStatus,
long timeStamp) { long timeStamp) {
ContainerId containerId = containerStatus.getContainerId(); ContainerId containerId = containerStatus.getContainerId();
@ -186,7 +201,38 @@ public class NMTimelinePublisher extends CompositeService {
tEvent.setInfo(eventInfo); tEvent.setInfo(eventInfo);
entity.addEvent(tEvent); entity.addEvent(tEvent);
putEntity(entity, containerId.getApplicationAttemptId().getApplicationId());
dispatcher.getEventHandler().handle(new TimelinePublishEvent(entity,
containerId.getApplicationAttemptId().getApplicationId()));
}
private void publishContainerLocalizationEvent(
ContainerLocalizationEvent event, String eventType) {
Container container = event.getContainer();
ContainerId containerId = container.getContainerId();
TimelineEntity entity = createContainerEntity(containerId);
TimelineEvent tEvent = new TimelineEvent();
tEvent.setId(eventType);
tEvent.setTimestamp(event.getTimestamp());
entity.addEvent(tEvent);
ApplicationId appId =
container.getContainerId().getApplicationAttemptId().getApplicationId();
try {
// no need to put it as part of publisher as timeline client already has
// Queuing concept
TimelineClient timelineClient = getTimelineClient(appId);
if (timelineClient != null) {
timelineClient.putEntitiesAsync(entity);
} else {
LOG.error("Seems like client has been removed before the event could be"
+ " published for " + container.getContainerId());
}
} catch (IOException | YarnException e) {
LOG.error("Failed to publish Container metrics for container "
+ container.getContainerId(), e);
}
} }
private static ContainerEntity createContainerEntity( private static ContainerEntity createContainerEntity(
@ -207,23 +253,33 @@ public class NMTimelinePublisher extends CompositeService {
LOG.debug("Publishing the entity " + entity + ", JSON-style content: " LOG.debug("Publishing the entity " + entity + ", JSON-style content: "
+ TimelineUtils.dumpTimelineRecordtoJSON(entity)); + TimelineUtils.dumpTimelineRecordtoJSON(entity));
} }
TimelineClient timelineClient = TimelineClient timelineClient = getTimelineClient(appId);
context.getApplications().get(appId).getTimelineClient(); if (timelineClient != null) {
timelineClient.putEntities(entity); timelineClient.putEntities(entity);
} else {
LOG.error("Seems like client has been removed before the entity "
+ "could be published for " + entity);
}
} catch (Exception e) { } catch (Exception e) {
LOG.error("Error when publishing entity " + entity, e); LOG.error("Error when publishing entity " + entity, e);
} }
} }
@SuppressWarnings("unchecked")
public void publishApplicationEvent(ApplicationEvent event) { public void publishApplicationEvent(ApplicationEvent event) {
// publish only when the desired event is received // publish only when the desired event is received
switch (event.getType()) { switch (event.getType()) {
case INIT_APPLICATION: case INIT_APPLICATION:
case FINISH_APPLICATION: case FINISH_APPLICATION:
case APPLICATION_CONTAINER_FINISHED:
case APPLICATION_LOG_HANDLING_FAILED: case APPLICATION_LOG_HANDLING_FAILED:
dispatcher.getEventHandler().handle(event); // TODO need to be handled in future,
// not sure to publish under which entity
break;
case APPLICATION_CONTAINER_FINISHED:
// this is actually used to publish the container Event
ApplicationContainerFinishedEvent evnt =
(ApplicationContainerFinishedEvent) event;
publishContainerFinishedEvent(evnt.getContainerStatus(),
event.getTimestamp());
break; break;
default: default:
@ -235,12 +291,11 @@ public class NMTimelinePublisher extends CompositeService {
} }
} }
@SuppressWarnings("unchecked")
public void publishContainerEvent(ContainerEvent event) { public void publishContainerEvent(ContainerEvent event) {
// publish only when the desired event is received // publish only when the desired event is received
switch (event.getType()) { switch (event.getType()) {
case INIT_CONTAINER: case INIT_CONTAINER:
dispatcher.getEventHandler().handle(event); publishContainerCreatedEvent(event);
break; break;
default: default:
@ -253,15 +308,17 @@ public class NMTimelinePublisher extends CompositeService {
} }
} }
@SuppressWarnings("unchecked")
public void publishLocalizationEvent(LocalizationEvent event) { public void publishLocalizationEvent(LocalizationEvent event) {
// publish only when the desired event is received // publish only when the desired event is received
switch (event.getType()) { switch (event.getType()) {
case CONTAINER_RESOURCES_LOCALIZED: case CONTAINER_RESOURCES_LOCALIZED:
case INIT_CONTAINER_RESOURCES: publishContainerLocalizationEvent((ContainerLocalizationEvent) event,
dispatcher.getEventHandler().handle(event); ContainerMetricsConstants.LOCALIZATION_FINISHED_EVENT_TYPE);
break;
case INIT_CONTAINER_RESOURCES:
publishContainerLocalizationEvent((ContainerLocalizationEvent) event,
ContainerMetricsConstants.LOCALIZATION_START_EVENT_TYPE);
break; break;
default: default:
if (LOG.isDebugEnabled()) { if (LOG.isDebugEnabled()) {
LOG.debug(event.getType() LOG.debug(event.getType()
@ -272,64 +329,6 @@ public class NMTimelinePublisher extends CompositeService {
} }
} }
/**
 * Handles application events that were forwarded to the dispatcher.
 *
 * <p>Only {@code APPLICATION_CONTAINER_FINISHED} is acted upon; any other
 * type is reported as an error because it was accepted upstream without a
 * matching handler here.
 */
private class ApplicationEventHandler implements
    EventHandler<ApplicationEvent> {
  @Override
  public void handle(ApplicationEvent event) {
    switch (event.getType()) {
    case APPLICATION_CONTAINER_FINISHED:
      // The application-level event carries the status of the finished
      // container, so it is published as a container event.
      publishContainerFinishedEvent(
          ((ApplicationContainerFinishedEvent) event).getContainerStatus(),
          event.getTimestamp());
      return;
    default:
      LOG.error("Seems like event type is captured only in "
          + "publishApplicationEvent method and not handled here");
    }
  }
}
private class ContainerEventHandler implements EventHandler<ContainerEvent> {
@Override
public void handle(ContainerEvent event) {
ContainerId containerId = event.getContainerID();
Container container = context.getContainers().get(containerId);
long timestamp = event.getTimestamp();
ContainerEntity entity = createContainerEntity(containerId);
switch (event.getType()) {
case INIT_CONTAINER:
publishContainerCreatedEvent(entity, containerId,
container.getResource(), container.getPriority(), timestamp);
break;
default:
LOG.error("Seems like event type is captured only in "
+ "publishContainerEvent method and not handled here");
break;
}
}
}
/**
 * Handles localization events that were forwarded to the dispatcher.
 *
 * <p>The two recognised types are currently accepted without doing any work;
 * every other type is reported as an error.
 */
private static final class LocalizationEventDispatcher implements
    EventHandler<LocalizationEvent> {
  @Override
  public void handle(LocalizationEvent event) {
    switch (event.getType()) {
    case CONTAINER_RESOURCES_LOCALIZED:
    case INIT_CONTAINER_RESOURCES:
      // TODO after priority based flush jira is finished
      return;
    default:
      LOG.error("Seems like event type is captured only in "
          + "publishLocalizationEvent method and not handled here");
    }
  }
}
/** /**
* EventHandler implementation which forward events to NMMetricsPublisher. * EventHandler implementation which forward events to NMMetricsPublisher.
* Making use of it, NMMetricsPublisher can avoid to have a public handle * Making use of it, NMMetricsPublisher can avoid to have a public handle
@ -363,4 +362,33 @@ public class NMTimelinePublisher extends CompositeService {
return entityToPublish; return entityToPublish;
} }
} }
/**
 * Creates, initializes and starts a timeline client for the given
 * application and registers it in {@code appToClientMap}. Does nothing when
 * a client is already registered for {@code appId}.
 *
 * @param appId application the client is created for
 */
public void createTimelineClient(ApplicationId appId) {
  // NOTE(review): containsKey followed by put is not a single atomic step;
  // confirm callers never create a client for the same app concurrently.
  if (appToClientMap.containsKey(appId)) {
    return;
  }
  TimelineClient newClient = TimelineClient.createTimelineClient(appId);
  newClient.init(getConfig());
  newClient.start();
  appToClientMap.put(appId, newClient);
}
/**
 * Unregisters the timeline client of the given application and stops it.
 * Does nothing when no client is registered for {@code appId}.
 *
 * @param appId application whose client should be stopped
 */
public void stopTimelineClient(ApplicationId appId) {
  TimelineClient removed = appToClientMap.remove(appId);
  if (removed == null) {
    return;
  }
  removed.stop();
}
/**
 * Passes the collector address on to the timeline client registered for the
 * given application. Does nothing when no client is registered for
 * {@code appId}.
 *
 * @param appId application whose client should be updated
 * @param collectorAddr timeline service address to set on the client
 */
public void setTimelineServiceAddress(ApplicationId appId,
    String collectorAddr) {
  TimelineClient registered = appToClientMap.get(appId);
  if (registered == null) {
    return;
  }
  registered.setTimelineServiceAddress(collectorAddr);
}
/**
 * Looks up the timeline client registered for the given application.
 *
 * @param appId application to look up
 * @return the value stored in {@code appToClientMap} for {@code appId}
 */
private TimelineClient getTimelineClient(ApplicationId appId) {
  TimelineClient registered = appToClientMap.get(appId);
  return registered;
}
} }

View File

@ -20,14 +20,12 @@ package org.apache.hadoop.yarn.server.nodemanager.timelineservice;
import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNotNull;
import static org.mockito.Matchers.any;
import static org.mockito.Mockito.mock; import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when; import static org.mockito.Mockito.when;
import java.io.IOException; import java.io.IOException;
import java.util.Iterator; import java.util.Iterator;
import java.util.Map.Entry; import java.util.Map.Entry;
import java.util.concurrent.ConcurrentMap;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
@ -39,7 +37,6 @@ import org.apache.hadoop.yarn.api.records.timelineservice.TimelineMetric;
import org.apache.hadoop.yarn.client.api.impl.TimelineClientImpl; import org.apache.hadoop.yarn.client.api.impl.TimelineClientImpl;
import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
import org.apache.hadoop.yarn.util.ResourceCalculatorProcessTree; import org.apache.hadoop.yarn.util.ResourceCalculatorProcessTree;
import org.junit.Assert; import org.junit.Assert;
@ -53,20 +50,23 @@ public class TestNMTimelinePublisher {
public void testContainerResourceUsage() { public void testContainerResourceUsage() {
Context context = mock(Context.class); Context context = mock(Context.class);
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
ConcurrentMap<ApplicationId, Application> map = mock(ConcurrentMap.class); final DummyTimelineClient timelineClient = new DummyTimelineClient();
Application aApp = mock(Application.class);
when(map.get(any(ApplicationId.class))).thenReturn(aApp);
DummyTimelineClient timelineClient = new DummyTimelineClient();
when(aApp.getTimelineClient()).thenReturn(timelineClient);
when(context.getApplications()).thenReturn(map);
when(context.getNodeId()).thenReturn(NodeId.newInstance("localhost", 0)); when(context.getNodeId()).thenReturn(NodeId.newInstance("localhost", 0));
when(context.getHttpPort()).thenReturn(0); when(context.getHttpPort()).thenReturn(0);
NMTimelinePublisher publisher = new NMTimelinePublisher(context); NMTimelinePublisher publisher = new NMTimelinePublisher(context) {
public void createTimelineClient(ApplicationId appId) {
if (!appToClientMap.containsKey(appId)) {
appToClientMap.put(appId, timelineClient);
}
}
};
publisher.init(new Configuration()); publisher.init(new Configuration());
publisher.start(); publisher.start();
ApplicationId appId = ApplicationId.newInstance(0, 1);
publisher.createTimelineClient(appId);
Container aContainer = mock(Container.class); Container aContainer = mock(Container.class);
when(aContainer.getContainerId()).thenReturn(ContainerId.newContainerId( when(aContainer.getContainerId()).thenReturn(ContainerId.newContainerId(
ApplicationAttemptId.newInstance(ApplicationId.newInstance(0, 1), 1), ApplicationAttemptId.newInstance(appId, 1),
0L)); 0L));
publisher.reportContainerResourceUsage(aContainer, 1024L, 8F); publisher.reportContainerResourceUsage(aContainer, 1024L, 8F);
verifyPublishedResourceUsageMetrics(timelineClient, 1024L, 8); verifyPublishedResourceUsageMetrics(timelineClient, 1024L, 8);
@ -141,7 +141,7 @@ public class TestNMTimelinePublisher {
private TimelineEntity[] lastPublishedEntities; private TimelineEntity[] lastPublishedEntities;
@Override @Override
public void putEntities(TimelineEntity... entities) public void putEntitiesAsync(TimelineEntity... entities)
throws IOException, YarnException { throws IOException, YarnException {
this.lastPublishedEntities = entities; this.lastPublishedEntities = entities;
} }

View File

@ -101,9 +101,4 @@ public class MockApp implements Application {
public long getFlowRunId() { public long getFlowRunId() {
return flowRunId; return flowRunId;
} }
/** Returns the {@code timelineClient} field of this mock application. */
@Override
public TimelineClient getTimelineClient() {
return timelineClient;
}
} }