YARN-3792. Test case failures in TestDistributedShell and some issue fixes related to ATSV2 (Naganarasimha G R via sjlee)

(cherry picked from commit 84f37f1c7eefec6d139cbf091c50d6c06f734323)
Sangjin Lee 2015-06-22 20:47:56 -07:00
parent 92d90c3a24
commit 22e7ae5771
10 changed files with 77 additions and 53 deletions


@@ -497,7 +497,7 @@ public class Client {
       }
       if (cliParser.hasOption("flow_run_id")) {
         try {
-          flowRunId = Long.valueOf(cliParser.getOptionValue("flow_run_id"));
+          flowRunId = Long.parseLong(cliParser.getOptionValue("flow_run_id"));
         } catch (NumberFormatException e) {
           throw new IllegalArgumentException(
               "Flow run is not a valid long value", e);


@@ -60,6 +60,7 @@ import org.apache.hadoop.yarn.api.records.ApplicationId;
 import org.apache.hadoop.yarn.api.records.ApplicationReport;
 import org.apache.hadoop.yarn.api.records.ContainerState;
 import org.apache.hadoop.yarn.api.records.ContainerStatus;
+import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
 import org.apache.hadoop.yarn.api.records.YarnApplicationState;
 import org.apache.hadoop.yarn.api.records.timeline.TimelineDomain;
 import org.apache.hadoop.yarn.api.records.timeline.TimelineEntities;
@@ -129,7 +130,6 @@ public class TestDistributedShell {
   private void setupInternal(int numNodeManager, float timelineVersion)
       throws Exception {
     LOG.info("Starting up YARN cluster");
     conf = new YarnConfiguration();
@@ -140,7 +140,6 @@ public class TestDistributedShell {
     boolean enableATSServer = true;
     // disable aux-service based timeline aggregators
     conf.set(YarnConfiguration.NM_AUX_SERVICES, "");
     conf.set(YarnConfiguration.NM_VMEM_PMEM_RATIO, "8");
     conf.set(YarnConfiguration.RM_SCHEDULER, CapacityScheduler.class.getName());
     conf.setBoolean(YarnConfiguration.NODE_LABELS_ENABLED, true);
@@ -155,7 +154,9 @@ public class TestDistributedShell {
     conf.setBoolean(YarnConfiguration.NM_VMEM_CHECK_ENABLED, true);
     conf.setBoolean(YarnConfiguration.YARN_MINICLUSTER_CONTROL_RESOURCE_MONITORING,
         true);
-    conf.setBoolean(YarnConfiguration.SYSTEM_METRICS_PUBLISHER_ENABLED, true);
+    conf.setBoolean(YarnConfiguration.RM_SYSTEM_METRICS_PUBLISHER_ENABLED,
+        true);
+    conf.setBoolean(YarnConfiguration.SYSTEM_METRICS_PUBLISHER_ENABLED, false);

     // ATS version specific settings
     if (timelineVersion == 1.0f) {
@@ -180,6 +181,9 @@ public class TestDistributedShell {
       conf.set(YarnConfiguration.NM_AUX_SERVICES, TIMELINE_AUX_SERVICE_NAME);
       conf.set(YarnConfiguration.NM_AUX_SERVICES + "." + TIMELINE_AUX_SERVICE_NAME
           + ".class", PerNodeTimelineCollectorsAuxService.class.getName());
+      conf.setBoolean(YarnConfiguration.SYSTEM_METRICS_PUBLISHER_ENABLED, true);
+      conf.setBoolean(YarnConfiguration.RM_SYSTEM_METRICS_PUBLISHER_ENABLED,
+          false);
     } else {
       Assert.fail("Wrong timeline version number: " + timelineVersion);
     }
@@ -187,7 +191,7 @@ public class TestDistributedShell {
     if (yarnCluster == null) {
       yarnCluster =
           new MiniYARNCluster(TestDistributedShell.class.getSimpleName(), 1,
-              numNodeManager, 1, 1, enableATSServer);
+              numNodeManager, 1, 1);
       yarnCluster.init(conf);
       yarnCluster.start();
@@ -390,13 +394,15 @@ public class TestDistributedShell {
       if (checkHostname(appReport.getHost()) && appReport.getRpcPort() == -1) {
         verified = true;
       }
-      if (appReport.getYarnApplicationState() == YarnApplicationState.FINISHED) {
+      if (appReport.getYarnApplicationState() == YarnApplicationState.FINISHED
+          && appReport.getFinalApplicationStatus() != FinalApplicationStatus.UNDEFINED) {
         break;
       }
     }
     Assert.assertTrue(errorMessage, verified);
     t.join();
-    LOG.info("Client run completed. Result=" + result);
+    LOG.info("Client run completed for testDSShell. Result=" + result);
     Assert.assertTrue(result.get());

     if (timelineVersionWatcher.getTimelineVersion() == 1.5f) {
@ -477,9 +483,9 @@ public class TestDistributedShell {
} }
} }
private void checkTimelineV2( private void checkTimelineV2(boolean haveDomain, ApplicationId appId,
boolean haveDomain, ApplicationId appId, boolean defaultFlow) boolean defaultFlow) throws Exception {
throws Exception { LOG.info("Started checkTimelineV2 ");
// For PoC check in /tmp/timeline_service_data YARN-3264 // For PoC check in /tmp/timeline_service_data YARN-3264
String tmpRoot = String tmpRoot =
FileSystemTimelineWriterImpl.DEFAULT_TIMELINE_SERVICE_STORAGE_DIR_ROOT FileSystemTimelineWriterImpl.DEFAULT_TIMELINE_SERVICE_STORAGE_DIR_ROOT
@@ -530,12 +536,29 @@ public class TestDistributedShell {
       verifyEntityTypeFileExists(basePath,
           TimelineEntityType.YARN_APPLICATION.toString(),
           appMetricsTimestampFileName);
-      verifyStringExistsSpecifiedTimes(appEntityFile,
-          ApplicationMetricsConstants.CREATED_EVENT_TYPE, 1,
-          "Application created event should be published atleast once");
-      verifyStringExistsSpecifiedTimes(appEntityFile,
-          ApplicationMetricsConstants.FINISHED_EVENT_TYPE, 1,
-          "Application finished event should be published atleast once");
+      Assert.assertEquals(
+          "Application created event should be published atleast once",
+          1,
+          getNumOfStringOccurences(appEntityFile,
+              ApplicationMetricsConstants.CREATED_EVENT_TYPE));
+
+      // to avoid race condition of testcase, atleast check 4 times with sleep
+      // of 500ms
+      long numOfStringOccurences = 0;
+      for (int i = 0; i < 4; i++) {
+        numOfStringOccurences =
+            getNumOfStringOccurences(appEntityFile,
+                ApplicationMetricsConstants.FINISHED_EVENT_TYPE);
+        if (numOfStringOccurences > 0) {
+          break;
+        } else {
+          Thread.sleep(500l);
+        }
+      }
+      Assert.assertEquals(
+          "Application finished event should be published atleast once",
+          1,
+          numOfStringOccurences);

       // Verify RM posting AppAttempt life cycle Events are getting published
       String appAttemptMetricsTimestampFileName =
@@ -546,12 +569,17 @@ public class TestDistributedShell {
       verifyEntityTypeFileExists(basePath,
           TimelineEntityType.YARN_APPLICATION_ATTEMPT.toString(),
           appAttemptMetricsTimestampFileName);
-      verifyStringExistsSpecifiedTimes(appAttemptEntityFile,
-          AppAttemptMetricsConstants.REGISTERED_EVENT_TYPE, 1,
-          "AppAttempt register event should be published atleast once");
-      verifyStringExistsSpecifiedTimes(appAttemptEntityFile,
-          AppAttemptMetricsConstants.FINISHED_EVENT_TYPE, 1,
-          "AppAttempt finished event should be published atleast once");
+      Assert.assertEquals(
+          "AppAttempt register event should be published atleast once",
+          1,
+          getNumOfStringOccurences(appAttemptEntityFile,
+              AppAttemptMetricsConstants.REGISTERED_EVENT_TYPE));
+
+      Assert.assertEquals(
+          "AppAttempt finished event should be published atleast once",
+          1,
+          getNumOfStringOccurences(appAttemptEntityFile,
+              AppAttemptMetricsConstants.FINISHED_EVENT_TYPE));
     } finally {
       FileUtils.deleteDirectory(tmpRootFolder.getParentFile());
     }
@@ -570,8 +598,7 @@ public class TestDistributedShell {
     return entityFile;
   }

-  private void verifyStringExistsSpecifiedTimes(File entityFile,
-      String searchString, long expectedNumOfTimes, String errorMsg)
+  private long getNumOfStringOccurences(File entityFile, String searchString)
       throws IOException {
     BufferedReader reader = null;
     String strLine;
@@ -585,7 +612,7 @@ public class TestDistributedShell {
     } finally {
       reader.close();
     }
-    Assert.assertEquals(errorMsg, expectedNumOfTimes, actualCount);
+    return actualCount;
   }

   /**
@@ -1261,4 +1288,3 @@ public class TestDistributedShell {
     return numOfWords;
   }
 }
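
The reworked assertions above account for the FINISHED event being published asynchronously: instead of a single-shot check, the test re-reads the entity file up to four times with a 500 ms sleep before asserting. A self-contained sketch of that poll-until-found pattern; the class and helper names here are hypothetical, not the test's own code.

    import java.io.File;
    import java.io.IOException;

    // Illustrative only: poll for an asynchronously written event a bounded
    // number of times before giving up, mirroring the retry loop added above.
    final class EventPoller {
      static long pollForOccurrences(File entityFile, String searchString,
          int attempts, long sleepMillis) throws IOException, InterruptedException {
        long count = 0;
        for (int i = 0; i < attempts; i++) {
          count = countOccurrences(entityFile, searchString);
          if (count > 0) {
            break;
          }
          Thread.sleep(sleepMillis);
        }
        return count;
      }

      // Hypothetical stand-in for the test's getNumOfStringOccurences helper.
      private static long countOccurrences(File f, String s) throws IOException {
        long n = 0;
        try (java.io.BufferedReader r =
            new java.io.BufferedReader(new java.io.FileReader(f))) {
          String line;
          while ((line = r.readLine()) != null) {
            if (line.contains(s)) {
              n++;
            }
          }
        }
        return n;
      }
    }

Bounding the retries keeps a genuinely missing event from hanging the test, while the sleep gives the publisher time to flush its write.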


@@ -30,7 +30,9 @@ import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
 import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager;
 import org.junit.Assert;
 import org.junit.Before;
+import org.junit.Rule;
 import org.junit.Test;
+import org.junit.rules.TestName;

 import com.google.common.collect.ImmutableMap;


@@ -418,6 +418,14 @@ public class TimelineClientImpl extends TimelineClient {
     // timelineServiceAddress could haven't be initialized yet
     // or stale (only for new timeline service)
     int retries = pollTimelineServiceAddress(this.maxServiceRetries);
+    if (timelineServiceAddress == null) {
+      String errMessage = "TimelineClient has reached to max retry times : "
+          + this.maxServiceRetries
+          + ", but failed to fetch timeline service address. Please verify"
+          + " Timeline Auxillary Service is configured in all the NMs";
+      LOG.error(errMessage);
+      throw new YarnException(errMessage);
+    }
     // timelineServiceAddress could be stale, add retry logic here.
     boolean needRetry = true;
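
The TimelineClientImpl hunk above turns a silent failure into an explicit one: if address polling exhausts its retries, the client raises a descriptive YarnException rather than carrying a null address into the HTTP call. A sketch of the same fail-fast guard as a hypothetical standalone helper, not the client's actual fields or methods:

    import org.apache.commons.logging.Log;
    import org.apache.commons.logging.LogFactory;
    import org.apache.hadoop.yarn.exceptions.YarnException;

    // Illustrative only: fail fast once address discovery has exhausted its
    // retries, instead of letting a null address propagate further.
    final class AddressGuard {
      private static final Log LOG = LogFactory.getLog(AddressGuard.class);

      static void ensureResolved(String serviceAddress, int maxRetries)
          throws YarnException {
        if (serviceAddress == null) {
          String msg = "Reached max retry times (" + maxRetries + ") but failed"
              + " to fetch the timeline service address; verify the timeline"
              + " auxiliary service is configured on all NodeManagers";
          LOG.error(msg);
          throw new YarnException(msg);
        }
      }
    }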


@@ -130,11 +130,11 @@ public class ApplicationImpl implements Application {
         context, -1);
     Configuration conf = context.getConf();
     if (YarnConfiguration.systemMetricsPublisherEnabled(conf)) {
-      createAndStartTimelienClient(conf);
+      createAndStartTimelineClient(conf);
     }
   }

-  private void createAndStartTimelienClient(Configuration conf) {
+  private void createAndStartTimelineClient(Configuration conf) {
     // create and start timeline client
     this.timelineClient = TimelineClient.createTimelineClient(appId);
     timelineClient.init(conf);


@@ -96,10 +96,7 @@ public class ContainersMonitorImpl extends AbstractService implements
   // For posting entities in new timeline service in a non-blocking way
   // TODO replace with event loop in TimelineClient.
-  private static ExecutorService threadPool =
-      Executors.newCachedThreadPool(
-          new ThreadFactoryBuilder().setNameFormat("TimelineService #%d")
-              .build());
+  private static ExecutorService threadPool;

   @Private
   public static enum ContainerMetric {
@@ -225,6 +222,10 @@ public class ContainersMonitorImpl extends AbstractService implements
     if (publishContainerMetricsToTimelineService) {
       LOG.info("NodeManager has been configured to publish container " +
           "metrics to Timeline Service V2.");
+      threadPool =
+          Executors.newCachedThreadPool(
+              new ThreadFactoryBuilder().setNameFormat("TimelineService #%d")
+                  .build());
     } else {
       LOG.warn("NodeManager has not been configured to publish container " +
           "metrics to Timeline Service V2.");
@@ -280,6 +281,9 @@ public class ContainersMonitorImpl extends AbstractService implements
   // TODO remove threadPool after adding non-blocking call in TimelineClient
   private static void shutdownAndAwaitTermination() {
+    if (threadPool == null) {
+      return;
+    }
     threadPool.shutdown();
     try {
       if (!threadPool.awaitTermination(60, TimeUnit.SECONDS)) {
@@ -689,7 +693,6 @@ public class ContainersMonitorImpl extends AbstractService implements
           timelineClient.putEntities(entity);
         } catch (IOException|YarnException e) {
           LOG.error("putEntityNonBlocking get failed: " + e);
-          throw new RuntimeException(e.toString());
         }
       }
     };
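
The ContainersMonitorImpl hunks move thread-pool creation out of the static initializer and into the branch that actually enables container-metrics publishing, make shutdown a no-op when the pool was never created, and stop rethrowing a failed putEntities so one bad write cannot kill the monitoring thread. A compact sketch of the lazy-create and null-safe-shutdown half of that change, as a standalone class with a plain thread factory, not the NodeManager code itself:

    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;
    import java.util.concurrent.TimeUnit;

    // Illustrative only: create the pool lazily, and make shutdown a no-op when
    // publishing was never enabled, mirroring the guarded shutdown added above.
    final class LazyPublisherPool {
      private static ExecutorService threadPool;

      static synchronized void startIfEnabled(boolean publishEnabled) {
        if (publishEnabled && threadPool == null) {
          threadPool = Executors.newCachedThreadPool();
        }
      }

      static synchronized void shutdownAndAwaitTermination() {
        if (threadPool == null) {
          return; // nothing was ever started
        }
        threadPool.shutdown();
        try {
          if (!threadPool.awaitTermination(60, TimeUnit.SECONDS)) {
            threadPool.shutdownNow();
          }
        } catch (InterruptedException e) {
          threadPool.shutdownNow();
          Thread.currentThread().interrupt();
        }
      }
    }

Creating the pool only when publishing is enabled avoids spinning up "TimelineService" threads on NodeManagers that never talk to Timeline Service v2.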


@@ -56,7 +56,7 @@ public class RMTimelineCollectorManager extends TimelineCollectorManager {
       if (parts.length != 2 || parts[1].isEmpty()) {
         continue;
       }
-      switch (parts[0]) {
+      switch (parts[0].toUpperCase()) {
       case TimelineUtils.FLOW_NAME_TAG_PREFIX:
         collector.getTimelineEntityContext().setFlowName(parts[1]);
         break;
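
Switching on parts[0].toUpperCase() makes the flow-tag prefix match case-insensitive, so the comparison no longer depends on the case in which the application tag was submitted. A hedged sketch of that parsing idea; the prefix constants and the context map below are placeholders, not the real TimelineUtils fields or the collector context:

    import java.util.Map;

    // Illustrative only: parse "PREFIX:value" application tags with a
    // case-insensitive prefix match.
    final class FlowTagParser {
      static final String FLOW_NAME_PREFIX = "TIMELINE_FLOW_NAME_TAG";   // placeholder
      static final String FLOW_RUN_ID_PREFIX = "TIMELINE_FLOW_RUN_ID_TAG"; // placeholder

      static void apply(String tag, Map<String, String> context) {
        String[] parts = tag.split(":", 2);
        if (parts.length != 2 || parts[1].isEmpty()) {
          return;
        }
        switch (parts[0].toUpperCase()) { // case-insensitive prefix match
        case FLOW_NAME_PREFIX:
          context.put("flowName", parts[1]);
          break;
        case FLOW_RUN_ID_PREFIX:
          context.put("flowRunId", parts[1]);
          break;
        default:
          break;
        }
      }
    }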


@@ -48,21 +48,11 @@ import org.apache.hadoop.yarn.webapp.util.WebAppUtils;
 import com.google.common.annotations.VisibleForTesting;

-/**
- *
- * It is a singleton, and instances should be obtained via
- * {@link #getInstance()}.
- *
- */
 @Private
 @Unstable
 public class NodeTimelineCollectorManager extends TimelineCollectorManager {
   private static final Log LOG =
       LogFactory.getLog(NodeTimelineCollectorManager.class);

-  private static final NodeTimelineCollectorManager INSTANCE =
-      new NodeTimelineCollectorManager();
-
   // REST server for this collector manager
   private HttpServer2 timelineRestServer;
@@ -73,10 +63,6 @@ public class NodeTimelineCollectorManager extends TimelineCollectorManager {
   static final String COLLECTOR_MANAGER_ATTR_KEY = "collector.manager";

-  static NodeTimelineCollectorManager getInstance() {
-    return INSTANCE;
-  }
-
   @VisibleForTesting
   protected NodeTimelineCollectorManager() {
     super(NodeTimelineCollectorManager.class.getName());


@@ -56,8 +56,7 @@ public class PerNodeTimelineCollectorsAuxService extends AuxiliaryService {
   private final NodeTimelineCollectorManager collectorManager;

   public PerNodeTimelineCollectorsAuxService() {
-    // use the same singleton
-    this(NodeTimelineCollectorManager.getInstance());
+    this(new NodeTimelineCollectorManager());
   }

   @VisibleForTesting PerNodeTimelineCollectorsAuxService(
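
Together with the NodeTimelineCollectorManager hunks, this removes the JVM-wide singleton: each aux service now constructs and owns its own collector manager, which keeps collectors in the same JVM (for example, multiple MiniYARNCluster NodeManagers in tests) from sharing state. A simplified sketch of the resulting wiring, with stand-in class names rather than the real ones:

    // Illustrative only: the aux service owns its collector manager instance
    // instead of reaching for a process-wide singleton.
    class CollectorManager {          // stand-in for NodeTimelineCollectorManager
    }

    class PerNodeCollectorsAuxService { // stand-in for the real aux service
      private final CollectorManager collectorManager;

      PerNodeCollectorsAuxService() {
        this(new CollectorManager()); // fresh instance per service, no singleton
      }

      PerNodeCollectorsAuxService(CollectorManager manager) {
        this.collectorManager = manager; // injectable for tests
      }
    }

Keeping the manager injectable through the second constructor is what lets tests substitute their own instance, as the @VisibleForTesting constructor above does.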


@@ -128,7 +128,7 @@ public abstract class TimelineCollectorManager extends AbstractService {
       postRemove(appId, collector);
       // stop the service to do clean up
       collector.stop();
-      LOG.info("the collector service for " + appId + " was removed");
+      LOG.info("The collector service for " + appId + " was removed");
     }
     return collector != null;
   }