YARN-3136. getTransferredContainers can be a bottleneck during AM registration. (Sunil G via jianhe)

This commit is contained in:
Wangda Tan 2015-11-03 12:59:06 -08:00
parent ece1b1645b
commit 04abe7e9e0
4 changed files with 51 additions and 24 deletions

View File

@ -115,6 +115,9 @@ Release 2.7.2 - UNRELEASED
YARN-4321. Incessant retries if NoAuthException is thrown by Zookeeper in non
HA mode. (Varun Saxena via jianhe)
YARN-3136. getTransferredContainers can be a bottleneck during AM registration.
(Sunil G via jianhe)
Release 2.7.1 - 2015-07-06
INCOMPATIBLE CHANGES

View File

@ -462,4 +462,12 @@
<Bug pattern="RCN_REDUNDANT_NULLCHECK_OF_NONNULL_VALUE" />
</Match>
<!--
Exclude the IS2_INCONSISTENT_SYNC warning for the rmContext and
applications fields of AbstractYarnScheduler.
NOTE(review): presumably these fields are accessed from both synchronized
and unsynchronized methods of that class; confirm before removing.
-->
<Match>
<Class name="org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnScheduler" />
<Or>
<Field name="rmContext" />
<Field name="applications" />
</Or>
<Bug pattern="IS2_INCONSISTENT_SYNC" />
</Match>
</FindBugsFilter>

View File

@ -298,32 +298,35 @@ public class ApplicationMasterService extends AbstractService implements
// For work-preserving AM restart, retrieve previous attempts' containers
// and corresponding NM tokens.
List<Container> transferredContainers =
((AbstractYarnScheduler) rScheduler)
if (app.getApplicationSubmissionContext()
.getKeepContainersAcrossApplicationAttempts()) {
List<Container> transferredContainers = ((AbstractYarnScheduler) rScheduler)
.getTransferredContainers(applicationAttemptId);
if (!transferredContainers.isEmpty()) {
response.setContainersFromPreviousAttempts(transferredContainers);
List<NMToken> nmTokens = new ArrayList<NMToken>();
for (Container container : transferredContainers) {
try {
NMToken token = rmContext.getNMTokenSecretManager()
.createAndGetNMToken(app.getUser(), applicationAttemptId,
container);
if (null != token) {
nmTokens.add(token);
}
} catch (IllegalArgumentException e) {
// if it's a DNS issue, throw UnknownHostException directly and that
// will be automatically retried by RMProxy in RPC layer.
if (e.getCause() instanceof UnknownHostException) {
throw (UnknownHostException) e.getCause();
if (!transferredContainers.isEmpty()) {
response.setContainersFromPreviousAttempts(transferredContainers);
List<NMToken> nmTokens = new ArrayList<NMToken>();
for (Container container : transferredContainers) {
try {
NMToken token = rmContext.getNMTokenSecretManager()
.createAndGetNMToken(app.getUser(), applicationAttemptId,
container);
if (null != token) {
nmTokens.add(token);
}
} catch (IllegalArgumentException e) {
// if it's a DNS issue, throw UnknownHostException directly and
// that will be automatically retried by RMProxy in RPC layer.
if (e.getCause() instanceof UnknownHostException) {
throw (UnknownHostException) e.getCause();
}
}
}
response.setNMTokensFromPreviousAttempts(nmTokens);
LOG.info("Application " + appID + " retrieved "
+ transferredContainers.size() + " containers from previous"
+ " attempts and " + nmTokens.size() + " NM tokens.");
}
response.setNMTokensFromPreviousAttempts(nmTokens);
LOG.info("Application " + appID + " retrieved "
+ transferredContainers.size() + " containers from previous"
+ " attempts and " + nmTokens.size() + " NM tokens.");
}
response.setSchedulerResourceTypes(rScheduler

View File

@ -21,12 +21,15 @@ package org.apache.hadoop.yarn.server.resourcemanager.scheduler;
import java.io.IOException;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock.ReadLock;
import java.util.concurrent.locks.ReentrantReadWriteLock.WriteLock;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience.Private;
import org.apache.hadoop.classification.InterfaceStability.Unstable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.service.AbstractService;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
@ -67,6 +70,8 @@ import com.google.common.util.concurrent.SettableFuture;
@SuppressWarnings("unchecked")
@Private
@Unstable
public abstract class AbstractYarnScheduler
<T extends SchedulerApplicationAttempt, N extends SchedulerNode>
extends AbstractService implements ResourceScheduler {
@ -91,7 +96,12 @@ public abstract class AbstractYarnScheduler
private long configuredMaximumAllocationWaitTime;
protected RMContext rmContext;
protected Map<ApplicationId, SchedulerApplication<T>> applications;
/*
* All schedulers that inherit from AbstractYarnScheduler should use a
* concurrent implementation of the 'applications' map.
*/
protected ConcurrentMap<ApplicationId, SchedulerApplication<T>> applications;
protected int nmExpireInterval;
protected final static List<Container> EMPTY_CONTAINER_LIST =
@ -123,7 +133,7 @@ public abstract class AbstractYarnScheduler
super.serviceInit(conf);
}
public synchronized List<Container> getTransferredContainers(
public List<Container> getTransferredContainers(
ApplicationAttemptId currentAttempt) {
ApplicationId appId = currentAttempt.getApplicationId();
SchedulerApplication<T> app = applications.get(appId);
@ -132,6 +142,9 @@ public abstract class AbstractYarnScheduler
if (appImpl.getApplicationSubmissionContext().getUnmanagedAM()) {
return containerList;
}
if (app == null) {
return containerList;
}
Collection<RMContainer> liveContainers =
app.getCurrentAppAttempt().getLiveContainers();
ContainerId amContainerId =