YARN-3136. getTransferredContainers can be a bottleneck during AM registration. (Sunil G via jianhe)
This commit is contained in:
parent
ece1b1645b
commit
04abe7e9e0
|
@ -115,6 +115,9 @@ Release 2.7.2 - UNRELEASED
|
|||
YARN-4321. Incessant retries if NoAuthException is thrown by Zookeeper in non
|
||||
HA mode. (Varun Saxena via jianhe)
|
||||
|
||||
YARN-3136. getTransferredContainers can be a bottleneck during AM registration.
|
||||
(Sunil G via jianhe)
|
||||
|
||||
Release 2.7.1 - 2015-07-06
|
||||
|
||||
INCOMPATIBLE CHANGES
|
||||
|
|
|
@ -462,4 +462,12 @@
|
|||
<Bug pattern="RCN_REDUNDANT_NULLCHECK_OF_NONNULL_VALUE" />
|
||||
</Match>
|
||||
|
||||
<Match>
|
||||
<Class name="org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnScheduler" />
|
||||
<Or>
|
||||
<Field name="rmContext" />
|
||||
<Field name="applications" />
|
||||
</Or>
|
||||
<Bug pattern="IS2_INCONSISTENT_SYNC" />
|
||||
</Match>
|
||||
</FindBugsFilter>
|
||||
|
|
|
@ -298,32 +298,35 @@ public class ApplicationMasterService extends AbstractService implements
|
|||
|
||||
// For work-preserving AM restart, retrieve previous attempts' containers
|
||||
// and corresponding NM tokens.
|
||||
List<Container> transferredContainers =
|
||||
((AbstractYarnScheduler) rScheduler)
|
||||
if (app.getApplicationSubmissionContext()
|
||||
.getKeepContainersAcrossApplicationAttempts()) {
|
||||
List<Container> transferredContainers = ((AbstractYarnScheduler) rScheduler)
|
||||
.getTransferredContainers(applicationAttemptId);
|
||||
if (!transferredContainers.isEmpty()) {
|
||||
response.setContainersFromPreviousAttempts(transferredContainers);
|
||||
List<NMToken> nmTokens = new ArrayList<NMToken>();
|
||||
for (Container container : transferredContainers) {
|
||||
try {
|
||||
NMToken token = rmContext.getNMTokenSecretManager()
|
||||
.createAndGetNMToken(app.getUser(), applicationAttemptId,
|
||||
container);
|
||||
if (null != token) {
|
||||
nmTokens.add(token);
|
||||
}
|
||||
} catch (IllegalArgumentException e) {
|
||||
// if it's a DNS issue, throw UnknownHostException directly and that
|
||||
// will be automatically retried by RMProxy in RPC layer.
|
||||
if (e.getCause() instanceof UnknownHostException) {
|
||||
throw (UnknownHostException) e.getCause();
|
||||
if (!transferredContainers.isEmpty()) {
|
||||
response.setContainersFromPreviousAttempts(transferredContainers);
|
||||
List<NMToken> nmTokens = new ArrayList<NMToken>();
|
||||
for (Container container : transferredContainers) {
|
||||
try {
|
||||
NMToken token = rmContext.getNMTokenSecretManager()
|
||||
.createAndGetNMToken(app.getUser(), applicationAttemptId,
|
||||
container);
|
||||
if (null != token) {
|
||||
nmTokens.add(token);
|
||||
}
|
||||
} catch (IllegalArgumentException e) {
|
||||
// if it's a DNS issue, throw UnknownHostException directly;
|
||||
// that exception
|
||||
// will be automatically retried by RMProxy in the RPC layer.
|
||||
if (e.getCause() instanceof UnknownHostException) {
|
||||
throw (UnknownHostException) e.getCause();
|
||||
}
|
||||
}
|
||||
}
|
||||
response.setNMTokensFromPreviousAttempts(nmTokens);
|
||||
LOG.info("Application " + appID + " retrieved "
|
||||
+ transferredContainers.size() + " containers from previous"
|
||||
+ " attempts and " + nmTokens.size() + " NM tokens.");
|
||||
}
|
||||
response.setNMTokensFromPreviousAttempts(nmTokens);
|
||||
LOG.info("Application " + appID + " retrieved "
|
||||
+ transferredContainers.size() + " containers from previous"
|
||||
+ " attempts and " + nmTokens.size() + " NM tokens.");
|
||||
}
|
||||
|
||||
response.setSchedulerResourceTypes(rScheduler
|
||||
|
|
|
@ -21,12 +21,15 @@ package org.apache.hadoop.yarn.server.resourcemanager.scheduler;
|
|||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.ConcurrentMap;
|
||||
import java.util.concurrent.locks.ReentrantReadWriteLock;
|
||||
import java.util.concurrent.locks.ReentrantReadWriteLock.ReadLock;
|
||||
import java.util.concurrent.locks.ReentrantReadWriteLock.WriteLock;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.classification.InterfaceAudience.Private;
|
||||
import org.apache.hadoop.classification.InterfaceStability.Unstable;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.service.AbstractService;
|
||||
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
|
||||
|
@ -67,6 +70,8 @@ import com.google.common.util.concurrent.SettableFuture;
|
|||
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
@Private
|
||||
@Unstable
|
||||
public abstract class AbstractYarnScheduler
|
||||
<T extends SchedulerApplicationAttempt, N extends SchedulerNode>
|
||||
extends AbstractService implements ResourceScheduler {
|
||||
|
@ -91,7 +96,12 @@ public abstract class AbstractYarnScheduler
|
|||
private long configuredMaximumAllocationWaitTime;
|
||||
|
||||
protected RMContext rmContext;
|
||||
protected Map<ApplicationId, SchedulerApplication<T>> applications;
|
||||
|
||||
/*
|
||||
* All schedulers that inherit from AbstractYarnScheduler should use a
|
||||
* concurrent version of the 'applications' map.
|
||||
*/
|
||||
protected ConcurrentMap<ApplicationId, SchedulerApplication<T>> applications;
|
||||
protected int nmExpireInterval;
|
||||
|
||||
protected final static List<Container> EMPTY_CONTAINER_LIST =
|
||||
|
@ -123,7 +133,7 @@ public abstract class AbstractYarnScheduler
|
|||
super.serviceInit(conf);
|
||||
}
|
||||
|
||||
public synchronized List<Container> getTransferredContainers(
|
||||
public List<Container> getTransferredContainers(
|
||||
ApplicationAttemptId currentAttempt) {
|
||||
ApplicationId appId = currentAttempt.getApplicationId();
|
||||
SchedulerApplication<T> app = applications.get(appId);
|
||||
|
@ -132,6 +142,9 @@ public abstract class AbstractYarnScheduler
|
|||
if (appImpl.getApplicationSubmissionContext().getUnmanagedAM()) {
|
||||
return containerList;
|
||||
}
|
||||
if (app == null) {
|
||||
return containerList;
|
||||
}
|
||||
Collection<RMContainer> liveContainers =
|
||||
app.getCurrentAppAttempt().getLiveContainers();
|
||||
ContainerId amContainerId =
|
||||
|
|
Loading…
Reference in New Issue