YARN-6128. Add support for AMRMProxy HA. (Botong Huang via Subru).
This commit is contained in:
parent
ea8a121423
commit
ed31091361
|
@ -1948,6 +1948,9 @@ public class YarnConfiguration extends Configuration {
|
||||||
public static final String DEFAULT_AMRM_PROXY_INTERCEPTOR_CLASS_PIPELINE =
|
public static final String DEFAULT_AMRM_PROXY_INTERCEPTOR_CLASS_PIPELINE =
|
||||||
"org.apache.hadoop.yarn.server.nodemanager.amrmproxy."
|
"org.apache.hadoop.yarn.server.nodemanager.amrmproxy."
|
||||||
+ "DefaultRequestInterceptor";
|
+ "DefaultRequestInterceptor";
|
||||||
|
public static final String AMRM_PROXY_HA_ENABLED = NM_PREFIX
|
||||||
|
+ "amrmproxy.ha.enable";
|
||||||
|
public static final boolean DEFAULT_AMRM_PROXY_HA_ENABLED = false;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Default platform-agnostic CLASSPATH for YARN applications. A
|
* Default platform-agnostic CLASSPATH for YARN applications. A
|
||||||
|
@ -2790,6 +2793,11 @@ public class YarnConfiguration extends Configuration {
|
||||||
public static final String FEDERATION_CACHE_TIME_TO_LIVE_SECS =
|
public static final String FEDERATION_CACHE_TIME_TO_LIVE_SECS =
|
||||||
FEDERATION_PREFIX + "cache-ttl.secs";
|
FEDERATION_PREFIX + "cache-ttl.secs";
|
||||||
|
|
||||||
|
public static final String FEDERATION_REGISTRY_BASE_KEY =
|
||||||
|
FEDERATION_PREFIX + "registry.base-dir";
|
||||||
|
public static final String DEFAULT_FEDERATION_REGISTRY_BASE_KEY =
|
||||||
|
"yarnfederation/";
|
||||||
|
|
||||||
// 5 minutes
|
// 5 minutes
|
||||||
public static final int DEFAULT_FEDERATION_CACHE_TIME_TO_LIVE_SECS = 5 * 60;
|
public static final int DEFAULT_FEDERATION_CACHE_TIME_TO_LIVE_SECS = 5 * 60;
|
||||||
|
|
||||||
|
@ -2947,6 +2955,11 @@ public class YarnConfiguration extends Configuration {
|
||||||
// Other Configs
|
// Other Configs
|
||||||
////////////////////////////////
|
////////////////////////////////
|
||||||
|
|
||||||
|
public static final String YARN_REGISTRY_CLASS =
|
||||||
|
YARN_PREFIX + "registry.class";
|
||||||
|
public static final String DEFAULT_YARN_REGISTRY_CLASS =
|
||||||
|
"org.apache.hadoop.registry.client.impl.FSRegistryOperationsService";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Use YARN_CLIENT_APPLICATION_CLIENT_PROTOCOL_POLL_INTERVAL_MS instead.
|
* Use YARN_CLIENT_APPLICATION_CLIENT_PROTOCOL_POLL_INTERVAL_MS instead.
|
||||||
* The interval of the yarn client's querying application state after
|
* The interval of the yarn client's querying application state after
|
||||||
|
|
|
@ -2815,7 +2815,20 @@
|
||||||
<value>300</value>
|
<value>300</value>
|
||||||
</property>
|
</property>
|
||||||
|
|
||||||
|
<property>
|
||||||
|
<description>The registry base directory for federation.</description>
|
||||||
|
<name>yarn.federation.registry.base-dir</name>
|
||||||
|
<value>yarnfederation/</value>
|
||||||
|
</property>
|
||||||
|
|
||||||
<!-- Other Configuration -->
|
<!-- Other Configuration -->
|
||||||
|
|
||||||
|
<property>
|
||||||
|
<description>The registry implementation to use.</description>
|
||||||
|
<name>yarn.registry.class</name>
|
||||||
|
<value>org.apache.hadoop.registry.client.impl.FSRegistryOperationsService</value>
|
||||||
|
</property>
|
||||||
|
|
||||||
<property>
|
<property>
|
||||||
<description>The interval that the yarn client library uses to poll the
|
<description>The interval that the yarn client library uses to poll the
|
||||||
completion status of the asynchronous API of application client protocol.
|
completion status of the asynchronous API of application client protocol.
|
||||||
|
@ -2976,6 +2989,14 @@
|
||||||
<value>org.apache.hadoop.yarn.server.nodemanager.amrmproxy.DefaultRequestInterceptor</value>
|
<value>org.apache.hadoop.yarn.server.nodemanager.amrmproxy.DefaultRequestInterceptor</value>
|
||||||
</property>
|
</property>
|
||||||
|
|
||||||
|
<property>
|
||||||
|
<description>
|
||||||
|
Whether AMRMProxy HA is enabled.
|
||||||
|
</description>
|
||||||
|
<name>yarn.nodemanager.amrmproxy.ha.enable</name>
|
||||||
|
<value>false</value>
|
||||||
|
</property>
|
||||||
|
|
||||||
<property>
|
<property>
|
||||||
<description>
|
<description>
|
||||||
Setting that controls whether distributed scheduling is enabled.
|
Setting that controls whether distributed scheduling is enabled.
|
||||||
|
|
|
@ -67,6 +67,11 @@
|
||||||
<scope>test</scope>
|
<scope>test</scope>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.hadoop</groupId>
|
||||||
|
<artifactId>hadoop-yarn-registry</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.google.guava</groupId>
|
<groupId>com.google.guava</groupId>
|
||||||
<artifactId>guava</artifactId>
|
<artifactId>guava</artifactId>
|
||||||
|
|
|
@ -0,0 +1,338 @@
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.server.federation.utils;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.security.PrivilegedAction;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
|
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.io.Text;
|
||||||
|
import org.apache.hadoop.registry.client.api.BindFlags;
|
||||||
|
import org.apache.hadoop.registry.client.api.RegistryOperations;
|
||||||
|
import org.apache.hadoop.registry.client.types.ServiceRecord;
|
||||||
|
import org.apache.hadoop.security.UserGroupInformation;
|
||||||
|
import org.apache.hadoop.security.token.Token;
|
||||||
|
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
||||||
|
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||||
|
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||||
|
import org.apache.hadoop.yarn.security.AMRMTokenIdentifier;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import com.google.common.annotations.VisibleForTesting;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Helper class that handles reads and writes to Yarn Registry to support UAM HA
|
||||||
|
* and second attempt.
|
||||||
|
*/
|
||||||
|
public class FederationRegistryClient {
|
||||||
|
private static final Logger LOG =
|
||||||
|
LoggerFactory.getLogger(FederationRegistryClient.class);
|
||||||
|
|
||||||
|
private RegistryOperations registry;
|
||||||
|
|
||||||
|
private UserGroupInformation user;
|
||||||
|
|
||||||
|
// AppId -> SubClusterId -> UAM token
|
||||||
|
private Map<ApplicationId, Map<String, Token<AMRMTokenIdentifier>>>
|
||||||
|
appSubClusterTokenMap;
|
||||||
|
|
||||||
|
// Structure in registry: <registryBaseDir>/<AppId>/<SubClusterId> -> UAMToken
|
||||||
|
private String registryBaseDir;
|
||||||
|
|
||||||
|
public FederationRegistryClient(Configuration conf,
|
||||||
|
RegistryOperations registry, UserGroupInformation user) {
|
||||||
|
this.registry = registry;
|
||||||
|
this.user = user;
|
||||||
|
this.appSubClusterTokenMap = new ConcurrentHashMap<>();
|
||||||
|
this.registryBaseDir =
|
||||||
|
conf.get(YarnConfiguration.FEDERATION_REGISTRY_BASE_KEY,
|
||||||
|
YarnConfiguration.DEFAULT_FEDERATION_REGISTRY_BASE_KEY);
|
||||||
|
LOG.info("Using registry {} with base directory: {}",
|
||||||
|
this.registry.getClass().getName(), this.registryBaseDir);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the list of known applications in the registry.
|
||||||
|
*
|
||||||
|
* @return the list of known applications
|
||||||
|
*/
|
||||||
|
public List<String> getAllApplications() {
|
||||||
|
// Suppress the exception here because it is valid that the entry does not
|
||||||
|
// exist
|
||||||
|
List<String> applications = null;
|
||||||
|
try {
|
||||||
|
applications = listDirRegistry(this.registry, this.user,
|
||||||
|
getRegistryKey(null, null), false);
|
||||||
|
} catch (YarnException e) {
|
||||||
|
LOG.warn("Unexpected exception from listDirRegistry", e);
|
||||||
|
}
|
||||||
|
if (applications == null) {
|
||||||
|
// It is valid for listDirRegistry to return null
|
||||||
|
return new ArrayList<>();
|
||||||
|
}
|
||||||
|
return applications;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* For testing, delete all application records in registry.
|
||||||
|
*/
|
||||||
|
@VisibleForTesting
|
||||||
|
public void cleanAllApplications() {
|
||||||
|
try {
|
||||||
|
removeKeyRegistry(this.registry, this.user, getRegistryKey(null, null),
|
||||||
|
true, false);
|
||||||
|
} catch (YarnException e) {
|
||||||
|
LOG.warn("Unexpected exception from removeKeyRegistry", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Write/update the UAM token for an application and a sub-cluster.
|
||||||
|
*
|
||||||
|
* @param subClusterId sub-cluster id of the token
|
||||||
|
* @param token the UAM of the application
|
||||||
|
* @return whether the amrmToken is added or updated to a new value
|
||||||
|
*/
|
||||||
|
public boolean writeAMRMTokenForUAM(ApplicationId appId,
|
||||||
|
String subClusterId, Token<AMRMTokenIdentifier> token) {
|
||||||
|
Map<String, Token<AMRMTokenIdentifier>> subClusterTokenMap =
|
||||||
|
this.appSubClusterTokenMap.get(appId);
|
||||||
|
if (subClusterTokenMap == null) {
|
||||||
|
subClusterTokenMap = new ConcurrentHashMap<>();
|
||||||
|
this.appSubClusterTokenMap.put(appId, subClusterTokenMap);
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean update = !token.equals(subClusterTokenMap.get(subClusterId));
|
||||||
|
if (!update) {
|
||||||
|
LOG.debug("Same amrmToken received from {}, skip writing registry for {}",
|
||||||
|
subClusterId, appId);
|
||||||
|
return update;
|
||||||
|
}
|
||||||
|
|
||||||
|
LOG.info("Writing/Updating amrmToken for {} to registry for {}",
|
||||||
|
subClusterId, appId);
|
||||||
|
try {
|
||||||
|
// First, write the token entry
|
||||||
|
writeRegistry(this.registry, this.user,
|
||||||
|
getRegistryKey(appId, subClusterId), token.encodeToUrlString(), true);
|
||||||
|
|
||||||
|
// Then update the subClusterTokenMap
|
||||||
|
subClusterTokenMap.put(subClusterId, token);
|
||||||
|
} catch (YarnException | IOException e) {
|
||||||
|
LOG.error(
|
||||||
|
"Failed writing AMRMToken to registry for subcluster " + subClusterId,
|
||||||
|
e);
|
||||||
|
}
|
||||||
|
return update;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Load the information of one application from registry.
|
||||||
|
*
|
||||||
|
* @param appId application id
|
||||||
|
* @return the sub-cluster to UAM token mapping
|
||||||
|
*/
|
||||||
|
public Map<String, Token<AMRMTokenIdentifier>>
|
||||||
|
loadStateFromRegistry(ApplicationId appId) {
|
||||||
|
Map<String, Token<AMRMTokenIdentifier>> retMap = new HashMap<>();
|
||||||
|
// Suppress the exception here because it is valid that the entry does not
|
||||||
|
// exist
|
||||||
|
List<String> subclusters = null;
|
||||||
|
try {
|
||||||
|
subclusters = listDirRegistry(this.registry, this.user,
|
||||||
|
getRegistryKey(appId, null), false);
|
||||||
|
} catch (YarnException e) {
|
||||||
|
LOG.warn("Unexpected exception from listDirRegistry", e);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (subclusters == null) {
|
||||||
|
LOG.info("Application {} does not exist in registry", appId);
|
||||||
|
return retMap;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Read the amrmToken for each sub-cluster with an existing UAM
|
||||||
|
for (String scId : subclusters) {
|
||||||
|
LOG.info("Reading amrmToken for subcluster {} for {}", scId, appId);
|
||||||
|
String key = getRegistryKey(appId, scId);
|
||||||
|
try {
|
||||||
|
String tokenString = readRegistry(this.registry, this.user, key, true);
|
||||||
|
if (tokenString == null) {
|
||||||
|
throw new YarnException("Null string from readRegistry key " + key);
|
||||||
|
}
|
||||||
|
Token<AMRMTokenIdentifier> amrmToken = new Token<>();
|
||||||
|
amrmToken.decodeFromUrlString(tokenString);
|
||||||
|
// Clear the service field, as if RM just issued the token
|
||||||
|
amrmToken.setService(new Text());
|
||||||
|
|
||||||
|
retMap.put(scId, amrmToken);
|
||||||
|
} catch (Exception e) {
|
||||||
|
LOG.error("Failed reading registry key " + key
|
||||||
|
+ ", skipping subcluster " + scId, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Override existing map if there
|
||||||
|
this.appSubClusterTokenMap.put(appId, new ConcurrentHashMap<>(retMap));
|
||||||
|
return retMap;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Remove an application from registry.
|
||||||
|
*
|
||||||
|
* @param appId application id
|
||||||
|
*/
|
||||||
|
public void removeAppFromRegistry(ApplicationId appId) {
|
||||||
|
Map<String, Token<AMRMTokenIdentifier>> subClusterTokenMap =
|
||||||
|
this.appSubClusterTokenMap.get(appId);
|
||||||
|
LOG.info("Removing all registry entries for {}", appId);
|
||||||
|
|
||||||
|
if (subClusterTokenMap == null || subClusterTokenMap.size() == 0) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Lastly remove the application directory
|
||||||
|
String key = getRegistryKey(appId, null);
|
||||||
|
try {
|
||||||
|
removeKeyRegistry(this.registry, this.user, key, true, true);
|
||||||
|
subClusterTokenMap.clear();
|
||||||
|
} catch (YarnException e) {
|
||||||
|
LOG.error("Failed removing registry directory key " + key, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private String getRegistryKey(ApplicationId appId, String fileName) {
|
||||||
|
if (appId == null) {
|
||||||
|
return this.registryBaseDir;
|
||||||
|
}
|
||||||
|
if (fileName == null) {
|
||||||
|
return this.registryBaseDir + appId.toString();
|
||||||
|
}
|
||||||
|
return this.registryBaseDir + appId.toString() + "/" + fileName;
|
||||||
|
}
|
||||||
|
|
||||||
|
private String readRegistry(final RegistryOperations registryImpl,
|
||||||
|
UserGroupInformation ugi, final String key, final boolean throwIfFails)
|
||||||
|
throws YarnException {
|
||||||
|
// Use the ugi loaded with app credentials to access registry
|
||||||
|
String result = ugi.doAs(new PrivilegedAction<String>() {
|
||||||
|
@Override
|
||||||
|
public String run() {
|
||||||
|
try {
|
||||||
|
ServiceRecord value = registryImpl.resolve(key);
|
||||||
|
if (value != null) {
|
||||||
|
return value.description;
|
||||||
|
}
|
||||||
|
} catch (Throwable e) {
|
||||||
|
if (throwIfFails) {
|
||||||
|
LOG.error("Registry resolve key " + key + " failed", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
if (result == null && throwIfFails) {
|
||||||
|
throw new YarnException("Registry resolve key " + key + " failed");
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void removeKeyRegistry(final RegistryOperations registryImpl,
|
||||||
|
UserGroupInformation ugi, final String key, final boolean recursive,
|
||||||
|
final boolean throwIfFails) throws YarnException {
|
||||||
|
// Use the ugi loaded with app credentials to access registry
|
||||||
|
boolean success = ugi.doAs(new PrivilegedAction<Boolean>() {
|
||||||
|
@Override
|
||||||
|
public Boolean run() {
|
||||||
|
try {
|
||||||
|
registryImpl.delete(key, recursive);
|
||||||
|
return true;
|
||||||
|
} catch (Throwable e) {
|
||||||
|
if (throwIfFails) {
|
||||||
|
LOG.error("Registry remove key " + key + " failed", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
if (!success && throwIfFails) {
|
||||||
|
throw new YarnException("Registry remove key " + key + " failed");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Write registry entry, override if exists.
|
||||||
|
*/
|
||||||
|
private void writeRegistry(final RegistryOperations registryImpl,
|
||||||
|
UserGroupInformation ugi, final String key, final String value,
|
||||||
|
final boolean throwIfFails) throws YarnException {
|
||||||
|
|
||||||
|
final ServiceRecord recordValue = new ServiceRecord();
|
||||||
|
recordValue.description = value;
|
||||||
|
// Use the ugi loaded with app credentials to access registry
|
||||||
|
boolean success = ugi.doAs(new PrivilegedAction<Boolean>() {
|
||||||
|
@Override
|
||||||
|
public Boolean run() {
|
||||||
|
try {
|
||||||
|
registryImpl.bind(key, recordValue, BindFlags.OVERWRITE);
|
||||||
|
return true;
|
||||||
|
} catch (Throwable e) {
|
||||||
|
if (throwIfFails) {
|
||||||
|
LOG.error("Registry write key " + key + " failed", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
if (!success && throwIfFails) {
|
||||||
|
throw new YarnException("Registry write key " + key + " failed");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* List the sub directories in the given directory.
|
||||||
|
*/
|
||||||
|
private List<String> listDirRegistry(final RegistryOperations registryImpl,
|
||||||
|
UserGroupInformation ugi, final String key, final boolean throwIfFails)
|
||||||
|
throws YarnException {
|
||||||
|
List<String> result = ugi.doAs(new PrivilegedAction<List<String>>() {
|
||||||
|
@Override
|
||||||
|
public List<String> run() {
|
||||||
|
try {
|
||||||
|
return registryImpl.list(key);
|
||||||
|
} catch (Throwable e) {
|
||||||
|
if (throwIfFails) {
|
||||||
|
LOG.error("Registry list key " + key + " failed", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
if (result == null && throwIfFails) {
|
||||||
|
throw new YarnException("Registry list key " + key + " failed");
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -33,6 +33,7 @@ import org.apache.hadoop.classification.InterfaceAudience.Public;
|
||||||
import org.apache.hadoop.classification.InterfaceStability.Unstable;
|
import org.apache.hadoop.classification.InterfaceStability.Unstable;
|
||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
import org.apache.hadoop.security.UserGroupInformation;
|
import org.apache.hadoop.security.UserGroupInformation;
|
||||||
|
import org.apache.hadoop.security.token.Token;
|
||||||
import org.apache.hadoop.service.AbstractService;
|
import org.apache.hadoop.service.AbstractService;
|
||||||
import org.apache.hadoop.yarn.api.ApplicationClientProtocol;
|
import org.apache.hadoop.yarn.api.ApplicationClientProtocol;
|
||||||
import org.apache.hadoop.yarn.api.protocolrecords.AllocateRequest;
|
import org.apache.hadoop.yarn.api.protocolrecords.AllocateRequest;
|
||||||
|
@ -44,9 +45,9 @@ import org.apache.hadoop.yarn.api.protocolrecords.GetNewApplicationResponse;
|
||||||
import org.apache.hadoop.yarn.api.protocolrecords.KillApplicationResponse;
|
import org.apache.hadoop.yarn.api.protocolrecords.KillApplicationResponse;
|
||||||
import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterRequest;
|
import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterRequest;
|
||||||
import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterResponse;
|
import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterResponse;
|
||||||
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
|
|
||||||
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
||||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||||
|
import org.apache.hadoop.yarn.security.AMRMTokenIdentifier;
|
||||||
import org.apache.hadoop.yarn.server.utils.AMRMClientUtils;
|
import org.apache.hadoop.yarn.server.utils.AMRMClientUtils;
|
||||||
import org.apache.hadoop.yarn.util.AsyncCallback;
|
import org.apache.hadoop.yarn.util.AsyncCallback;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
|
@ -67,7 +68,7 @@ public class UnmanagedAMPoolManager extends AbstractService {
|
||||||
// Map from uamId to UAM instances
|
// Map from uamId to UAM instances
|
||||||
private Map<String, UnmanagedApplicationManager> unmanagedAppMasterMap;
|
private Map<String, UnmanagedApplicationManager> unmanagedAppMasterMap;
|
||||||
|
|
||||||
private Map<String, ApplicationAttemptId> attemptIdMap;
|
private Map<String, ApplicationId> appIdMap;
|
||||||
|
|
||||||
private ExecutorService threadpool;
|
private ExecutorService threadpool;
|
||||||
|
|
||||||
|
@ -82,7 +83,7 @@ public class UnmanagedAMPoolManager extends AbstractService {
|
||||||
this.threadpool = Executors.newCachedThreadPool();
|
this.threadpool = Executors.newCachedThreadPool();
|
||||||
}
|
}
|
||||||
this.unmanagedAppMasterMap = new ConcurrentHashMap<>();
|
this.unmanagedAppMasterMap = new ConcurrentHashMap<>();
|
||||||
this.attemptIdMap = new ConcurrentHashMap<>();
|
this.appIdMap = new ConcurrentHashMap<>();
|
||||||
super.serviceStart();
|
super.serviceStart();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -114,7 +115,7 @@ public class UnmanagedAMPoolManager extends AbstractService {
|
||||||
public KillApplicationResponse call() throws Exception {
|
public KillApplicationResponse call() throws Exception {
|
||||||
try {
|
try {
|
||||||
LOG.info("Force-killing UAM id " + uamId + " for application "
|
LOG.info("Force-killing UAM id " + uamId + " for application "
|
||||||
+ attemptIdMap.get(uamId));
|
+ appIdMap.get(uamId));
|
||||||
return unmanagedAppMasterMap.remove(uamId).forceKillApplication();
|
return unmanagedAppMasterMap.remove(uamId).forceKillApplication();
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
LOG.error("Failed to kill unmanaged application master", e);
|
LOG.error("Failed to kill unmanaged application master", e);
|
||||||
|
@ -132,7 +133,7 @@ public class UnmanagedAMPoolManager extends AbstractService {
|
||||||
LOG.error("Failed to kill unmanaged application master", e);
|
LOG.error("Failed to kill unmanaged application master", e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
this.attemptIdMap.clear();
|
this.appIdMap.clear();
|
||||||
super.serviceStop();
|
super.serviceStop();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -145,13 +146,18 @@ public class UnmanagedAMPoolManager extends AbstractService {
|
||||||
* @param queueName queue of the application
|
* @param queueName queue of the application
|
||||||
* @param submitter submitter name of the UAM
|
* @param submitter submitter name of the UAM
|
||||||
* @param appNameSuffix application name suffix for the UAM
|
* @param appNameSuffix application name suffix for the UAM
|
||||||
|
* @param keepContainersAcrossApplicationAttempts keep container flag for UAM
|
||||||
|
* recovery.
|
||||||
|
* @see ApplicationSubmissionContext
|
||||||
|
* #setKeepContainersAcrossApplicationAttempts(boolean)
|
||||||
* @return uamId for the UAM
|
* @return uamId for the UAM
|
||||||
* @throws YarnException if registerApplicationMaster fails
|
* @throws YarnException if registerApplicationMaster fails
|
||||||
* @throws IOException if registerApplicationMaster fails
|
* @throws IOException if registerApplicationMaster fails
|
||||||
*/
|
*/
|
||||||
public String createAndRegisterNewUAM(
|
public String createAndRegisterNewUAM(
|
||||||
RegisterApplicationMasterRequest registerRequest, Configuration conf,
|
RegisterApplicationMasterRequest registerRequest, Configuration conf,
|
||||||
String queueName, String submitter, String appNameSuffix)
|
String queueName, String submitter, String appNameSuffix,
|
||||||
|
boolean keepContainersAcrossApplicationAttempts)
|
||||||
throws YarnException, IOException {
|
throws YarnException, IOException {
|
||||||
ApplicationId appId = null;
|
ApplicationId appId = null;
|
||||||
ApplicationClientProtocol rmClient;
|
ApplicationClientProtocol rmClient;
|
||||||
|
@ -173,45 +179,52 @@ public class UnmanagedAMPoolManager extends AbstractService {
|
||||||
rmClient = null;
|
rmClient = null;
|
||||||
}
|
}
|
||||||
|
|
||||||
createAndRegisterNewUAM(appId.toString(), registerRequest, conf, appId,
|
// Launch the UAM in RM
|
||||||
queueName, submitter, appNameSuffix);
|
launchUAM(appId.toString(), conf, appId, queueName, submitter,
|
||||||
|
appNameSuffix, keepContainersAcrossApplicationAttempts);
|
||||||
|
|
||||||
|
// Register the UAM application
|
||||||
|
registerApplicationMaster(appId.toString(), registerRequest);
|
||||||
|
|
||||||
|
// Returns the appId as uamId
|
||||||
return appId.toString();
|
return appId.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create a new UAM and register the application, using the provided uamId and
|
* Launch a new UAM, using the provided uamId and appId.
|
||||||
* appId.
|
|
||||||
*
|
*
|
||||||
* @param uamId identifier for the UAM
|
* @param uamId uam Id
|
||||||
* @param registerRequest RegisterApplicationMasterRequest
|
|
||||||
* @param conf configuration for this UAM
|
* @param conf configuration for this UAM
|
||||||
* @param appId application id for the UAM
|
* @param appId application id for the UAM
|
||||||
* @param queueName queue of the application
|
* @param queueName queue of the application
|
||||||
* @param submitter submitter name of the UAM
|
* @param submitter submitter name of the UAM
|
||||||
* @param appNameSuffix application name suffix for the UAM
|
* @param appNameSuffix application name suffix for the UAM
|
||||||
* @return RegisterApplicationMasterResponse
|
* @param keepContainersAcrossApplicationAttempts keep container flag for UAM
|
||||||
* @throws YarnException if registerApplicationMaster fails
|
* recovery.
|
||||||
* @throws IOException if registerApplicationMaster fails
|
* @see ApplicationSubmissionContext
|
||||||
|
* #setKeepContainersAcrossApplicationAttempts(boolean)
|
||||||
|
* @return UAM token
|
||||||
|
* @throws YarnException if fails
|
||||||
|
* @throws IOException if fails
|
||||||
*/
|
*/
|
||||||
public RegisterApplicationMasterResponse createAndRegisterNewUAM(String uamId,
|
public Token<AMRMTokenIdentifier> launchUAM(String uamId, Configuration conf,
|
||||||
RegisterApplicationMasterRequest registerRequest, Configuration conf,
|
|
||||||
ApplicationId appId, String queueName, String submitter,
|
ApplicationId appId, String queueName, String submitter,
|
||||||
String appNameSuffix) throws YarnException, IOException {
|
String appNameSuffix, boolean keepContainersAcrossApplicationAttempts)
|
||||||
|
throws YarnException, IOException {
|
||||||
|
|
||||||
if (this.unmanagedAppMasterMap.containsKey(uamId)) {
|
if (this.unmanagedAppMasterMap.containsKey(uamId)) {
|
||||||
throw new YarnException("UAM " + uamId + " already exists");
|
throw new YarnException("UAM " + uamId + " already exists");
|
||||||
}
|
}
|
||||||
UnmanagedApplicationManager uam =
|
UnmanagedApplicationManager uam = createUAM(conf, appId, queueName,
|
||||||
createUAM(conf, appId, queueName, submitter, appNameSuffix);
|
submitter, appNameSuffix, keepContainersAcrossApplicationAttempts);
|
||||||
// Put the UAM into map first before initializing it to avoid additional UAM
|
// Put the UAM into map first before initializing it to avoid additional UAM
|
||||||
// for the same uamId being created concurrently
|
// for the same uamId being created concurrently
|
||||||
this.unmanagedAppMasterMap.put(uamId, uam);
|
this.unmanagedAppMasterMap.put(uamId, uam);
|
||||||
|
|
||||||
RegisterApplicationMasterResponse response = null;
|
Token<AMRMTokenIdentifier> amrmToken = null;
|
||||||
try {
|
try {
|
||||||
LOG.info("Creating and registering UAM id {} for application {}", uamId,
|
LOG.info("Launching UAM id {} for application {}", uamId, appId);
|
||||||
appId);
|
amrmToken = uam.launchUAM();
|
||||||
response = uam.createAndRegisterApplicationMaster(registerRequest);
|
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
// Add the map earlier and remove here if register failed because we want
|
// Add the map earlier and remove here if register failed because we want
|
||||||
// to make sure there is only one uam instance per uamId at any given time
|
// to make sure there is only one uam instance per uamId at any given time
|
||||||
|
@ -219,8 +232,48 @@ public class UnmanagedAMPoolManager extends AbstractService {
|
||||||
throw e;
|
throw e;
|
||||||
}
|
}
|
||||||
|
|
||||||
this.attemptIdMap.put(uamId, uam.getAttemptId());
|
this.appIdMap.put(uamId, uam.getAppId());
|
||||||
return response;
|
return amrmToken;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Re-attach to an existing UAM, using the provided uamIdentifier.
|
||||||
|
*
|
||||||
|
* @param uamId uam Id
|
||||||
|
* @param conf configuration for this UAM
|
||||||
|
* @param appId application id for the UAM
|
||||||
|
* @param queueName queue of the application
|
||||||
|
* @param submitter submitter name of the UAM
|
||||||
|
* @param appNameSuffix application name suffix for the UAM
|
||||||
|
* @param uamToken UAM token
|
||||||
|
* @throws YarnException if fails
|
||||||
|
* @throws IOException if fails
|
||||||
|
*/
|
||||||
|
public void reAttachUAM(String uamId, Configuration conf,
|
||||||
|
ApplicationId appId, String queueName, String submitter,
|
||||||
|
String appNameSuffix, Token<AMRMTokenIdentifier> uamToken)
|
||||||
|
throws YarnException, IOException {
|
||||||
|
|
||||||
|
if (this.unmanagedAppMasterMap.containsKey(uamId)) {
|
||||||
|
throw new YarnException("UAM " + uamId + " already exists");
|
||||||
|
}
|
||||||
|
UnmanagedApplicationManager uam =
|
||||||
|
createUAM(conf, appId, queueName, submitter, appNameSuffix, true);
|
||||||
|
// Put the UAM into map first before initializing it to avoid additional UAM
|
||||||
|
// for the same uamId being created concurrently
|
||||||
|
this.unmanagedAppMasterMap.put(uamId, uam);
|
||||||
|
|
||||||
|
try {
|
||||||
|
LOG.info("Reattaching UAM id {} for application {}", uamId, appId);
|
||||||
|
uam.reAttachUAM(uamToken);
|
||||||
|
} catch (Exception e) {
|
||||||
|
// Add the map earlier and remove here if register failed because we want
|
||||||
|
// to make sure there is only one uam instance per uamId at any given time
|
||||||
|
this.unmanagedAppMasterMap.remove(uamId);
|
||||||
|
throw e;
|
||||||
|
}
|
||||||
|
|
||||||
|
this.appIdMap.put(uamId, uam.getAppId());
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -231,20 +284,42 @@ public class UnmanagedAMPoolManager extends AbstractService {
|
||||||
* @param queueName queue of the application
|
* @param queueName queue of the application
|
||||||
* @param submitter submitter name of the application
|
* @param submitter submitter name of the application
|
||||||
* @param appNameSuffix application name suffix
|
* @param appNameSuffix application name suffix
|
||||||
|
* @param keepContainersAcrossApplicationAttempts keep container flag for UAM
|
||||||
* @return the UAM instance
|
* @return the UAM instance
|
||||||
*/
|
*/
|
||||||
@VisibleForTesting
|
@VisibleForTesting
|
||||||
protected UnmanagedApplicationManager createUAM(Configuration conf,
|
protected UnmanagedApplicationManager createUAM(Configuration conf,
|
||||||
ApplicationId appId, String queueName, String submitter,
|
ApplicationId appId, String queueName, String submitter,
|
||||||
String appNameSuffix) {
|
String appNameSuffix, boolean keepContainersAcrossApplicationAttempts) {
|
||||||
return new UnmanagedApplicationManager(conf, appId, queueName, submitter,
|
return new UnmanagedApplicationManager(conf, appId, queueName, submitter,
|
||||||
appNameSuffix);
|
appNameSuffix, keepContainersAcrossApplicationAttempts);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Register application master for the UAM.
|
||||||
|
*
|
||||||
|
* @param uamId uam Id
|
||||||
|
* @param registerRequest RegisterApplicationMasterRequest
|
||||||
|
* @return register response
|
||||||
|
* @throws YarnException if register fails
|
||||||
|
* @throws IOException if register fails
|
||||||
|
*/
|
||||||
|
public RegisterApplicationMasterResponse registerApplicationMaster(
|
||||||
|
String uamId, RegisterApplicationMasterRequest registerRequest)
|
||||||
|
throws YarnException, IOException {
|
||||||
|
if (!this.unmanagedAppMasterMap.containsKey(uamId)) {
|
||||||
|
throw new YarnException("UAM " + uamId + " does not exist");
|
||||||
|
}
|
||||||
|
LOG.info("Registering UAM id {} for application {}", uamId,
|
||||||
|
this.appIdMap.get(uamId));
|
||||||
|
return this.unmanagedAppMasterMap.get(uamId)
|
||||||
|
.registerApplicationMaster(registerRequest);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* AllocateAsync to an UAM.
|
* AllocateAsync to an UAM.
|
||||||
*
|
*
|
||||||
* @param uamId identifier for the UAM
|
* @param uamId uam Id
|
||||||
* @param request AllocateRequest
|
* @param request AllocateRequest
|
||||||
* @param callback callback for response
|
* @param callback callback for response
|
||||||
* @throws YarnException if allocate fails
|
* @throws YarnException if allocate fails
|
||||||
|
@ -262,7 +337,7 @@ public class UnmanagedAMPoolManager extends AbstractService {
|
||||||
/**
|
/**
|
||||||
* Finish an UAM/application.
|
* Finish an UAM/application.
|
||||||
*
|
*
|
||||||
* @param uamId identifier for the UAM
|
* @param uamId uam Id
|
||||||
* @param request FinishApplicationMasterRequest
|
* @param request FinishApplicationMasterRequest
|
||||||
* @return FinishApplicationMasterResponse
|
* @return FinishApplicationMasterResponse
|
||||||
* @throws YarnException if finishApplicationMaster call fails
|
* @throws YarnException if finishApplicationMaster call fails
|
||||||
|
@ -274,14 +349,15 @@ public class UnmanagedAMPoolManager extends AbstractService {
|
||||||
if (!this.unmanagedAppMasterMap.containsKey(uamId)) {
|
if (!this.unmanagedAppMasterMap.containsKey(uamId)) {
|
||||||
throw new YarnException("UAM " + uamId + " does not exist");
|
throw new YarnException("UAM " + uamId + " does not exist");
|
||||||
}
|
}
|
||||||
LOG.info("Finishing application for UAM id {} ", uamId);
|
LOG.info("Finishing UAM id {} for application {}", uamId,
|
||||||
|
this.appIdMap.get(uamId));
|
||||||
FinishApplicationMasterResponse response =
|
FinishApplicationMasterResponse response =
|
||||||
this.unmanagedAppMasterMap.get(uamId).finishApplicationMaster(request);
|
this.unmanagedAppMasterMap.get(uamId).finishApplicationMaster(request);
|
||||||
|
|
||||||
if (response.getIsUnregistered()) {
|
if (response.getIsUnregistered()) {
|
||||||
// Only remove the UAM when the unregister finished
|
// Only remove the UAM when the unregister finished
|
||||||
this.unmanagedAppMasterMap.remove(uamId);
|
this.unmanagedAppMasterMap.remove(uamId);
|
||||||
this.attemptIdMap.remove(uamId);
|
this.appIdMap.remove(uamId);
|
||||||
LOG.info("UAM id {} is unregistered", uamId);
|
LOG.info("UAM id {} is unregistered", uamId);
|
||||||
}
|
}
|
||||||
return response;
|
return response;
|
||||||
|
@ -301,7 +377,7 @@ public class UnmanagedAMPoolManager extends AbstractService {
|
||||||
/**
|
/**
|
||||||
* Return whether an UAM exists.
|
* Return whether an UAM exists.
|
||||||
*
|
*
|
||||||
* @param uamId identifier for the UAM
|
* @param uamId uam Id
|
||||||
* @return UAM exists or not
|
* @return UAM exists or not
|
||||||
*/
|
*/
|
||||||
public boolean hasUAMId(String uamId) {
|
public boolean hasUAMId(String uamId) {
|
||||||
|
|
|
@ -50,7 +50,9 @@ import org.apache.hadoop.yarn.api.records.ApplicationAttemptReport;
|
||||||
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
||||||
import org.apache.hadoop.yarn.api.records.ApplicationReport;
|
import org.apache.hadoop.yarn.api.records.ApplicationReport;
|
||||||
import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext;
|
import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext;
|
||||||
|
import org.apache.hadoop.yarn.api.records.Container;
|
||||||
import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
|
import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
|
||||||
|
import org.apache.hadoop.yarn.api.records.NMToken;
|
||||||
import org.apache.hadoop.yarn.api.records.Resource;
|
import org.apache.hadoop.yarn.api.records.Resource;
|
||||||
import org.apache.hadoop.yarn.api.records.YarnApplicationAttemptState;
|
import org.apache.hadoop.yarn.api.records.YarnApplicationAttemptState;
|
||||||
import org.apache.hadoop.yarn.api.records.YarnApplicationState;
|
import org.apache.hadoop.yarn.api.records.YarnApplicationState;
|
||||||
|
@ -90,7 +92,6 @@ public class UnmanagedApplicationManager {
|
||||||
private AMRequestHandlerThread handlerThread;
|
private AMRequestHandlerThread handlerThread;
|
||||||
private ApplicationMasterProtocol rmProxy;
|
private ApplicationMasterProtocol rmProxy;
|
||||||
private ApplicationId applicationId;
|
private ApplicationId applicationId;
|
||||||
private ApplicationAttemptId attemptId;
|
|
||||||
private String submitter;
|
private String submitter;
|
||||||
private String appNameSuffix;
|
private String appNameSuffix;
|
||||||
private Configuration conf;
|
private Configuration conf;
|
||||||
|
@ -101,9 +102,31 @@ public class UnmanagedApplicationManager {
|
||||||
private ApplicationClientProtocol rmClient;
|
private ApplicationClientProtocol rmClient;
|
||||||
private long asyncApiPollIntervalMillis;
|
private long asyncApiPollIntervalMillis;
|
||||||
private RecordFactory recordFactory;
|
private RecordFactory recordFactory;
|
||||||
|
private boolean keepContainersAcrossApplicationAttempts;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This flag indicates that launchUAM/reAttachUAM has been called (and is
|
||||||
|
* perhaps still blocked in initializeUnmanagedAM below due to an RM
|
||||||
|
* connection/failover issue and has not finished yet). The flag is set
|
||||||
|
* before making the blocking call to RM.
|
||||||
|
*/
|
||||||
|
private boolean connectionInitiated;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Constructor.
|
||||||
|
*
|
||||||
|
* @param conf configuration
|
||||||
|
* @param appId application Id to use for this UAM
|
||||||
|
* @param queueName the queue of the UAM
|
||||||
|
* @param submitter user name of the app
|
||||||
|
* @param appNameSuffix the app name suffix to use
|
||||||
|
* @param keepContainersAcrossApplicationAttempts keep container flag for UAM
|
||||||
|
* recovery. See {@link ApplicationSubmissionContext
|
||||||
|
* #setKeepContainersAcrossApplicationAttempts(boolean)}
|
||||||
|
*/
|
||||||
public UnmanagedApplicationManager(Configuration conf, ApplicationId appId,
|
public UnmanagedApplicationManager(Configuration conf, ApplicationId appId,
|
||||||
String queueName, String submitter, String appNameSuffix) {
|
String queueName, String submitter, String appNameSuffix,
|
||||||
|
boolean keepContainersAcrossApplicationAttempts) {
|
||||||
Preconditions.checkNotNull(conf, "Configuration cannot be null");
|
Preconditions.checkNotNull(conf, "Configuration cannot be null");
|
||||||
Preconditions.checkNotNull(appId, "ApplicationId cannot be null");
|
Preconditions.checkNotNull(appId, "ApplicationId cannot be null");
|
||||||
Preconditions.checkNotNull(submitter, "App submitter cannot be null");
|
Preconditions.checkNotNull(submitter, "App submitter cannot be null");
|
||||||
|
@ -116,6 +139,7 @@ public class UnmanagedApplicationManager {
|
||||||
this.handlerThread = new AMRequestHandlerThread();
|
this.handlerThread = new AMRequestHandlerThread();
|
||||||
this.requestQueue = new LinkedBlockingQueue<>();
|
this.requestQueue = new LinkedBlockingQueue<>();
|
||||||
this.rmProxy = null;
|
this.rmProxy = null;
|
||||||
|
this.connectionInitiated = false;
|
||||||
this.registerRequest = null;
|
this.registerRequest = null;
|
||||||
this.recordFactory = RecordFactoryProvider.getRecordFactory(conf);
|
this.recordFactory = RecordFactoryProvider.getRecordFactory(conf);
|
||||||
this.asyncApiPollIntervalMillis = conf.getLong(
|
this.asyncApiPollIntervalMillis = conf.getLong(
|
||||||
|
@ -123,45 +147,84 @@ public class UnmanagedApplicationManager {
|
||||||
YARN_CLIENT_APPLICATION_CLIENT_PROTOCOL_POLL_INTERVAL_MS,
|
YARN_CLIENT_APPLICATION_CLIENT_PROTOCOL_POLL_INTERVAL_MS,
|
||||||
YarnConfiguration.
|
YarnConfiguration.
|
||||||
DEFAULT_YARN_CLIENT_APPLICATION_CLIENT_PROTOCOL_POLL_INTERVAL_MS);
|
DEFAULT_YARN_CLIENT_APPLICATION_CLIENT_PROTOCOL_POLL_INTERVAL_MS);
|
||||||
|
this.keepContainersAcrossApplicationAttempts =
|
||||||
|
keepContainersAcrossApplicationAttempts;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Launch a new UAM in the resource manager.
|
||||||
|
*
|
||||||
|
* @return the AMRM token of the launched UAM
|
||||||
|
* @throws YarnException if fails
|
||||||
|
* @throws IOException if fails
|
||||||
|
*/
|
||||||
|
public Token<AMRMTokenIdentifier> launchUAM()
|
||||||
|
throws YarnException, IOException {
|
||||||
|
this.connectionInitiated = true;
|
||||||
|
|
||||||
|
// Blocking call to RM
|
||||||
|
Token<AMRMTokenIdentifier> amrmToken =
|
||||||
|
initializeUnmanagedAM(this.applicationId);
|
||||||
|
|
||||||
|
// Creates the UAM connection
|
||||||
|
createUAMProxy(amrmToken);
|
||||||
|
return amrmToken;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Re-attach to an existing UAM in the resource manager.
|
||||||
|
*
|
||||||
|
* @param amrmToken the UAM token
|
||||||
|
* @throws IOException if re-attach fails
|
||||||
|
* @throws YarnException if re-attach fails
|
||||||
|
*/
|
||||||
|
public void reAttachUAM(Token<AMRMTokenIdentifier> amrmToken)
|
||||||
|
throws IOException, YarnException {
|
||||||
|
this.connectionInitiated = true;
|
||||||
|
|
||||||
|
// Creates the UAM connection
|
||||||
|
createUAMProxy(amrmToken);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void createUAMProxy(Token<AMRMTokenIdentifier> amrmToken)
|
||||||
|
throws IOException {
|
||||||
|
this.userUgi = UserGroupInformation.createProxyUser(
|
||||||
|
this.applicationId.toString(), UserGroupInformation.getCurrentUser());
|
||||||
|
this.rmProxy = createRMProxy(ApplicationMasterProtocol.class, this.conf,
|
||||||
|
this.userUgi, amrmToken);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Registers this {@link UnmanagedApplicationManager} with the resource
|
* Registers this {@link UnmanagedApplicationManager} with the resource
|
||||||
* manager.
|
* manager.
|
||||||
*
|
*
|
||||||
* @param request the register request
|
* @param request RegisterApplicationMasterRequest
|
||||||
* @return the register response
|
* @return register response
|
||||||
* @throws YarnException if register fails
|
* @throws YarnException if register fails
|
||||||
* @throws IOException if register fails
|
* @throws IOException if register fails
|
||||||
*/
|
*/
|
||||||
public RegisterApplicationMasterResponse createAndRegisterApplicationMaster(
|
public RegisterApplicationMasterResponse registerApplicationMaster(
|
||||||
RegisterApplicationMasterRequest request)
|
RegisterApplicationMasterRequest request)
|
||||||
throws YarnException, IOException {
|
throws YarnException, IOException {
|
||||||
// This need to be done first in this method, because it is used as an
|
// Save the register request for re-register later
|
||||||
// indication that this method is called (and perhaps blocked due to RM
|
|
||||||
// connection and not finished yet)
|
|
||||||
this.registerRequest = request;
|
this.registerRequest = request;
|
||||||
|
|
||||||
// attemptId will be available after this call
|
// Since we have setKeepContainersAcrossApplicationAttempts = true for UAM.
|
||||||
UnmanagedAMIdentifier identifier =
|
// We do not expect application already registered exception here
|
||||||
initializeUnmanagedAM(this.applicationId);
|
LOG.info("Registering the Unmanaged application master {}",
|
||||||
|
this.applicationId);
|
||||||
try {
|
|
||||||
this.userUgi = UserGroupInformation.createProxyUser(
|
|
||||||
identifier.getAttemptId().toString(),
|
|
||||||
UserGroupInformation.getCurrentUser());
|
|
||||||
} catch (IOException e) {
|
|
||||||
LOG.error("Exception while trying to get current user", e);
|
|
||||||
throw new YarnRuntimeException(e);
|
|
||||||
}
|
|
||||||
|
|
||||||
this.rmProxy = createRMProxy(ApplicationMasterProtocol.class, this.conf,
|
|
||||||
this.userUgi, identifier.getToken());
|
|
||||||
|
|
||||||
LOG.info("Registering the Unmanaged application master {}", this.attemptId);
|
|
||||||
RegisterApplicationMasterResponse response =
|
RegisterApplicationMasterResponse response =
|
||||||
this.rmProxy.registerApplicationMaster(this.registerRequest);
|
this.rmProxy.registerApplicationMaster(this.registerRequest);
|
||||||
|
|
||||||
|
for (Container container : response.getContainersFromPreviousAttempts()) {
|
||||||
|
LOG.info("RegisterUAM returned existing running container "
|
||||||
|
+ container.getId());
|
||||||
|
}
|
||||||
|
for (NMToken nmToken : response.getNMTokensFromPreviousAttempts()) {
|
||||||
|
LOG.info("RegisterUAM returned existing NM token for node "
|
||||||
|
+ nmToken.getNodeId());
|
||||||
|
}
|
||||||
|
|
||||||
// Start the heartbeat thread only after register succeeds
|
// Start the heartbeat thread only after register succeeds
|
||||||
this.handlerThread.setUncaughtExceptionHandler(
|
this.handlerThread.setUncaughtExceptionHandler(
|
||||||
new HeartBeatThreadUncaughtExceptionHandler());
|
new HeartBeatThreadUncaughtExceptionHandler());
|
||||||
|
@ -187,11 +250,11 @@ public class UnmanagedApplicationManager {
|
||||||
this.handlerThread.shutdown();
|
this.handlerThread.shutdown();
|
||||||
|
|
||||||
if (this.rmProxy == null) {
|
if (this.rmProxy == null) {
|
||||||
if (this.registerRequest != null) {
|
if (this.connectionInitiated) {
|
||||||
// This is possible if the async registerApplicationMaster is still
|
// This is possible if the async launchUAM is still
|
||||||
// blocked and retrying. Return a dummy response in this case.
|
// blocked and retrying. Return a dummy response in this case.
|
||||||
LOG.warn("Unmanaged AM still not successfully launched/registered yet."
|
LOG.warn("Unmanaged AM still not successfully launched/registered yet."
|
||||||
+ " Stopping the UAM client thread anyways.");
|
+ " Stopping the UAM heartbeat thread anyways.");
|
||||||
return FinishApplicationMasterResponse.newInstance(false);
|
return FinishApplicationMasterResponse.newInstance(false);
|
||||||
} else {
|
} else {
|
||||||
throw new YarnException("finishApplicationMaster should not "
|
throw new YarnException("finishApplicationMaster should not "
|
||||||
|
@ -199,7 +262,7 @@ public class UnmanagedApplicationManager {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return AMRMClientUtils.finishAMWithReRegister(request, this.rmProxy,
|
return AMRMClientUtils.finishAMWithReRegister(request, this.rmProxy,
|
||||||
this.registerRequest, this.attemptId);
|
this.registerRequest, this.applicationId);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -212,7 +275,7 @@ public class UnmanagedApplicationManager {
|
||||||
public KillApplicationResponse forceKillApplication()
|
public KillApplicationResponse forceKillApplication()
|
||||||
throws IOException, YarnException {
|
throws IOException, YarnException {
|
||||||
KillApplicationRequest request =
|
KillApplicationRequest request =
|
||||||
KillApplicationRequest.newInstance(this.attemptId.getApplicationId());
|
KillApplicationRequest.newInstance(this.applicationId);
|
||||||
|
|
||||||
this.handlerThread.shutdown();
|
this.handlerThread.shutdown();
|
||||||
|
|
||||||
|
@ -240,29 +303,29 @@ public class UnmanagedApplicationManager {
|
||||||
LOG.debug("Interrupted while waiting to put on response queue", ex);
|
LOG.debug("Interrupted while waiting to put on response queue", ex);
|
||||||
}
|
}
|
||||||
// Two possible cases why the UAM is not successfully registered yet:
|
// Two possible cases why the UAM is not successfully registered yet:
|
||||||
// 1. registerApplicationMaster is not called at all. Should throw here.
|
// 1. launchUAM is not called at all. Should throw here.
|
||||||
// 2. registerApplicationMaster is called but hasn't successfully returned.
|
// 2. launchUAM is called but hasn't successfully returned.
|
||||||
//
|
//
|
||||||
// In case 2, we have already saved the allocate request above, so if the
|
// In case 2, we have already saved the allocate request above, so if the
|
||||||
// registration succeeds later, no request is lost.
|
// registration succeeds later, no request is lost.
|
||||||
if (this.rmProxy == null) {
|
if (this.rmProxy == null) {
|
||||||
if (this.registerRequest != null) {
|
if (this.connectionInitiated) {
|
||||||
LOG.info("Unmanaged AM still not successfully launched/registered yet."
|
LOG.info("Unmanaged AM still not successfully launched/registered yet."
|
||||||
+ " Saving the allocate request and send later.");
|
+ " Saving the allocate request and send later.");
|
||||||
} else {
|
} else {
|
||||||
throw new YarnException(
|
throw new YarnException(
|
||||||
"AllocateAsync should not be called before createAndRegister");
|
"AllocateAsync should not be called before launchUAM");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the application attempt id of the UAM.
|
* Returns the application id of the UAM.
|
||||||
*
|
*
|
||||||
* @return attempt id of the UAM
|
* @return application id of the UAM
|
||||||
*/
|
*/
|
||||||
public ApplicationAttemptId getAttemptId() {
|
public ApplicationId getAppId() {
|
||||||
return this.attemptId;
|
return this.applicationId;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -287,15 +350,15 @@ public class UnmanagedApplicationManager {
|
||||||
* Launch and initialize an unmanaged AM. First, it creates a new application
|
* Launch and initialize an unmanaged AM. First, it creates a new application
|
||||||
* on the RM and negotiates a new attempt id. Then it waits for the RM
|
* on the RM and negotiates a new attempt id. Then it waits for the RM
|
||||||
* application attempt state to reach YarnApplicationAttemptState.LAUNCHED
|
* application attempt state to reach YarnApplicationAttemptState.LAUNCHED
|
||||||
* after which it returns the AM-RM token and the attemptId.
|
* after which it returns the AM-RM token.
|
||||||
*
|
*
|
||||||
* @param appId application id
|
* @param appId application id
|
||||||
* @return the UAM identifier
|
* @return the UAM token
|
||||||
* @throws IOException if initialize fails
|
* @throws IOException if initialize fails
|
||||||
* @throws YarnException if initialize fails
|
* @throws YarnException if initialize fails
|
||||||
*/
|
*/
|
||||||
protected UnmanagedAMIdentifier initializeUnmanagedAM(ApplicationId appId)
|
protected Token<AMRMTokenIdentifier> initializeUnmanagedAM(
|
||||||
throws IOException, YarnException {
|
ApplicationId appId) throws IOException, YarnException {
|
||||||
try {
|
try {
|
||||||
UserGroupInformation appSubmitter =
|
UserGroupInformation appSubmitter =
|
||||||
UserGroupInformation.createRemoteUser(this.submitter);
|
UserGroupInformation.createRemoteUser(this.submitter);
|
||||||
|
@ -306,13 +369,12 @@ public class UnmanagedApplicationManager {
|
||||||
submitUnmanagedApp(appId);
|
submitUnmanagedApp(appId);
|
||||||
|
|
||||||
// Monitor the application attempt to wait for launch state
|
// Monitor the application attempt to wait for launch state
|
||||||
ApplicationAttemptReport attemptReport = monitorCurrentAppAttempt(appId,
|
monitorCurrentAppAttempt(appId,
|
||||||
EnumSet.of(YarnApplicationState.ACCEPTED,
|
EnumSet.of(YarnApplicationState.ACCEPTED,
|
||||||
YarnApplicationState.RUNNING, YarnApplicationState.KILLED,
|
YarnApplicationState.RUNNING, YarnApplicationState.KILLED,
|
||||||
YarnApplicationState.FAILED, YarnApplicationState.FINISHED),
|
YarnApplicationState.FAILED, YarnApplicationState.FINISHED),
|
||||||
YarnApplicationAttemptState.LAUNCHED);
|
YarnApplicationAttemptState.LAUNCHED);
|
||||||
this.attemptId = attemptReport.getApplicationAttemptId();
|
return getUAMToken();
|
||||||
return getUAMIdentifier();
|
|
||||||
} finally {
|
} finally {
|
||||||
this.rmClient = null;
|
this.rmClient = null;
|
||||||
}
|
}
|
||||||
|
@ -343,6 +405,8 @@ public class UnmanagedApplicationManager {
|
||||||
submitRequest.setApplicationSubmissionContext(context);
|
submitRequest.setApplicationSubmissionContext(context);
|
||||||
|
|
||||||
context.setUnmanagedAM(true);
|
context.setUnmanagedAM(true);
|
||||||
|
context.setKeepContainersAcrossApplicationAttempts(
|
||||||
|
this.keepContainersAcrossApplicationAttempts);
|
||||||
|
|
||||||
LOG.info("Submitting unmanaged application {}", appId);
|
LOG.info("Submitting unmanaged application {}", appId);
|
||||||
this.rmClient.submitApplication(submitRequest);
|
this.rmClient.submitApplication(submitRequest);
|
||||||
|
@ -374,8 +438,10 @@ public class UnmanagedApplicationManager {
|
||||||
if (appStates.contains(state)) {
|
if (appStates.contains(state)) {
|
||||||
if (state != YarnApplicationState.ACCEPTED) {
|
if (state != YarnApplicationState.ACCEPTED) {
|
||||||
throw new YarnRuntimeException(
|
throw new YarnRuntimeException(
|
||||||
"Received non-accepted application state: " + state
|
"Received non-accepted application state: " + state + " for "
|
||||||
+ ". Application " + appId + " not the first attempt?");
|
+ appId + ". This is likely because this is not the first "
|
||||||
|
+ "app attempt in home sub-cluster, and AMRMProxy HA "
|
||||||
|
+ "(yarn.nodemanager.amrmproxy.ha.enable) is not enabled.");
|
||||||
}
|
}
|
||||||
appAttemptId =
|
appAttemptId =
|
||||||
getApplicationReport(appId).getCurrentApplicationAttemptId();
|
getApplicationReport(appId).getCurrentApplicationAttemptId();
|
||||||
|
@ -415,25 +481,25 @@ public class UnmanagedApplicationManager {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Gets the identifier of the unmanaged AM.
|
* Gets the amrmToken of the unmanaged AM.
|
||||||
*
|
*
|
||||||
* @return the identifier of the unmanaged AM.
|
* @return the amrmToken of the unmanaged AM.
|
||||||
* @throws IOException if getApplicationReport fails
|
* @throws IOException if getApplicationReport fails
|
||||||
* @throws YarnException if getApplicationReport fails
|
* @throws YarnException if getApplicationReport fails
|
||||||
*/
|
*/
|
||||||
protected UnmanagedAMIdentifier getUAMIdentifier()
|
protected Token<AMRMTokenIdentifier> getUAMToken()
|
||||||
throws IOException, YarnException {
|
throws IOException, YarnException {
|
||||||
Token<AMRMTokenIdentifier> token = null;
|
Token<AMRMTokenIdentifier> token = null;
|
||||||
org.apache.hadoop.yarn.api.records.Token amrmToken =
|
org.apache.hadoop.yarn.api.records.Token amrmToken =
|
||||||
getApplicationReport(this.attemptId.getApplicationId()).getAMRMToken();
|
getApplicationReport(this.applicationId).getAMRMToken();
|
||||||
if (amrmToken != null) {
|
if (amrmToken != null) {
|
||||||
token = ConverterUtils.convertFromYarn(amrmToken, (Text) null);
|
token = ConverterUtils.convertFromYarn(amrmToken, (Text) null);
|
||||||
} else {
|
} else {
|
||||||
LOG.warn(
|
LOG.warn(
|
||||||
"AMRMToken not found in the application report for application: {}",
|
"AMRMToken not found in the application report for application: {}",
|
||||||
this.attemptId.getApplicationId());
|
this.applicationId);
|
||||||
}
|
}
|
||||||
return new UnmanagedAMIdentifier(this.attemptId, token);
|
return token;
|
||||||
}
|
}
|
||||||
|
|
||||||
private ApplicationReport getApplicationReport(ApplicationId appId)
|
private ApplicationReport getApplicationReport(ApplicationId appId)
|
||||||
|
@ -444,29 +510,6 @@ public class UnmanagedApplicationManager {
|
||||||
return this.rmClient.getApplicationReport(request).getApplicationReport();
|
return this.rmClient.getApplicationReport(request).getApplicationReport();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Data structure that encapsulates the application attempt identifier and the
|
|
||||||
* AMRMTokenIdentifier. Make it public because clients with HA need it.
|
|
||||||
*/
|
|
||||||
public static class UnmanagedAMIdentifier {
|
|
||||||
private ApplicationAttemptId attemptId;
|
|
||||||
private Token<AMRMTokenIdentifier> token;
|
|
||||||
|
|
||||||
public UnmanagedAMIdentifier(ApplicationAttemptId attemptId,
|
|
||||||
Token<AMRMTokenIdentifier> token) {
|
|
||||||
this.attemptId = attemptId;
|
|
||||||
this.token = token;
|
|
||||||
}
|
|
||||||
|
|
||||||
public ApplicationAttemptId getAttemptId() {
|
|
||||||
return this.attemptId;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Token<AMRMTokenIdentifier> getToken() {
|
|
||||||
return this.token;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Data structure that encapsulates AllocateRequest and AsyncCallback
|
* Data structure that encapsulates AllocateRequest and AsyncCallback
|
||||||
* instance.
|
* instance.
|
||||||
|
@ -549,8 +592,10 @@ public class UnmanagedApplicationManager {
|
||||||
}
|
}
|
||||||
|
|
||||||
request.setResponseId(lastResponseId);
|
request.setResponseId(lastResponseId);
|
||||||
|
|
||||||
AllocateResponse response = AMRMClientUtils.allocateWithReRegister(
|
AllocateResponse response = AMRMClientUtils.allocateWithReRegister(
|
||||||
request, rmProxy, registerRequest, attemptId);
|
request, rmProxy, registerRequest, applicationId);
|
||||||
|
|
||||||
if (response == null) {
|
if (response == null) {
|
||||||
throw new YarnException("Null allocateResponse from allocate");
|
throw new YarnException("Null allocateResponse from allocate");
|
||||||
}
|
}
|
||||||
|
@ -578,18 +623,17 @@ public class UnmanagedApplicationManager {
|
||||||
LOG.debug("Interrupted while waiting for queue", ex);
|
LOG.debug("Interrupted while waiting for queue", ex);
|
||||||
}
|
}
|
||||||
} catch (IOException ex) {
|
} catch (IOException ex) {
|
||||||
LOG.warn(
|
LOG.warn("IO Error occurred while processing heart beat for "
|
||||||
"IO Error occurred while processing heart beat for " + attemptId,
|
+ applicationId, ex);
|
||||||
ex);
|
|
||||||
} catch (Throwable ex) {
|
} catch (Throwable ex) {
|
||||||
LOG.warn(
|
LOG.warn(
|
||||||
"Error occurred while processing heart beat for " + attemptId,
|
"Error occurred while processing heart beat for " + applicationId,
|
||||||
ex);
|
ex);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG.info("UnmanagedApplicationManager has been stopped for {}. "
|
LOG.info("UnmanagedApplicationManager has been stopped for {}. "
|
||||||
+ "AMRequestHandlerThread thread is exiting", attemptId);
|
+ "AMRequestHandlerThread thread is exiting", applicationId);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -600,8 +644,8 @@ public class UnmanagedApplicationManager {
|
||||||
implements UncaughtExceptionHandler {
|
implements UncaughtExceptionHandler {
|
||||||
@Override
|
@Override
|
||||||
public void uncaughtException(Thread t, Throwable e) {
|
public void uncaughtException(Thread t, Throwable e) {
|
||||||
LOG.error("Heartbeat thread {} for application attempt {} crashed!",
|
LOG.error("Heartbeat thread {} for application {} crashed!",
|
||||||
t.getName(), attemptId, e);
|
t.getName(), applicationId, e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -36,7 +36,7 @@ import org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse;
|
||||||
import org.apache.hadoop.yarn.api.protocolrecords.FinishApplicationMasterRequest;
|
import org.apache.hadoop.yarn.api.protocolrecords.FinishApplicationMasterRequest;
|
||||||
import org.apache.hadoop.yarn.api.protocolrecords.FinishApplicationMasterResponse;
|
import org.apache.hadoop.yarn.api.protocolrecords.FinishApplicationMasterResponse;
|
||||||
import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterRequest;
|
import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterRequest;
|
||||||
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
|
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
||||||
import org.apache.hadoop.yarn.client.ClientRMProxy;
|
import org.apache.hadoop.yarn.client.ClientRMProxy;
|
||||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||||
import org.apache.hadoop.yarn.exceptions.ApplicationMasterNotRegisteredException;
|
import org.apache.hadoop.yarn.exceptions.ApplicationMasterNotRegisteredException;
|
||||||
|
@ -63,16 +63,16 @@ public final class AMRMClientUtils {
|
||||||
/**
|
/**
|
||||||
* Handle ApplicationNotRegistered exception and re-register.
|
* Handle ApplicationNotRegistered exception and re-register.
|
||||||
*
|
*
|
||||||
* @param attemptId app attemptId
|
* @param appId application Id
|
||||||
* @param rmProxy RM proxy instance
|
* @param rmProxy RM proxy instance
|
||||||
* @param registerRequest the AM re-register request
|
* @param registerRequest the AM re-register request
|
||||||
* @throws YarnException if re-register fails
|
* @throws YarnException if re-register fails
|
||||||
*/
|
*/
|
||||||
public static void handleNotRegisteredExceptionAndReRegister(
|
public static void handleNotRegisteredExceptionAndReRegister(
|
||||||
ApplicationAttemptId attemptId, ApplicationMasterProtocol rmProxy,
|
ApplicationId appId, ApplicationMasterProtocol rmProxy,
|
||||||
RegisterApplicationMasterRequest registerRequest) throws YarnException {
|
RegisterApplicationMasterRequest registerRequest) throws YarnException {
|
||||||
LOG.info("App attempt {} not registered, most likely due to RM failover. "
|
LOG.info("App attempt {} not registered, most likely due to RM failover. "
|
||||||
+ " Trying to re-register.", attemptId);
|
+ " Trying to re-register.", appId);
|
||||||
try {
|
try {
|
||||||
rmProxy.registerApplicationMaster(registerRequest);
|
rmProxy.registerApplicationMaster(registerRequest);
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
|
@ -93,25 +93,24 @@ public final class AMRMClientUtils {
|
||||||
* @param request allocate request
|
* @param request allocate request
|
||||||
* @param rmProxy RM proxy
|
* @param rmProxy RM proxy
|
||||||
* @param registerRequest the register request for re-register
|
* @param registerRequest the register request for re-register
|
||||||
* @param attemptId application attempt id
|
* @param appId application id
|
||||||
* @return allocate response
|
* @return allocate response
|
||||||
* @throws YarnException if RM call fails
|
* @throws YarnException if RM call fails
|
||||||
* @throws IOException if RM call fails
|
* @throws IOException if RM call fails
|
||||||
*/
|
*/
|
||||||
public static AllocateResponse allocateWithReRegister(AllocateRequest request,
|
public static AllocateResponse allocateWithReRegister(AllocateRequest request,
|
||||||
ApplicationMasterProtocol rmProxy,
|
ApplicationMasterProtocol rmProxy,
|
||||||
RegisterApplicationMasterRequest registerRequest,
|
RegisterApplicationMasterRequest registerRequest, ApplicationId appId)
|
||||||
ApplicationAttemptId attemptId) throws YarnException, IOException {
|
throws YarnException, IOException {
|
||||||
try {
|
try {
|
||||||
return rmProxy.allocate(request);
|
return rmProxy.allocate(request);
|
||||||
} catch (ApplicationMasterNotRegisteredException e) {
|
} catch (ApplicationMasterNotRegisteredException e) {
|
||||||
handleNotRegisteredExceptionAndReRegister(attemptId, rmProxy,
|
handleNotRegisteredExceptionAndReRegister(appId, rmProxy,
|
||||||
registerRequest);
|
registerRequest);
|
||||||
// reset responseId after re-register
|
// reset responseId after re-register
|
||||||
request.setResponseId(0);
|
request.setResponseId(0);
|
||||||
// retry allocate
|
// retry allocate
|
||||||
return allocateWithReRegister(request, rmProxy, registerRequest,
|
return allocateWithReRegister(request, rmProxy, registerRequest, appId);
|
||||||
attemptId);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -123,23 +122,22 @@ public final class AMRMClientUtils {
|
||||||
* @param request finishApplicationMaster request
|
* @param request finishApplicationMaster request
|
||||||
* @param rmProxy RM proxy
|
* @param rmProxy RM proxy
|
||||||
* @param registerRequest the register request for re-register
|
* @param registerRequest the register request for re-register
|
||||||
* @param attemptId application attempt id
|
* @param appId application id
|
||||||
* @return finishApplicationMaster response
|
* @return finishApplicationMaster response
|
||||||
* @throws YarnException if RM call fails
|
* @throws YarnException if RM call fails
|
||||||
* @throws IOException if RM call fails
|
* @throws IOException if RM call fails
|
||||||
*/
|
*/
|
||||||
public static FinishApplicationMasterResponse finishAMWithReRegister(
|
public static FinishApplicationMasterResponse finishAMWithReRegister(
|
||||||
FinishApplicationMasterRequest request, ApplicationMasterProtocol rmProxy,
|
FinishApplicationMasterRequest request, ApplicationMasterProtocol rmProxy,
|
||||||
RegisterApplicationMasterRequest registerRequest,
|
RegisterApplicationMasterRequest registerRequest, ApplicationId appId)
|
||||||
ApplicationAttemptId attemptId) throws YarnException, IOException {
|
throws YarnException, IOException {
|
||||||
try {
|
try {
|
||||||
return rmProxy.finishApplicationMaster(request);
|
return rmProxy.finishApplicationMaster(request);
|
||||||
} catch (ApplicationMasterNotRegisteredException ex) {
|
} catch (ApplicationMasterNotRegisteredException ex) {
|
||||||
handleNotRegisteredExceptionAndReRegister(attemptId, rmProxy,
|
handleNotRegisteredExceptionAndReRegister(appId, rmProxy,
|
||||||
registerRequest);
|
registerRequest);
|
||||||
// retry finishAM after re-register
|
// retry finishAM after re-register
|
||||||
return finishAMWithReRegister(request, rmProxy, registerRequest,
|
return finishAMWithReRegister(request, rmProxy, registerRequest, appId);
|
||||||
attemptId);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -105,6 +105,7 @@ import org.apache.hadoop.yarn.api.records.ApplicationId;
|
||||||
import org.apache.hadoop.yarn.api.records.ApplicationReport;
|
import org.apache.hadoop.yarn.api.records.ApplicationReport;
|
||||||
import org.apache.hadoop.yarn.api.records.Container;
|
import org.apache.hadoop.yarn.api.records.Container;
|
||||||
import org.apache.hadoop.yarn.api.records.ContainerId;
|
import org.apache.hadoop.yarn.api.records.ContainerId;
|
||||||
|
import org.apache.hadoop.yarn.api.records.ContainerState;
|
||||||
import org.apache.hadoop.yarn.api.records.ContainerStatus;
|
import org.apache.hadoop.yarn.api.records.ContainerStatus;
|
||||||
import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
|
import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
|
||||||
import org.apache.hadoop.yarn.api.records.NMToken;
|
import org.apache.hadoop.yarn.api.records.NMToken;
|
||||||
|
@ -171,10 +172,9 @@ public class MockResourceManagerFacade implements ApplicationClientProtocol,
|
||||||
LoggerFactory.getLogger(MockResourceManagerFacade.class);
|
LoggerFactory.getLogger(MockResourceManagerFacade.class);
|
||||||
|
|
||||||
private HashSet<ApplicationId> applicationMap = new HashSet<>();
|
private HashSet<ApplicationId> applicationMap = new HashSet<>();
|
||||||
private HashMap<String, List<ContainerId>> applicationContainerIdMap =
|
private HashSet<ApplicationId> keepContainerOnUams = new HashSet<>();
|
||||||
new HashMap<String, List<ContainerId>>();
|
private HashMap<ApplicationAttemptId, List<ContainerId>>
|
||||||
private HashMap<ContainerId, Container> allocatedContainerMap =
|
applicationContainerIdMap = new HashMap<>();
|
||||||
new HashMap<ContainerId, Container>();
|
|
||||||
private AtomicInteger containerIndex = new AtomicInteger(0);
|
private AtomicInteger containerIndex = new AtomicInteger(0);
|
||||||
private Configuration conf;
|
private Configuration conf;
|
||||||
private int subClusterId;
|
private int subClusterId;
|
||||||
|
@ -215,7 +215,7 @@ public class MockResourceManagerFacade implements ApplicationClientProtocol,
|
||||||
this.isRunning = mode;
|
this.isRunning = mode;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static String getAppIdentifier() throws IOException {
|
private static ApplicationAttemptId getAppIdentifier() throws IOException {
|
||||||
AMRMTokenIdentifier result = null;
|
AMRMTokenIdentifier result = null;
|
||||||
UserGroupInformation remoteUgi = UserGroupInformation.getCurrentUser();
|
UserGroupInformation remoteUgi = UserGroupInformation.getCurrentUser();
|
||||||
Set<TokenIdentifier> tokenIds = remoteUgi.getTokenIdentifiers();
|
Set<TokenIdentifier> tokenIds = remoteUgi.getTokenIdentifiers();
|
||||||
|
@ -225,7 +225,8 @@ public class MockResourceManagerFacade implements ApplicationClientProtocol,
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return result != null ? result.getApplicationAttemptId().toString() : "";
|
return result != null ? result.getApplicationAttemptId()
|
||||||
|
: ApplicationAttemptId.newInstance(ApplicationId.newInstance(0, 0), 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
private void validateRunning() throws ConnectException {
|
private void validateRunning() throws ConnectException {
|
||||||
|
@ -240,19 +241,32 @@ public class MockResourceManagerFacade implements ApplicationClientProtocol,
|
||||||
throws YarnException, IOException {
|
throws YarnException, IOException {
|
||||||
|
|
||||||
validateRunning();
|
validateRunning();
|
||||||
|
ApplicationAttemptId attemptId = getAppIdentifier();
|
||||||
String amrmToken = getAppIdentifier();
|
LOG.info("Registering application attempt: " + attemptId);
|
||||||
LOG.info("Registering application attempt: " + amrmToken);
|
|
||||||
|
|
||||||
shouldReRegisterNext = false;
|
shouldReRegisterNext = false;
|
||||||
|
|
||||||
|
List<Container> containersFromPreviousAttempt = null;
|
||||||
|
|
||||||
synchronized (applicationContainerIdMap) {
|
synchronized (applicationContainerIdMap) {
|
||||||
if (applicationContainerIdMap.containsKey(amrmToken)) {
|
if (applicationContainerIdMap.containsKey(attemptId)) {
|
||||||
throw new InvalidApplicationMasterRequestException(
|
if (keepContainerOnUams.contains(attemptId.getApplicationId())) {
|
||||||
AMRMClientUtils.APP_ALREADY_REGISTERED_MESSAGE);
|
// For UAM with the keepContainersFromPreviousAttempt flag, return all
|
||||||
|
// running containers
|
||||||
|
containersFromPreviousAttempt = new ArrayList<>();
|
||||||
|
for (ContainerId containerId : applicationContainerIdMap
|
||||||
|
.get(attemptId)) {
|
||||||
|
containersFromPreviousAttempt.add(Container.newInstance(containerId,
|
||||||
|
null, null, null, null, null));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
throw new InvalidApplicationMasterRequestException(
|
||||||
|
AMRMClientUtils.APP_ALREADY_REGISTERED_MESSAGE);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Keep track of the containers that are returned to this application
|
||||||
|
applicationContainerIdMap.put(attemptId, new ArrayList<ContainerId>());
|
||||||
}
|
}
|
||||||
// Keep track of the containers that are returned to this application
|
|
||||||
applicationContainerIdMap.put(amrmToken, new ArrayList<ContainerId>());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Make sure we wait for certain test cases last in the method
|
// Make sure we wait for certain test cases last in the method
|
||||||
|
@ -272,7 +286,7 @@ public class MockResourceManagerFacade implements ApplicationClientProtocol,
|
||||||
}
|
}
|
||||||
|
|
||||||
return RegisterApplicationMasterResponse.newInstance(null, null, null, null,
|
return RegisterApplicationMasterResponse.newInstance(null, null, null, null,
|
||||||
null, request.getHost(), null);
|
containersFromPreviousAttempt, request.getHost(), null);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -282,8 +296,8 @@ public class MockResourceManagerFacade implements ApplicationClientProtocol,
|
||||||
|
|
||||||
validateRunning();
|
validateRunning();
|
||||||
|
|
||||||
String amrmToken = getAppIdentifier();
|
ApplicationAttemptId attemptId = getAppIdentifier();
|
||||||
LOG.info("Finishing application attempt: " + amrmToken);
|
LOG.info("Finishing application attempt: " + attemptId);
|
||||||
|
|
||||||
if (shouldReRegisterNext) {
|
if (shouldReRegisterNext) {
|
||||||
String message = "AM is not registered, should re-register.";
|
String message = "AM is not registered, should re-register.";
|
||||||
|
@ -293,12 +307,9 @@ public class MockResourceManagerFacade implements ApplicationClientProtocol,
|
||||||
|
|
||||||
synchronized (applicationContainerIdMap) {
|
synchronized (applicationContainerIdMap) {
|
||||||
// Remove the containers that were being tracked for this application
|
// Remove the containers that were being tracked for this application
|
||||||
Assert.assertTrue("The application id is NOT registered: " + amrmToken,
|
Assert.assertTrue("The application id is NOT registered: " + attemptId,
|
||||||
applicationContainerIdMap.containsKey(amrmToken));
|
applicationContainerIdMap.containsKey(attemptId));
|
||||||
List<ContainerId> ids = applicationContainerIdMap.remove(amrmToken);
|
applicationContainerIdMap.remove(attemptId);
|
||||||
for (ContainerId c : ids) {
|
|
||||||
allocatedContainerMap.remove(c);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return FinishApplicationMasterResponse.newInstance(
|
return FinishApplicationMasterResponse.newInstance(
|
||||||
|
@ -328,8 +339,8 @@ public class MockResourceManagerFacade implements ApplicationClientProtocol,
|
||||||
+ "askList and releaseList in the same heartbeat");
|
+ "askList and releaseList in the same heartbeat");
|
||||||
}
|
}
|
||||||
|
|
||||||
String amrmToken = getAppIdentifier();
|
ApplicationAttemptId attemptId = getAppIdentifier();
|
||||||
LOG.info("Allocate from application attempt: " + amrmToken);
|
LOG.info("Allocate from application attempt: " + attemptId);
|
||||||
|
|
||||||
if (shouldReRegisterNext) {
|
if (shouldReRegisterNext) {
|
||||||
String message = "AM is not registered, should re-register.";
|
String message = "AM is not registered, should re-register.";
|
||||||
|
@ -361,16 +372,16 @@ public class MockResourceManagerFacade implements ApplicationClientProtocol,
|
||||||
// will need it in future
|
// will need it in future
|
||||||
Assert.assertTrue(
|
Assert.assertTrue(
|
||||||
"The application id is Not registered before allocate(): "
|
"The application id is Not registered before allocate(): "
|
||||||
+ amrmToken,
|
+ attemptId,
|
||||||
applicationContainerIdMap.containsKey(amrmToken));
|
applicationContainerIdMap.containsKey(attemptId));
|
||||||
List<ContainerId> ids = applicationContainerIdMap.get(amrmToken);
|
List<ContainerId> ids = applicationContainerIdMap.get(attemptId);
|
||||||
ids.add(containerId);
|
ids.add(containerId);
|
||||||
this.allocatedContainerMap.put(containerId, container);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
List<ContainerStatus> completedList = new ArrayList<>();
|
||||||
if (request.getReleaseList() != null
|
if (request.getReleaseList() != null
|
||||||
&& request.getReleaseList().size() > 0) {
|
&& request.getReleaseList().size() > 0) {
|
||||||
LOG.info("Releasing containers: " + request.getReleaseList().size());
|
LOG.info("Releasing containers: " + request.getReleaseList().size());
|
||||||
|
@ -378,9 +389,9 @@ public class MockResourceManagerFacade implements ApplicationClientProtocol,
|
||||||
Assert
|
Assert
|
||||||
.assertTrue(
|
.assertTrue(
|
||||||
"The application id is not registered before allocate(): "
|
"The application id is not registered before allocate(): "
|
||||||
+ amrmToken,
|
+ attemptId,
|
||||||
applicationContainerIdMap.containsKey(amrmToken));
|
applicationContainerIdMap.containsKey(attemptId));
|
||||||
List<ContainerId> ids = applicationContainerIdMap.get(amrmToken);
|
List<ContainerId> ids = applicationContainerIdMap.get(attemptId);
|
||||||
|
|
||||||
for (ContainerId id : request.getReleaseList()) {
|
for (ContainerId id : request.getReleaseList()) {
|
||||||
boolean found = false;
|
boolean found = false;
|
||||||
|
@ -396,18 +407,8 @@ public class MockResourceManagerFacade implements ApplicationClientProtocol,
|
||||||
+ conf.get("AMRMTOKEN"), found);
|
+ conf.get("AMRMTOKEN"), found);
|
||||||
|
|
||||||
ids.remove(id);
|
ids.remove(id);
|
||||||
|
completedList.add(
|
||||||
// Return the released container back to the AM with new fake Ids. The
|
ContainerStatus.newInstance(id, ContainerState.COMPLETE, "", 0));
|
||||||
// test case does not care about the IDs. The IDs are faked because
|
|
||||||
// otherwise the LRM will throw duplication identifier exception. This
|
|
||||||
// returning of fake containers is ONLY done for testing purpose - for
|
|
||||||
// the test code to get confirmation that the sub-cluster resource
|
|
||||||
// managers received the release request
|
|
||||||
ContainerId fakeContainerId = ContainerId.newInstance(
|
|
||||||
getApplicationAttemptId(1), containerIndex.incrementAndGet());
|
|
||||||
Container fakeContainer = allocatedContainerMap.get(id);
|
|
||||||
fakeContainer.setId(fakeContainerId);
|
|
||||||
containerList.add(fakeContainer);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -418,8 +419,7 @@ public class MockResourceManagerFacade implements ApplicationClientProtocol,
|
||||||
// Always issue a new AMRMToken as if RM rolled master key
|
// Always issue a new AMRMToken as if RM rolled master key
|
||||||
Token newAMRMToken = Token.newInstance(new byte[0], "", new byte[0], "");
|
Token newAMRMToken = Token.newInstance(new byte[0], "", new byte[0], "");
|
||||||
|
|
||||||
return AllocateResponse.newInstance(0,
|
return AllocateResponse.newInstance(0, completedList, containerList,
|
||||||
new ArrayList<ContainerStatus>(), containerList,
|
|
||||||
new ArrayList<NodeReport>(), null, AMCommand.AM_RESYNC, 1, null,
|
new ArrayList<NodeReport>(), null, AMCommand.AM_RESYNC, 1, null,
|
||||||
new ArrayList<NMToken>(), newAMRMToken,
|
new ArrayList<NMToken>(), newAMRMToken,
|
||||||
new ArrayList<UpdatedContainer>(), null);
|
new ArrayList<UpdatedContainer>(), null);
|
||||||
|
@ -438,6 +438,7 @@ public class MockResourceManagerFacade implements ApplicationClientProtocol,
|
||||||
report.setApplicationId(request.getApplicationId());
|
report.setApplicationId(request.getApplicationId());
|
||||||
report.setCurrentApplicationAttemptId(
|
report.setCurrentApplicationAttemptId(
|
||||||
ApplicationAttemptId.newInstance(request.getApplicationId(), 1));
|
ApplicationAttemptId.newInstance(request.getApplicationId(), 1));
|
||||||
|
report.setAMRMToken(Token.newInstance(new byte[0], "", new byte[0], ""));
|
||||||
response.setApplicationReport(report);
|
response.setApplicationReport(report);
|
||||||
return response;
|
return response;
|
||||||
}
|
}
|
||||||
|
@ -481,6 +482,12 @@ public class MockResourceManagerFacade implements ApplicationClientProtocol,
|
||||||
}
|
}
|
||||||
LOG.info("Application submitted: " + appId);
|
LOG.info("Application submitted: " + appId);
|
||||||
applicationMap.add(appId);
|
applicationMap.add(appId);
|
||||||
|
|
||||||
|
if (request.getApplicationSubmissionContext().getUnmanagedAM()
|
||||||
|
|| request.getApplicationSubmissionContext()
|
||||||
|
.getKeepContainersAcrossApplicationAttempts()) {
|
||||||
|
keepContainerOnUams.add(appId);
|
||||||
|
}
|
||||||
return SubmitApplicationResponse.newInstance();
|
return SubmitApplicationResponse.newInstance();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -497,6 +504,7 @@ public class MockResourceManagerFacade implements ApplicationClientProtocol,
|
||||||
throw new ApplicationNotFoundException(
|
throw new ApplicationNotFoundException(
|
||||||
"Trying to kill an absent application: " + appId);
|
"Trying to kill an absent application: " + appId);
|
||||||
}
|
}
|
||||||
|
keepContainerOnUams.remove(appId);
|
||||||
}
|
}
|
||||||
LOG.info("Force killing application: " + appId);
|
LOG.info("Force killing application: " + appId);
|
||||||
return KillApplicationResponse.newInstance(true);
|
return KillApplicationResponse.newInstance(true);
|
||||||
|
|
|
@ -0,0 +1,90 @@
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.server.federation.utils;
|
||||||
|
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.registry.client.api.RegistryOperations;
|
||||||
|
import org.apache.hadoop.registry.client.impl.FSRegistryOperationsService;
|
||||||
|
import org.apache.hadoop.security.UserGroupInformation;
|
||||||
|
import org.apache.hadoop.security.token.Token;
|
||||||
|
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
||||||
|
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||||
|
import org.apache.hadoop.yarn.security.AMRMTokenIdentifier;
|
||||||
|
import org.junit.After;
|
||||||
|
import org.junit.Assert;
|
||||||
|
import org.junit.Before;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Unit test for FederationRegistryClient.
|
||||||
|
*/
|
||||||
|
public class TestFederationRegistryClient {
|
||||||
|
private Configuration conf;
|
||||||
|
private UserGroupInformation user;
|
||||||
|
private RegistryOperations registry;
|
||||||
|
private FederationRegistryClient registryClient;
|
||||||
|
|
||||||
|
@Before
|
||||||
|
public void setup() throws Exception {
|
||||||
|
this.conf = new YarnConfiguration();
|
||||||
|
|
||||||
|
this.registry = new FSRegistryOperationsService();
|
||||||
|
this.registry.init(this.conf);
|
||||||
|
this.registry.start();
|
||||||
|
|
||||||
|
this.user = UserGroupInformation.getCurrentUser();
|
||||||
|
this.registryClient =
|
||||||
|
new FederationRegistryClient(this.conf, this.registry, this.user);
|
||||||
|
this.registryClient.cleanAllApplications();
|
||||||
|
Assert.assertEquals(0, this.registryClient.getAllApplications().size());
|
||||||
|
}
|
||||||
|
|
||||||
|
@After
|
||||||
|
public void breakDown() {
|
||||||
|
registryClient.cleanAllApplications();
|
||||||
|
Assert.assertEquals(0, registryClient.getAllApplications().size());
|
||||||
|
registry.stop();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testBasicCase() {
|
||||||
|
ApplicationId appId = ApplicationId.newInstance(0, 0);
|
||||||
|
String scId1 = "subcluster1";
|
||||||
|
String scId2 = "subcluster2";
|
||||||
|
|
||||||
|
this.registryClient.writeAMRMTokenForUAM(appId, scId1,
|
||||||
|
new Token<AMRMTokenIdentifier>());
|
||||||
|
this.registryClient.writeAMRMTokenForUAM(appId, scId2,
|
||||||
|
new Token<AMRMTokenIdentifier>());
|
||||||
|
// Duplicate entry, should overwrite
|
||||||
|
this.registryClient.writeAMRMTokenForUAM(appId, scId1,
|
||||||
|
new Token<AMRMTokenIdentifier>());
|
||||||
|
|
||||||
|
Assert.assertEquals(1, this.registryClient.getAllApplications().size());
|
||||||
|
Assert.assertEquals(2,
|
||||||
|
this.registryClient.loadStateFromRegistry(appId).size());
|
||||||
|
|
||||||
|
this.registryClient.removeAppFromRegistry(appId);
|
||||||
|
|
||||||
|
Assert.assertEquals(0, this.registryClient.getAllApplications().size());
|
||||||
|
Assert.assertEquals(0,
|
||||||
|
this.registryClient.loadStateFromRegistry(appId).size());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -65,7 +65,7 @@ public class TestUnmanagedApplicationManager {
|
||||||
ApplicationAttemptId.newInstance(ApplicationId.newInstance(0, 1), 1);
|
ApplicationAttemptId.newInstance(ApplicationId.newInstance(0, 1), 1);
|
||||||
|
|
||||||
uam = new TestableUnmanagedApplicationManager(conf,
|
uam = new TestableUnmanagedApplicationManager(conf,
|
||||||
attemptId.getApplicationId(), null, "submitter", "appNameSuffix");
|
attemptId.getApplicationId(), null, "submitter", "appNameSuffix", true);
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void waitForCallBackCountAndCheckZeroPending(
|
protected void waitForCallBackCountAndCheckZeroPending(
|
||||||
|
@ -88,7 +88,8 @@ public class TestUnmanagedApplicationManager {
|
||||||
public void testBasicUsage()
|
public void testBasicUsage()
|
||||||
throws YarnException, IOException, InterruptedException {
|
throws YarnException, IOException, InterruptedException {
|
||||||
|
|
||||||
createAndRegisterApplicationMaster(
|
launchUAM(attemptId);
|
||||||
|
registerApplicationMaster(
|
||||||
RegisterApplicationMasterRequest.newInstance(null, 0, null), attemptId);
|
RegisterApplicationMasterRequest.newInstance(null, 0, null), attemptId);
|
||||||
|
|
||||||
allocateAsync(AllocateRequest.newInstance(0, 0, null, null, null), callback,
|
allocateAsync(AllocateRequest.newInstance(0, 0, null, null, null), callback,
|
||||||
|
@ -102,11 +103,48 @@ public class TestUnmanagedApplicationManager {
|
||||||
attemptId);
|
attemptId);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Test re-attaching of an existing UAM. This is for HA of UAM client.
|
||||||
|
*/
|
||||||
|
@Test(timeout = 5000)
|
||||||
|
public void testUAMReAttach()
|
||||||
|
throws YarnException, IOException, InterruptedException {
|
||||||
|
|
||||||
|
launchUAM(attemptId);
|
||||||
|
registerApplicationMaster(
|
||||||
|
RegisterApplicationMasterRequest.newInstance(null, 0, null), attemptId);
|
||||||
|
|
||||||
|
allocateAsync(AllocateRequest.newInstance(0, 0, null, null, null), callback,
|
||||||
|
attemptId);
|
||||||
|
// Wait for outstanding async allocate callback
|
||||||
|
waitForCallBackCountAndCheckZeroPending(callback, 1);
|
||||||
|
|
||||||
|
MockResourceManagerFacade rmProxy = uam.getRMProxy();
|
||||||
|
uam = new TestableUnmanagedApplicationManager(conf,
|
||||||
|
attemptId.getApplicationId(), null, "submitter", "appNameSuffix", true);
|
||||||
|
uam.setRMProxy(rmProxy);
|
||||||
|
|
||||||
|
reAttachUAM(null, attemptId);
|
||||||
|
registerApplicationMaster(
|
||||||
|
RegisterApplicationMasterRequest.newInstance(null, 0, null), attemptId);
|
||||||
|
|
||||||
|
allocateAsync(AllocateRequest.newInstance(0, 0, null, null, null), callback,
|
||||||
|
attemptId);
|
||||||
|
|
||||||
|
// Wait for outstanding async allocate callback
|
||||||
|
waitForCallBackCountAndCheckZeroPending(callback, 2);
|
||||||
|
|
||||||
|
finishApplicationMaster(
|
||||||
|
FinishApplicationMasterRequest.newInstance(null, null, null),
|
||||||
|
attemptId);
|
||||||
|
}
|
||||||
|
|
||||||
@Test(timeout = 5000)
|
@Test(timeout = 5000)
|
||||||
public void testReRegister()
|
public void testReRegister()
|
||||||
throws YarnException, IOException, InterruptedException {
|
throws YarnException, IOException, InterruptedException {
|
||||||
|
|
||||||
createAndRegisterApplicationMaster(
|
launchUAM(attemptId);
|
||||||
|
registerApplicationMaster(
|
||||||
RegisterApplicationMasterRequest.newInstance(null, 0, null), attemptId);
|
RegisterApplicationMasterRequest.newInstance(null, 0, null), attemptId);
|
||||||
|
|
||||||
uam.setShouldReRegisterNext();
|
uam.setShouldReRegisterNext();
|
||||||
|
@ -137,7 +175,8 @@ public class TestUnmanagedApplicationManager {
|
||||||
@Override
|
@Override
|
||||||
public void run() {
|
public void run() {
|
||||||
try {
|
try {
|
||||||
createAndRegisterApplicationMaster(
|
launchUAM(attemptId);
|
||||||
|
registerApplicationMaster(
|
||||||
RegisterApplicationMasterRequest.newInstance(null, 1001, null),
|
RegisterApplicationMasterRequest.newInstance(null, 1001, null),
|
||||||
attemptId);
|
attemptId);
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
|
@ -221,7 +260,8 @@ public class TestUnmanagedApplicationManager {
|
||||||
@Test
|
@Test
|
||||||
public void testForceKill()
|
public void testForceKill()
|
||||||
throws YarnException, IOException, InterruptedException {
|
throws YarnException, IOException, InterruptedException {
|
||||||
createAndRegisterApplicationMaster(
|
launchUAM(attemptId);
|
||||||
|
registerApplicationMaster(
|
||||||
RegisterApplicationMasterRequest.newInstance(null, 0, null), attemptId);
|
RegisterApplicationMasterRequest.newInstance(null, 0, null), attemptId);
|
||||||
uam.forceKillApplication();
|
uam.forceKillApplication();
|
||||||
|
|
||||||
|
@ -241,19 +281,40 @@ public class TestUnmanagedApplicationManager {
|
||||||
return ugi;
|
return ugi;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected RegisterApplicationMasterResponse
|
protected Token<AMRMTokenIdentifier> launchUAM(
|
||||||
createAndRegisterApplicationMaster(
|
ApplicationAttemptId appAttemptId)
|
||||||
final RegisterApplicationMasterRequest request,
|
throws IOException, InterruptedException {
|
||||||
ApplicationAttemptId appAttemptId)
|
return getUGIWithToken(appAttemptId)
|
||||||
throws YarnException, IOException, InterruptedException {
|
.doAs(new PrivilegedExceptionAction<Token<AMRMTokenIdentifier>>() {
|
||||||
|
@Override
|
||||||
|
public Token<AMRMTokenIdentifier> run() throws Exception {
|
||||||
|
return uam.launchUAM();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void reAttachUAM(final Token<AMRMTokenIdentifier> uamToken,
|
||||||
|
ApplicationAttemptId appAttemptId)
|
||||||
|
throws IOException, InterruptedException {
|
||||||
|
getUGIWithToken(appAttemptId).doAs(new PrivilegedExceptionAction<Object>() {
|
||||||
|
@Override
|
||||||
|
public Token<AMRMTokenIdentifier> run() throws Exception {
|
||||||
|
uam.reAttachUAM(uamToken);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
protected RegisterApplicationMasterResponse registerApplicationMaster(
|
||||||
|
final RegisterApplicationMasterRequest request,
|
||||||
|
ApplicationAttemptId appAttemptId)
|
||||||
|
throws YarnException, IOException, InterruptedException {
|
||||||
return getUGIWithToken(appAttemptId).doAs(
|
return getUGIWithToken(appAttemptId).doAs(
|
||||||
new PrivilegedExceptionAction<RegisterApplicationMasterResponse>() {
|
new PrivilegedExceptionAction<RegisterApplicationMasterResponse>() {
|
||||||
@Override
|
@Override
|
||||||
public RegisterApplicationMasterResponse run()
|
public RegisterApplicationMasterResponse run()
|
||||||
throws YarnException, IOException {
|
throws YarnException, IOException {
|
||||||
RegisterApplicationMasterResponse response =
|
return uam.registerApplicationMaster(request);
|
||||||
uam.createAndRegisterApplicationMaster(request);
|
|
||||||
return response;
|
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
@ -311,8 +372,9 @@ public class TestUnmanagedApplicationManager {
|
||||||
|
|
||||||
public TestableUnmanagedApplicationManager(Configuration conf,
|
public TestableUnmanagedApplicationManager(Configuration conf,
|
||||||
ApplicationId appId, String queueName, String submitter,
|
ApplicationId appId, String queueName, String submitter,
|
||||||
String appNameSuffix) {
|
String appNameSuffix, boolean keepContainersAcrossApplicationAttempts) {
|
||||||
super(conf, appId, queueName, submitter, appNameSuffix);
|
super(conf, appId, queueName, submitter, appNameSuffix,
|
||||||
|
keepContainersAcrossApplicationAttempts);
|
||||||
}
|
}
|
||||||
|
|
||||||
@SuppressWarnings("unchecked")
|
@SuppressWarnings("unchecked")
|
||||||
|
@ -330,6 +392,14 @@ public class TestUnmanagedApplicationManager {
|
||||||
rmProxy.setShouldReRegisterNext();
|
rmProxy.setShouldReRegisterNext();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public MockResourceManagerFacade getRMProxy() {
|
||||||
|
return rmProxy;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setRMProxy(MockResourceManagerFacade proxy) {
|
||||||
|
this.rmProxy = proxy;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
|
@ -19,6 +19,8 @@
|
||||||
package org.apache.hadoop.yarn.server.nodemanager.amrmproxy;
|
package org.apache.hadoop.yarn.server.nodemanager.amrmproxy;
|
||||||
|
|
||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.registry.client.api.RegistryOperations;
|
||||||
|
import org.apache.hadoop.security.Credentials;
|
||||||
import org.apache.hadoop.security.token.Token;
|
import org.apache.hadoop.security.token.Token;
|
||||||
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
|
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
|
||||||
import org.apache.hadoop.yarn.security.AMRMTokenIdentifier;
|
import org.apache.hadoop.yarn.security.AMRMTokenIdentifier;
|
||||||
|
@ -67,4 +69,18 @@ public interface AMRMProxyApplicationContext {
|
||||||
*/
|
*/
|
||||||
Context getNMCotext();
|
Context getNMCotext();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the credentials of this application.
|
||||||
|
*
|
||||||
|
* @return the credentials.
|
||||||
|
*/
|
||||||
|
Credentials getCredentials();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the registry client.
|
||||||
|
*
|
||||||
|
* @return the registry.
|
||||||
|
*/
|
||||||
|
RegistryOperations getRegistryClient();
|
||||||
|
|
||||||
}
|
}
|
|
@ -22,6 +22,8 @@ import java.io.IOException;
|
||||||
|
|
||||||
import org.apache.hadoop.classification.InterfaceAudience.Private;
|
import org.apache.hadoop.classification.InterfaceAudience.Private;
|
||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.registry.client.api.RegistryOperations;
|
||||||
|
import org.apache.hadoop.security.Credentials;
|
||||||
import org.apache.hadoop.security.token.Token;
|
import org.apache.hadoop.security.token.Token;
|
||||||
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
|
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
|
||||||
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
|
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
|
||||||
|
@ -42,6 +44,8 @@ public class AMRMProxyApplicationContextImpl implements
|
||||||
private Integer localTokenKeyId;
|
private Integer localTokenKeyId;
|
||||||
private Token<AMRMTokenIdentifier> amrmToken;
|
private Token<AMRMTokenIdentifier> amrmToken;
|
||||||
private Token<AMRMTokenIdentifier> localToken;
|
private Token<AMRMTokenIdentifier> localToken;
|
||||||
|
private Credentials credentials;
|
||||||
|
private RegistryOperations registry;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create an instance of the AMRMProxyApplicationContext.
|
* Create an instance of the AMRMProxyApplicationContext.
|
||||||
|
@ -52,17 +56,23 @@ public class AMRMProxyApplicationContextImpl implements
|
||||||
* @param user user name of the application
|
* @param user user name of the application
|
||||||
* @param amrmToken amrmToken issued by RM
|
* @param amrmToken amrmToken issued by RM
|
||||||
* @param localToken amrmToken issued by AMRMProxy
|
* @param localToken amrmToken issued by AMRMProxy
|
||||||
|
* @param credentials application credentials
|
||||||
|
* @param registry Yarn Registry client
|
||||||
*/
|
*/
|
||||||
@SuppressWarnings("checkstyle:parameternumber")
public AMRMProxyApplicationContextImpl(Context nmContext, Configuration conf,
    ApplicationAttemptId applicationAttemptId, String user,
    Token<AMRMTokenIdentifier> amrmToken,
    Token<AMRMTokenIdentifier> localToken, Credentials credentials,
    RegistryOperations registry) {
  // Node-level collaborators shared with the rest of the NM.
  this.nmContext = nmContext;
  this.conf = conf;
  this.registry = registry;

  // Identity of the application attempt this context describes.
  this.applicationAttemptId = applicationAttemptId;
  this.user = user;
  this.credentials = credentials;

  // RM-issued token and the token the AMRMProxy hands to the AM.
  this.amrmToken = amrmToken;
  this.localToken = localToken;
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -88,11 +98,14 @@ public class AMRMProxyApplicationContextImpl implements
|
||||||
/**
|
/**
|
||||||
* Sets the application's AMRMToken.
|
* Sets the application's AMRMToken.
|
||||||
*
|
*
|
||||||
* @param amrmToken amrmToken issued by RM
|
* @param amrmToken the new amrmToken from RM
|
||||||
|
* @return whether the saved token is updated to a different value
|
||||||
*/
|
*/
|
||||||
public synchronized void setAMRMToken(
|
public synchronized boolean setAMRMToken(
|
||||||
Token<AMRMTokenIdentifier> amrmToken) {
|
Token<AMRMTokenIdentifier> amrmToken) {
|
||||||
|
Token<AMRMTokenIdentifier> oldValue = this.amrmToken;
|
||||||
this.amrmToken = amrmToken;
|
this.amrmToken = amrmToken;
|
||||||
|
return !this.amrmToken.equals(oldValue);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -134,4 +147,14 @@ public class AMRMProxyApplicationContextImpl implements
|
||||||
public Context getNMCotext() {
|
public Context getNMCotext() {
|
||||||
return nmContext;
|
return nmContext;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Credentials getCredentials() {
|
||||||
|
return this.credentials;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public RegistryOperations getRegistryClient() {
|
||||||
|
return this.registry;
|
||||||
|
}
|
||||||
}
|
}
|
|
@ -34,12 +34,13 @@ import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
|
||||||
import org.apache.hadoop.io.DataOutputBuffer;
|
import org.apache.hadoop.io.DataOutputBuffer;
|
||||||
import org.apache.hadoop.io.Text;
|
import org.apache.hadoop.io.Text;
|
||||||
import org.apache.hadoop.ipc.Server;
|
import org.apache.hadoop.ipc.Server;
|
||||||
|
import org.apache.hadoop.registry.client.api.RegistryOperations;
|
||||||
import org.apache.hadoop.security.Credentials;
|
import org.apache.hadoop.security.Credentials;
|
||||||
import org.apache.hadoop.security.SaslRpcServer;
|
import org.apache.hadoop.security.SaslRpcServer;
|
||||||
import org.apache.hadoop.security.UserGroupInformation;
|
import org.apache.hadoop.security.UserGroupInformation;
|
||||||
import org.apache.hadoop.security.token.Token;
|
import org.apache.hadoop.security.token.Token;
|
||||||
import org.apache.hadoop.security.token.TokenIdentifier;
|
import org.apache.hadoop.security.token.TokenIdentifier;
|
||||||
import org.apache.hadoop.service.AbstractService;
|
import org.apache.hadoop.service.CompositeService;
|
||||||
import org.apache.hadoop.util.ReflectionUtils;
|
import org.apache.hadoop.util.ReflectionUtils;
|
||||||
import org.apache.hadoop.util.StringUtils;
|
import org.apache.hadoop.util.StringUtils;
|
||||||
import org.apache.hadoop.yarn.api.ApplicationMasterProtocol;
|
import org.apache.hadoop.yarn.api.ApplicationMasterProtocol;
|
||||||
|
@ -60,15 +61,19 @@ import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
|
||||||
import org.apache.hadoop.yarn.ipc.YarnRPC;
|
import org.apache.hadoop.yarn.ipc.YarnRPC;
|
||||||
import org.apache.hadoop.yarn.security.AMRMTokenIdentifier;
|
import org.apache.hadoop.yarn.security.AMRMTokenIdentifier;
|
||||||
import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
|
import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
|
||||||
|
import org.apache.hadoop.yarn.server.api.ContainerType;
|
||||||
|
import org.apache.hadoop.yarn.server.federation.utils.FederationStateStoreFacade;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.Context;
|
import org.apache.hadoop.yarn.server.nodemanager.Context;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationEvent;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationEvent;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationEventType;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationEventType;
|
||||||
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.RecoveredAMRMProxyState;
|
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.RecoveredAMRMProxyState;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.scheduler.DistributedScheduler;
|
import org.apache.hadoop.yarn.server.nodemanager.scheduler.DistributedScheduler;
|
||||||
import org.apache.hadoop.yarn.server.security.MasterKeyData;
|
import org.apache.hadoop.yarn.server.security.MasterKeyData;
|
||||||
import org.apache.hadoop.yarn.server.utils.BuilderUtils;
|
import org.apache.hadoop.yarn.server.utils.BuilderUtils;
|
||||||
import org.apache.hadoop.yarn.server.utils.YarnServerSecurityUtils;
|
import org.apache.hadoop.yarn.server.utils.YarnServerSecurityUtils;
|
||||||
|
import org.apache.hadoop.yarn.util.ConverterUtils;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
@ -82,7 +87,7 @@ import com.google.common.base.Preconditions;
|
||||||
* pipeline is a chain of interceptor instances that can inspect and modify the
|
* pipeline is a chain of interceptor instances that can inspect and modify the
|
||||||
* request/response as needed.
|
* request/response as needed.
|
||||||
*/
|
*/
|
||||||
public class AMRMProxyService extends AbstractService implements
|
public class AMRMProxyService extends CompositeService implements
|
||||||
ApplicationMasterProtocol {
|
ApplicationMasterProtocol {
|
||||||
private static final Logger LOG = LoggerFactory
|
private static final Logger LOG = LoggerFactory
|
||||||
.getLogger(AMRMProxyService.class);
|
.getLogger(AMRMProxyService.class);
|
||||||
|
@ -96,6 +101,7 @@ public class AMRMProxyService extends AbstractService implements
|
||||||
private InetSocketAddress listenerEndpoint;
|
private InetSocketAddress listenerEndpoint;
|
||||||
private AMRMProxyTokenSecretManager secretManager;
|
private AMRMProxyTokenSecretManager secretManager;
|
||||||
private Map<ApplicationId, RequestInterceptorChainWrapper> applPipelineMap;
|
private Map<ApplicationId, RequestInterceptorChainWrapper> applPipelineMap;
|
||||||
|
private RegistryOperations registry;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates an instance of the service.
|
* Creates an instance of the service.
|
||||||
|
@ -118,10 +124,23 @@ public class AMRMProxyService extends AbstractService implements
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected void serviceInit(Configuration conf) throws Exception {
|
protected void serviceInit(Configuration conf) throws Exception {
|
||||||
super.serviceInit(conf);
|
|
||||||
this.secretManager =
|
this.secretManager =
|
||||||
new AMRMProxyTokenSecretManager(this.nmContext.getNMStateStore());
|
new AMRMProxyTokenSecretManager(this.nmContext.getNMStateStore());
|
||||||
this.secretManager.init(conf);
|
this.secretManager.init(conf);
|
||||||
|
|
||||||
|
// Both second app attempt and NM restart within Federation need registry
|
||||||
|
if (conf.getBoolean(YarnConfiguration.AMRM_PROXY_HA_ENABLED,
|
||||||
|
YarnConfiguration.DEFAULT_AMRM_PROXY_HA_ENABLED)
|
||||||
|
|| conf.getBoolean(YarnConfiguration.NM_RECOVERY_ENABLED,
|
||||||
|
YarnConfiguration.DEFAULT_NM_RECOVERY_ENABLED)) {
|
||||||
|
this.registry = FederationStateStoreFacade.createInstance(conf,
|
||||||
|
YarnConfiguration.YARN_REGISTRY_CLASS,
|
||||||
|
YarnConfiguration.DEFAULT_YARN_REGISTRY_CLASS,
|
||||||
|
RegistryOperations.class);
|
||||||
|
addService(this.registry);
|
||||||
|
}
|
||||||
|
|
||||||
|
super.serviceInit(conf);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -203,6 +222,8 @@ public class AMRMProxyService extends AbstractService implements
|
||||||
amrmToken = new Token<>();
|
amrmToken = new Token<>();
|
||||||
amrmToken.decodeFromUrlString(
|
amrmToken.decodeFromUrlString(
|
||||||
new String(contextEntry.getValue(), "UTF-8"));
|
new String(contextEntry.getValue(), "UTF-8"));
|
||||||
|
// Clear the service field, as if RM just issued the token
|
||||||
|
amrmToken.setService(new Text());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -214,12 +235,36 @@ public class AMRMProxyService extends AbstractService implements
|
||||||
throw new IOException("No user found for app attempt " + attemptId);
|
throw new IOException("No user found for app attempt " + attemptId);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Regenerate the local AMRMToken for the AM
|
||||||
Token<AMRMTokenIdentifier> localToken =
|
Token<AMRMTokenIdentifier> localToken =
|
||||||
this.secretManager.createAndGetAMRMToken(attemptId);
|
this.secretManager.createAndGetAMRMToken(attemptId);
|
||||||
|
|
||||||
|
// Retrieve the AM container credentials from NM context
|
||||||
|
Credentials amCred = null;
|
||||||
|
for (Container container : this.nmContext.getContainers().values()) {
|
||||||
|
LOG.debug("From NM Context container " + container.getContainerId());
|
||||||
|
if (container.getContainerId().getApplicationAttemptId().equals(
|
||||||
|
attemptId) && container.getContainerTokenIdentifier() != null) {
|
||||||
|
LOG.debug("Container type "
|
||||||
|
+ container.getContainerTokenIdentifier().getContainerType());
|
||||||
|
if (container.getContainerTokenIdentifier()
|
||||||
|
.getContainerType() == ContainerType.APPLICATION_MASTER) {
|
||||||
|
LOG.info("AM container {} found in context, has credentials: {}",
|
||||||
|
container.getContainerId(),
|
||||||
|
(container.getCredentials() != null));
|
||||||
|
amCred = container.getCredentials();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (amCred == null) {
|
||||||
|
LOG.error("No credentials found for AM container of {}. "
|
||||||
|
+ "Yarn registry access might not work", attemptId);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create the intercepter pipeline for the AM
|
||||||
initializePipeline(attemptId, user, amrmToken, localToken,
|
initializePipeline(attemptId, user, amrmToken, localToken,
|
||||||
entry.getValue(), true);
|
entry.getValue(), true, amCred);
|
||||||
} catch (Exception e) {
|
} catch (IOException e) {
|
||||||
LOG.error("Exception when recovering " + attemptId
|
LOG.error("Exception when recovering " + attemptId
|
||||||
+ ", removing it from NMStateStore and move on", e);
|
+ ", removing it from NMStateStore and move on", e);
|
||||||
this.nmContext.getNMStateStore().removeAMRMProxyAppContext(attemptId);
|
this.nmContext.getNMStateStore().removeAMRMProxyAppContext(attemptId);
|
||||||
|
@ -326,7 +371,7 @@ public class AMRMProxyService extends AbstractService implements
|
||||||
|
|
||||||
initializePipeline(appAttemptId,
|
initializePipeline(appAttemptId,
|
||||||
containerTokenIdentifierForKey.getApplicationSubmitter(), amrmToken,
|
containerTokenIdentifierForKey.getApplicationSubmitter(), amrmToken,
|
||||||
localToken, null, false);
|
localToken, null, false, credentials);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -342,7 +387,8 @@ public class AMRMProxyService extends AbstractService implements
|
||||||
protected void initializePipeline(ApplicationAttemptId applicationAttemptId,
|
protected void initializePipeline(ApplicationAttemptId applicationAttemptId,
|
||||||
String user, Token<AMRMTokenIdentifier> amrmToken,
|
String user, Token<AMRMTokenIdentifier> amrmToken,
|
||||||
Token<AMRMTokenIdentifier> localToken,
|
Token<AMRMTokenIdentifier> localToken,
|
||||||
Map<String, byte[]> recoveredDataMap, boolean isRecovery) {
|
Map<String, byte[]> recoveredDataMap, boolean isRecovery,
|
||||||
|
Credentials credentials) {
|
||||||
RequestInterceptorChainWrapper chainWrapper = null;
|
RequestInterceptorChainWrapper chainWrapper = null;
|
||||||
synchronized (applPipelineMap) {
|
synchronized (applPipelineMap) {
|
||||||
if (applPipelineMap
|
if (applPipelineMap
|
||||||
|
@ -404,8 +450,9 @@ public class AMRMProxyService extends AbstractService implements
|
||||||
try {
|
try {
|
||||||
RequestInterceptor interceptorChain =
|
RequestInterceptor interceptorChain =
|
||||||
this.createRequestInterceptorChain();
|
this.createRequestInterceptorChain();
|
||||||
interceptorChain.init(createApplicationMasterContext(this.nmContext,
|
interceptorChain.init(
|
||||||
applicationAttemptId, user, amrmToken, localToken));
|
createApplicationMasterContext(this.nmContext, applicationAttemptId,
|
||||||
|
user, amrmToken, localToken, credentials, this.registry));
|
||||||
if (isRecovery) {
|
if (isRecovery) {
|
||||||
if (recoveredDataMap == null) {
|
if (recoveredDataMap == null) {
|
||||||
throw new YarnRuntimeException(
|
throw new YarnRuntimeException(
|
||||||
|
@ -497,14 +544,12 @@ public class AMRMProxyService extends AbstractService implements
|
||||||
allocateResponse.setAMRMToken(null);
|
allocateResponse.setAMRMToken(null);
|
||||||
|
|
||||||
org.apache.hadoop.security.token.Token<AMRMTokenIdentifier> newToken =
|
org.apache.hadoop.security.token.Token<AMRMTokenIdentifier> newToken =
|
||||||
new org.apache.hadoop.security.token.Token<AMRMTokenIdentifier>(
|
ConverterUtils.convertFromYarn(token, (Text) null);
|
||||||
token.getIdentifier().array(), token.getPassword().array(),
|
|
||||||
new Text(token.getKind()), new Text(token.getService()));
|
|
||||||
|
|
||||||
context.setAMRMToken(newToken);
|
// Update the AMRMToken in context map, and in NM state store if it is
|
||||||
|
// different
|
||||||
// Update the AMRMToken in context map in NM state store
|
if (context.setAMRMToken(newToken)
|
||||||
if (this.nmContext.getNMStateStore() != null) {
|
&& this.nmContext.getNMStateStore() != null) {
|
||||||
try {
|
try {
|
||||||
this.nmContext.getNMStateStore().storeAMRMProxyAppContextEntry(
|
this.nmContext.getNMStateStore().storeAMRMProxyAppContextEntry(
|
||||||
context.getApplicationAttemptId(), NMSS_AMRMTOKEN_KEY,
|
context.getApplicationAttemptId(), NMSS_AMRMTOKEN_KEY,
|
||||||
|
@ -547,10 +592,12 @@ public class AMRMProxyService extends AbstractService implements
|
||||||
private AMRMProxyApplicationContext createApplicationMasterContext(
|
private AMRMProxyApplicationContext createApplicationMasterContext(
|
||||||
Context context, ApplicationAttemptId applicationAttemptId, String user,
|
Context context, ApplicationAttemptId applicationAttemptId, String user,
|
||||||
Token<AMRMTokenIdentifier> amrmToken,
|
Token<AMRMTokenIdentifier> amrmToken,
|
||||||
Token<AMRMTokenIdentifier> localToken) {
|
Token<AMRMTokenIdentifier> localToken, Credentials credentials,
|
||||||
|
RegistryOperations registryImpl) {
|
||||||
AMRMProxyApplicationContextImpl appContext =
|
AMRMProxyApplicationContextImpl appContext =
|
||||||
new AMRMProxyApplicationContextImpl(context, getConfig(),
|
new AMRMProxyApplicationContextImpl(context, getConfig(),
|
||||||
applicationAttemptId, user, amrmToken, localToken);
|
applicationAttemptId, user, amrmToken, localToken, credentials,
|
||||||
|
registryImpl);
|
||||||
return appContext;
|
return appContext;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -34,6 +34,8 @@ import java.util.concurrent.Executors;
|
||||||
import java.util.concurrent.Future;
|
import java.util.concurrent.Future;
|
||||||
|
|
||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.io.Text;
|
||||||
|
import org.apache.hadoop.security.token.Token;
|
||||||
import org.apache.hadoop.security.UserGroupInformation;
|
import org.apache.hadoop.security.UserGroupInformation;
|
||||||
import org.apache.hadoop.yarn.api.ApplicationMasterProtocol;
|
import org.apache.hadoop.yarn.api.ApplicationMasterProtocol;
|
||||||
import org.apache.hadoop.yarn.api.protocolrecords.AllocateRequest;
|
import org.apache.hadoop.yarn.api.protocolrecords.AllocateRequest;
|
||||||
|
@ -42,6 +44,7 @@ import org.apache.hadoop.yarn.api.protocolrecords.FinishApplicationMasterRequest
|
||||||
import org.apache.hadoop.yarn.api.protocolrecords.FinishApplicationMasterResponse;
|
import org.apache.hadoop.yarn.api.protocolrecords.FinishApplicationMasterResponse;
|
||||||
import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterRequest;
|
import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterRequest;
|
||||||
import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterResponse;
|
import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterResponse;
|
||||||
|
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
||||||
import org.apache.hadoop.yarn.api.records.Container;
|
import org.apache.hadoop.yarn.api.records.Container;
|
||||||
import org.apache.hadoop.yarn.api.records.ContainerId;
|
import org.apache.hadoop.yarn.api.records.ContainerId;
|
||||||
import org.apache.hadoop.yarn.api.records.ContainerStatus;
|
import org.apache.hadoop.yarn.api.records.ContainerStatus;
|
||||||
|
@ -56,17 +59,20 @@ import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||||
import org.apache.hadoop.yarn.exceptions.InvalidApplicationMasterRequestException;
|
import org.apache.hadoop.yarn.exceptions.InvalidApplicationMasterRequestException;
|
||||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||||
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
|
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
|
||||||
|
import org.apache.hadoop.yarn.security.AMRMTokenIdentifier;
|
||||||
import org.apache.hadoop.yarn.server.federation.failover.FederationProxyProviderUtil;
|
import org.apache.hadoop.yarn.server.federation.failover.FederationProxyProviderUtil;
|
||||||
import org.apache.hadoop.yarn.server.federation.policies.FederationPolicyUtils;
|
import org.apache.hadoop.yarn.server.federation.policies.FederationPolicyUtils;
|
||||||
import org.apache.hadoop.yarn.server.federation.policies.amrmproxy.FederationAMRMProxyPolicy;
|
import org.apache.hadoop.yarn.server.federation.policies.amrmproxy.FederationAMRMProxyPolicy;
|
||||||
import org.apache.hadoop.yarn.server.federation.policies.exceptions.FederationPolicyInitializationException;
|
import org.apache.hadoop.yarn.server.federation.policies.exceptions.FederationPolicyInitializationException;
|
||||||
import org.apache.hadoop.yarn.server.federation.resolver.SubClusterResolver;
|
import org.apache.hadoop.yarn.server.federation.resolver.SubClusterResolver;
|
||||||
import org.apache.hadoop.yarn.server.federation.store.records.SubClusterId;
|
import org.apache.hadoop.yarn.server.federation.store.records.SubClusterId;
|
||||||
|
import org.apache.hadoop.yarn.server.federation.utils.FederationRegistryClient;
|
||||||
import org.apache.hadoop.yarn.server.federation.utils.FederationStateStoreFacade;
|
import org.apache.hadoop.yarn.server.federation.utils.FederationStateStoreFacade;
|
||||||
import org.apache.hadoop.yarn.server.uam.UnmanagedAMPoolManager;
|
import org.apache.hadoop.yarn.server.uam.UnmanagedAMPoolManager;
|
||||||
import org.apache.hadoop.yarn.server.utils.AMRMClientUtils;
|
import org.apache.hadoop.yarn.server.utils.AMRMClientUtils;
|
||||||
import org.apache.hadoop.yarn.server.utils.YarnServerSecurityUtils;
|
import org.apache.hadoop.yarn.server.utils.YarnServerSecurityUtils;
|
||||||
import org.apache.hadoop.yarn.util.AsyncCallback;
|
import org.apache.hadoop.yarn.util.AsyncCallback;
|
||||||
|
import org.apache.hadoop.yarn.util.ConverterUtils;
|
||||||
import org.apache.hadoop.yarn.util.resource.Resources;
|
import org.apache.hadoop.yarn.util.resource.Resources;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
@ -145,6 +151,8 @@ public class FederationInterceptor extends AbstractRequestInterceptor {
|
||||||
*/
|
*/
|
||||||
private UserGroupInformation appOwner;
|
private UserGroupInformation appOwner;
|
||||||
|
|
||||||
|
private FederationRegistryClient registryClient;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates an instance of the FederationInterceptor class.
|
* Creates an instance of the FederationInterceptor class.
|
||||||
*/
|
*/
|
||||||
|
@ -179,6 +187,10 @@ public class FederationInterceptor extends AbstractRequestInterceptor {
|
||||||
} catch (Exception ex) {
|
} catch (Exception ex) {
|
||||||
throw new YarnRuntimeException(ex);
|
throw new YarnRuntimeException(ex);
|
||||||
}
|
}
|
||||||
|
// Add all app tokens for Yarn Registry access
|
||||||
|
if (this.registryClient != null && appContext.getCredentials() != null) {
|
||||||
|
this.appOwner.addCredentials(appContext.getCredentials());
|
||||||
|
}
|
||||||
|
|
||||||
this.homeSubClusterId =
|
this.homeSubClusterId =
|
||||||
SubClusterId.newInstance(YarnConfiguration.getClusterId(conf));
|
SubClusterId.newInstance(YarnConfiguration.getClusterId(conf));
|
||||||
|
@ -192,6 +204,11 @@ public class FederationInterceptor extends AbstractRequestInterceptor {
|
||||||
|
|
||||||
this.uamPool.init(conf);
|
this.uamPool.init(conf);
|
||||||
this.uamPool.start();
|
this.uamPool.start();
|
||||||
|
|
||||||
|
if (appContext.getRegistryClient() != null) {
|
||||||
|
this.registryClient = new FederationRegistryClient(conf,
|
||||||
|
appContext.getRegistryClient(), this.appOwner);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -250,20 +267,27 @@ public class FederationInterceptor extends AbstractRequestInterceptor {
|
||||||
*/
|
*/
|
||||||
this.amRegistrationResponse =
|
this.amRegistrationResponse =
|
||||||
this.homeRM.registerApplicationMaster(request);
|
this.homeRM.registerApplicationMaster(request);
|
||||||
|
if (this.amRegistrationResponse
|
||||||
|
.getContainersFromPreviousAttempts() != null) {
|
||||||
|
cacheAllocatedContainers(
|
||||||
|
this.amRegistrationResponse.getContainersFromPreviousAttempts(),
|
||||||
|
this.homeSubClusterId);
|
||||||
|
}
|
||||||
|
|
||||||
|
ApplicationId appId =
|
||||||
|
getApplicationContext().getApplicationAttemptId().getApplicationId();
|
||||||
|
reAttachUAMAndMergeRegisterResponse(this.amRegistrationResponse, appId);
|
||||||
|
|
||||||
// the queue this application belongs will be used for getting
|
// the queue this application belongs will be used for getting
|
||||||
// AMRMProxy policy from state store.
|
// AMRMProxy policy from state store.
|
||||||
String queue = this.amRegistrationResponse.getQueue();
|
String queue = this.amRegistrationResponse.getQueue();
|
||||||
if (queue == null) {
|
if (queue == null) {
|
||||||
LOG.warn("Received null queue for application "
|
LOG.warn("Received null queue for application " + appId
|
||||||
+ getApplicationContext().getApplicationAttemptId().getApplicationId()
|
+ " from home subcluster. Will use default queue name "
|
||||||
+ " from home sub-cluster. Will use default queue name "
|
|
||||||
+ YarnConfiguration.DEFAULT_QUEUE_NAME
|
+ YarnConfiguration.DEFAULT_QUEUE_NAME
|
||||||
+ " for getting AMRMProxyPolicy");
|
+ " for getting AMRMProxyPolicy");
|
||||||
} else {
|
} else {
|
||||||
LOG.info("Application "
|
LOG.info("Application " + appId + " belongs to queue " + queue);
|
||||||
+ getApplicationContext().getApplicationAttemptId().getApplicationId()
|
|
||||||
+ " belongs to queue " + queue);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Initialize the AMRMProxyPolicy
|
// Initialize the AMRMProxyPolicy
|
||||||
|
@ -304,7 +328,7 @@ public class FederationInterceptor extends AbstractRequestInterceptor {
|
||||||
AllocateResponse homeResponse = AMRMClientUtils.allocateWithReRegister(
|
AllocateResponse homeResponse = AMRMClientUtils.allocateWithReRegister(
|
||||||
requests.get(this.homeSubClusterId), this.homeRM,
|
requests.get(this.homeSubClusterId), this.homeRM,
|
||||||
this.amRegistrationRequest,
|
this.amRegistrationRequest,
|
||||||
getApplicationContext().getApplicationAttemptId());
|
getApplicationContext().getApplicationAttemptId().getApplicationId());
|
||||||
|
|
||||||
// Notify policy of home response
|
// Notify policy of home response
|
||||||
try {
|
try {
|
||||||
|
@ -393,8 +417,8 @@ public class FederationInterceptor extends AbstractRequestInterceptor {
|
||||||
// request to the home resource manager on this thread.
|
// request to the home resource manager on this thread.
|
||||||
FinishApplicationMasterResponse homeResponse =
|
FinishApplicationMasterResponse homeResponse =
|
||||||
AMRMClientUtils.finishAMWithReRegister(request, this.homeRM,
|
AMRMClientUtils.finishAMWithReRegister(request, this.homeRM,
|
||||||
this.amRegistrationRequest,
|
this.amRegistrationRequest, getApplicationContext()
|
||||||
getApplicationContext().getApplicationAttemptId());
|
.getApplicationAttemptId().getApplicationId());
|
||||||
|
|
||||||
if (subClusterIds.size() > 0) {
|
if (subClusterIds.size() > 0) {
|
||||||
// Wait for other sub-cluster resource managers to return the
|
// Wait for other sub-cluster resource managers to return the
|
||||||
|
@ -425,6 +449,14 @@ public class FederationInterceptor extends AbstractRequestInterceptor {
|
||||||
|
|
||||||
if (failedToUnRegister) {
|
if (failedToUnRegister) {
|
||||||
homeResponse.setIsUnregistered(false);
|
homeResponse.setIsUnregistered(false);
|
||||||
|
} else {
|
||||||
|
// Clean up UAMs only when the app finishes successfully, so that no more
|
||||||
|
// attempt will be launched.
|
||||||
|
this.uamPool.stop();
|
||||||
|
if (this.registryClient != null) {
|
||||||
|
this.registryClient.removeAppFromRegistry(getApplicationContext()
|
||||||
|
.getApplicationAttemptId().getApplicationId());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return homeResponse;
|
return homeResponse;
|
||||||
}
|
}
|
||||||
|
@ -442,9 +474,8 @@ public class FederationInterceptor extends AbstractRequestInterceptor {
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public void shutdown() {
|
public void shutdown() {
|
||||||
if (this.uamPool != null) {
|
// Do not stop uamPool service and kill UAMs here because of possible second
|
||||||
this.uamPool.stop();
|
// app attempt
|
||||||
}
|
|
||||||
if (threadpool != null) {
|
if (threadpool != null) {
|
||||||
try {
|
try {
|
||||||
threadpool.shutdown();
|
threadpool.shutdown();
|
||||||
|
@ -455,6 +486,16 @@ public class FederationInterceptor extends AbstractRequestInterceptor {
|
||||||
super.shutdown();
|
super.shutdown();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Only for unit test cleanup.
|
||||||
|
*/
|
||||||
|
@VisibleForTesting
|
||||||
|
protected void cleanupRegistry() {
|
||||||
|
if (this.registryClient != null) {
|
||||||
|
this.registryClient.cleanAllApplications();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create the UAM pool manager for secondary sub-clusters. For unit test to
|
* Create the UAM pool manager for secondary sub-clusters. For unit test to
|
||||||
* override.
|
* override.
|
||||||
|
@ -486,6 +527,120 @@ public class FederationInterceptor extends AbstractRequestInterceptor {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void mergeRegisterResponse(
|
||||||
|
RegisterApplicationMasterResponse homeResponse,
|
||||||
|
RegisterApplicationMasterResponse otherResponse) {
|
||||||
|
|
||||||
|
if (!isNullOrEmpty(otherResponse.getContainersFromPreviousAttempts())) {
|
||||||
|
if (!isNullOrEmpty(homeResponse.getContainersFromPreviousAttempts())) {
|
||||||
|
homeResponse.getContainersFromPreviousAttempts()
|
||||||
|
.addAll(otherResponse.getContainersFromPreviousAttempts());
|
||||||
|
} else {
|
||||||
|
homeResponse.setContainersFromPreviousAttempts(
|
||||||
|
otherResponse.getContainersFromPreviousAttempts());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!isNullOrEmpty(otherResponse.getNMTokensFromPreviousAttempts())) {
|
||||||
|
if (!isNullOrEmpty(homeResponse.getNMTokensFromPreviousAttempts())) {
|
||||||
|
homeResponse.getNMTokensFromPreviousAttempts()
|
||||||
|
.addAll(otherResponse.getNMTokensFromPreviousAttempts());
|
||||||
|
} else {
|
||||||
|
homeResponse.setNMTokensFromPreviousAttempts(
|
||||||
|
otherResponse.getNMTokensFromPreviousAttempts());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Try re-attach to all existing and running UAMs in secondary sub-clusters
|
||||||
|
* launched by previous application attempts if any. All running containers in
|
||||||
|
* the UAMs will be combined into the registerResponse. For the first attempt,
|
||||||
|
* the registry will be empty for this application and thus no-op here.
|
||||||
|
*/
|
||||||
|
protected void reAttachUAMAndMergeRegisterResponse(
|
||||||
|
RegisterApplicationMasterResponse homeResponse,
|
||||||
|
final ApplicationId appId) {
|
||||||
|
|
||||||
|
if (this.registryClient == null) {
|
||||||
|
// Both AMRMProxy HA and NM work preserving restart is not enabled
|
||||||
|
LOG.warn("registryClient is null, skip attaching existing UAM if any");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Load existing running UAMs from the previous attempts from
|
||||||
|
// registry, if any
|
||||||
|
Map<String, Token<AMRMTokenIdentifier>> uamMap =
|
||||||
|
this.registryClient.loadStateFromRegistry(appId);
|
||||||
|
if (uamMap.size() == 0) {
|
||||||
|
LOG.info("No existing UAM for application {} found in Yarn Registry",
|
||||||
|
appId);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
LOG.info("Found {} existing UAMs for application {} in Yarn Registry. "
|
||||||
|
+ "Reattaching in parallel", uamMap.size(), appId);
|
||||||
|
|
||||||
|
ExecutorCompletionService<RegisterApplicationMasterResponse>
|
||||||
|
completionService = new ExecutorCompletionService<>(threadpool);
|
||||||
|
|
||||||
|
for (Entry<String, Token<AMRMTokenIdentifier>> entry : uamMap.entrySet()) {
|
||||||
|
final SubClusterId subClusterId =
|
||||||
|
SubClusterId.newInstance(entry.getKey());
|
||||||
|
final Token<AMRMTokenIdentifier> amrmToken = entry.getValue();
|
||||||
|
|
||||||
|
completionService
|
||||||
|
.submit(new Callable<RegisterApplicationMasterResponse>() {
|
||||||
|
@Override
|
||||||
|
public RegisterApplicationMasterResponse call() throws Exception {
|
||||||
|
RegisterApplicationMasterResponse response = null;
|
||||||
|
try {
|
||||||
|
// Create a config loaded with federation on and subclusterId
|
||||||
|
// for each UAM
|
||||||
|
YarnConfiguration config = new YarnConfiguration(getConf());
|
||||||
|
FederationProxyProviderUtil.updateConfForFederation(config,
|
||||||
|
subClusterId.getId());
|
||||||
|
|
||||||
|
uamPool.reAttachUAM(subClusterId.getId(), config, appId,
|
||||||
|
amRegistrationResponse.getQueue(),
|
||||||
|
getApplicationContext().getUser(), homeSubClusterId.getId(),
|
||||||
|
amrmToken);
|
||||||
|
|
||||||
|
response = uamPool.registerApplicationMaster(
|
||||||
|
subClusterId.getId(), amRegistrationRequest);
|
||||||
|
|
||||||
|
if (response != null
|
||||||
|
&& response.getContainersFromPreviousAttempts() != null) {
|
||||||
|
cacheAllocatedContainers(
|
||||||
|
response.getContainersFromPreviousAttempts(),
|
||||||
|
subClusterId);
|
||||||
|
}
|
||||||
|
LOG.info("UAM {} reattached for {}", subClusterId, appId);
|
||||||
|
} catch (Throwable e) {
|
||||||
|
LOG.error(
|
||||||
|
"Reattaching UAM " + subClusterId + " failed for " + appId,
|
||||||
|
e);
|
||||||
|
}
|
||||||
|
return response;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// Wait for the re-attach responses
|
||||||
|
for (int i = 0; i < uamMap.size(); i++) {
|
||||||
|
try {
|
||||||
|
Future<RegisterApplicationMasterResponse> future =
|
||||||
|
completionService.take();
|
||||||
|
RegisterApplicationMasterResponse registerResponse = future.get();
|
||||||
|
if (registerResponse != null) {
|
||||||
|
LOG.info("Merging register response for {}", appId);
|
||||||
|
mergeRegisterResponse(homeResponse, registerResponse);
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
LOG.warn("Reattaching UAM failed for ApplicationId: " + appId, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private SubClusterId getSubClusterForNode(String nodeName) {
|
private SubClusterId getSubClusterForNode(String nodeName) {
|
||||||
SubClusterId subClusterId = null;
|
SubClusterId subClusterId = null;
|
||||||
try {
|
try {
|
||||||
|
@ -655,6 +810,20 @@ public class FederationInterceptor extends AbstractRequestInterceptor {
|
||||||
responses.add(response);
|
responses.add(response);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Save the new AMRMToken for the UAM in registry if present
|
||||||
|
if (response.getAMRMToken() != null) {
|
||||||
|
Token<AMRMTokenIdentifier> newToken = ConverterUtils
|
||||||
|
.convertFromYarn(response.getAMRMToken(), (Text) null);
|
||||||
|
// Update the token in registry
|
||||||
|
if (registryClient != null) {
|
||||||
|
registryClient
|
||||||
|
.writeAMRMTokenForUAM(
|
||||||
|
getApplicationContext().getApplicationAttemptId()
|
||||||
|
.getApplicationId(),
|
||||||
|
subClusterId.getId(), newToken);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Notify policy of secondary sub-cluster responses
|
// Notify policy of secondary sub-cluster responses
|
||||||
try {
|
try {
|
||||||
policyInterpreter.notifyOfResponse(subClusterId, response);
|
policyInterpreter.notifyOfResponse(subClusterId, response);
|
||||||
|
@ -714,20 +883,23 @@ public class FederationInterceptor extends AbstractRequestInterceptor {
|
||||||
subClusterId);
|
subClusterId);
|
||||||
|
|
||||||
RegisterApplicationMasterResponse uamResponse = null;
|
RegisterApplicationMasterResponse uamResponse = null;
|
||||||
|
Token<AMRMTokenIdentifier> token = null;
|
||||||
try {
|
try {
|
||||||
// For appNameSuffix, use subClusterId of the home sub-cluster
|
// For appNameSuffix, use subClusterId of the home sub-cluster
|
||||||
uamResponse = uamPool.createAndRegisterNewUAM(subClusterId,
|
token = uamPool.launchUAM(subClusterId, config,
|
||||||
registerRequest, config,
|
|
||||||
appContext.getApplicationAttemptId().getApplicationId(),
|
appContext.getApplicationAttemptId().getApplicationId(),
|
||||||
amRegistrationResponse.getQueue(), appContext.getUser(),
|
amRegistrationResponse.getQueue(), appContext.getUser(),
|
||||||
homeSubClusterId.toString());
|
homeSubClusterId.toString(), registryClient != null);
|
||||||
|
|
||||||
|
uamResponse = uamPool.registerApplicationMaster(subClusterId,
|
||||||
|
registerRequest);
|
||||||
} catch (Throwable e) {
|
} catch (Throwable e) {
|
||||||
LOG.error("Failed to register application master: "
|
LOG.error("Failed to register application master: "
|
||||||
+ subClusterId + " Application: "
|
+ subClusterId + " Application: "
|
||||||
+ appContext.getApplicationAttemptId(), e);
|
+ appContext.getApplicationAttemptId(), e);
|
||||||
}
|
}
|
||||||
return new RegisterApplicationMasterResponseInfo(uamResponse,
|
return new RegisterApplicationMasterResponseInfo(uamResponse,
|
||||||
SubClusterId.newInstance(subClusterId));
|
SubClusterId.newInstance(subClusterId), token);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
@ -752,6 +924,14 @@ public class FederationInterceptor extends AbstractRequestInterceptor {
|
||||||
+ getApplicationContext().getApplicationAttemptId());
|
+ getApplicationContext().getApplicationAttemptId());
|
||||||
successfulRegistrations.put(uamResponse.getSubClusterId(),
|
successfulRegistrations.put(uamResponse.getSubClusterId(),
|
||||||
uamResponse.getResponse());
|
uamResponse.getResponse());
|
||||||
|
|
||||||
|
if (registryClient != null) {
|
||||||
|
registryClient.writeAMRMTokenForUAM(
|
||||||
|
getApplicationContext().getApplicationAttemptId()
|
||||||
|
.getApplicationId(),
|
||||||
|
uamResponse.getSubClusterId().getId(),
|
||||||
|
uamResponse.getUamToken());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
LOG.warn("Failed to register unmanaged application master: "
|
LOG.warn("Failed to register unmanaged application master: "
|
||||||
|
@ -1087,11 +1267,14 @@ public class FederationInterceptor extends AbstractRequestInterceptor {
|
||||||
private static class RegisterApplicationMasterResponseInfo {
|
private static class RegisterApplicationMasterResponseInfo {
|
||||||
private RegisterApplicationMasterResponse response;
|
private RegisterApplicationMasterResponse response;
|
||||||
private SubClusterId subClusterId;
|
private SubClusterId subClusterId;
|
||||||
|
private Token<AMRMTokenIdentifier> uamToken;
|
||||||
|
|
||||||
RegisterApplicationMasterResponseInfo(
|
RegisterApplicationMasterResponseInfo(
|
||||||
RegisterApplicationMasterResponse response, SubClusterId subClusterId) {
|
RegisterApplicationMasterResponse response, SubClusterId subClusterId,
|
||||||
|
Token<AMRMTokenIdentifier> uamToken) {
|
||||||
this.response = response;
|
this.response = response;
|
||||||
this.subClusterId = subClusterId;
|
this.subClusterId = subClusterId;
|
||||||
|
this.uamToken = uamToken;
|
||||||
}
|
}
|
||||||
|
|
||||||
public RegisterApplicationMasterResponse getResponse() {
|
public RegisterApplicationMasterResponse getResponse() {
|
||||||
|
@ -1101,6 +1284,10 @@ public class FederationInterceptor extends AbstractRequestInterceptor {
|
||||||
public SubClusterId getSubClusterId() {
|
public SubClusterId getSubClusterId() {
|
||||||
return subClusterId;
|
return subClusterId;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Token<AMRMTokenIdentifier> getUamToken() {
|
||||||
|
return uamToken;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -353,10 +353,6 @@ public class ContainerManagerImpl extends CompositeService implements
|
||||||
rsrcLocalizationSrvc.recoverLocalizedResources(
|
rsrcLocalizationSrvc.recoverLocalizedResources(
|
||||||
stateStore.loadLocalizationState());
|
stateStore.loadLocalizationState());
|
||||||
|
|
||||||
if (this.amrmProxyEnabled) {
|
|
||||||
this.getAMRMProxyService().recover();
|
|
||||||
}
|
|
||||||
|
|
||||||
RecoveredApplicationsState appsState = stateStore.loadApplicationsState();
|
RecoveredApplicationsState appsState = stateStore.loadApplicationsState();
|
||||||
for (ContainerManagerApplicationProto proto :
|
for (ContainerManagerApplicationProto proto :
|
||||||
appsState.getApplications()) {
|
appsState.getApplications()) {
|
||||||
|
@ -373,6 +369,11 @@ public class ContainerManagerImpl extends CompositeService implements
|
||||||
recoverContainer(rcs);
|
recoverContainer(rcs);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Recover AMRMProxy state after apps and containers are recovered
|
||||||
|
if (this.amrmProxyEnabled) {
|
||||||
|
this.getAMRMProxyService().recover();
|
||||||
|
}
|
||||||
|
|
||||||
//Dispatching the RECOVERY_COMPLETED event through the dispatcher
|
//Dispatching the RECOVERY_COMPLETED event through the dispatcher
|
||||||
//so that all the paused, scheduled and queued containers will
|
//so that all the paused, scheduled and queued containers will
|
||||||
//be scheduled for execution on availability of resources.
|
//be scheduled for execution on availability of resources.
|
||||||
|
|
|
@ -51,6 +51,7 @@ import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterRespo
|
||||||
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
|
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
|
||||||
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
||||||
import org.apache.hadoop.yarn.api.records.ContainerId;
|
import org.apache.hadoop.yarn.api.records.ContainerId;
|
||||||
|
import org.apache.hadoop.yarn.api.records.ContainerStatus;
|
||||||
import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
|
import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
|
||||||
import org.apache.hadoop.yarn.api.records.NodeId;
|
import org.apache.hadoop.yarn.api.records.NodeId;
|
||||||
import org.apache.hadoop.yarn.api.records.Priority;
|
import org.apache.hadoop.yarn.api.records.Priority;
|
||||||
|
@ -179,6 +180,15 @@ public abstract class BaseAMRMProxyTest {
|
||||||
return new NMContext(null, null, null, null, stateStore, false, this.conf);
|
return new NMContext(null, null, null, null, stateStore, false, this.conf);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected List<ContainerId> getCompletedContainerIds(
|
||||||
|
List<ContainerStatus> containerStatus) {
|
||||||
|
List<ContainerId> ret = new ArrayList<>();
|
||||||
|
for (ContainerStatus status : containerStatus) {
|
||||||
|
ret.add(status.getContainerId());
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This helper method will invoke the specified function in parallel for each
|
* This helper method will invoke the specified function in parallel for each
|
||||||
* end point in the specified list using a thread pool and return the
|
* end point in the specified list using a thread pool and return the
|
||||||
|
@ -623,7 +633,7 @@ public abstract class BaseAMRMProxyTest {
|
||||||
*/
|
*/
|
||||||
public void initApp(ApplicationAttemptId applicationId, String user) {
|
public void initApp(ApplicationAttemptId applicationId, String user) {
|
||||||
super.initializePipeline(applicationId, user,
|
super.initializePipeline(applicationId, user,
|
||||||
new Token<AMRMTokenIdentifier>(), null, null, false);
|
new Token<AMRMTokenIdentifier>(), null, null, false, null);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void stopApp(ApplicationId applicationId) {
|
public void stopApp(ApplicationId applicationId) {
|
||||||
|
|
|
@ -444,7 +444,7 @@ public class TestAMRMProxyService extends BaseAMRMProxyTest {
|
||||||
|
|
||||||
applicationAttemptId = ApplicationAttemptId.newInstance(appId, 2);
|
applicationAttemptId = ApplicationAttemptId.newInstance(appId, 2);
|
||||||
getAMRMProxyService().initializePipeline(applicationAttemptId, user,
|
getAMRMProxyService().initializePipeline(applicationAttemptId, user,
|
||||||
new Token<AMRMTokenIdentifier>(), null, null, false);
|
new Token<AMRMTokenIdentifier>(), null, null, false, null);
|
||||||
|
|
||||||
RequestInterceptorChainWrapper chain2 =
|
RequestInterceptorChainWrapper chain2 =
|
||||||
getAMRMProxyService().getPipelines().get(appId);
|
getAMRMProxyService().getPipelines().get(appId);
|
||||||
|
@ -531,16 +531,14 @@ public class TestAMRMProxyService extends BaseAMRMProxyTest {
|
||||||
"new AMRMToken from RM should have been nulled by AMRMProxyService",
|
"new AMRMToken from RM should have been nulled by AMRMProxyService",
|
||||||
allocateResponse.getAMRMToken());
|
allocateResponse.getAMRMToken());
|
||||||
|
|
||||||
// The way the mock resource manager is setup, it will return the containers
|
// We need to make sure all the resource managers received the
|
||||||
// that were released in the response. This is done because the UAMs run
|
// release list. The containers sent by the mock resource managers will be
|
||||||
// asynchronously and we need to if all the resource managers received the
|
|
||||||
// release it. The containers sent by the mock resource managers will be
|
|
||||||
// aggregated and returned back to us and we can assert if all the release
|
// aggregated and returned back to us and we can assert if all the release
|
||||||
// lists reached the sub-clusters
|
// lists reached the sub-clusters
|
||||||
List<Container> containersForReleasedContainerIds =
|
List<ContainerId> containersForReleasedContainerIds = new ArrayList<>();
|
||||||
new ArrayList<Container>();
|
List<ContainerId> newlyFinished = getCompletedContainerIds(
|
||||||
containersForReleasedContainerIds.addAll(allocateResponse
|
allocateResponse.getCompletedContainersStatuses());
|
||||||
.getAllocatedContainers());
|
containersForReleasedContainerIds.addAll(newlyFinished);
|
||||||
|
|
||||||
// Send max 10 heart beats to receive all the containers. If not, we will
|
// Send max 10 heart beats to receive all the containers. If not, we will
|
||||||
// fail the test
|
// fail the test
|
||||||
|
@ -554,8 +552,9 @@ public class TestAMRMProxyService extends BaseAMRMProxyTest {
|
||||||
"new AMRMToken from RM should have been nulled by AMRMProxyService",
|
"new AMRMToken from RM should have been nulled by AMRMProxyService",
|
||||||
allocateResponse.getAMRMToken());
|
allocateResponse.getAMRMToken());
|
||||||
|
|
||||||
containersForReleasedContainerIds.addAll(allocateResponse
|
newlyFinished = getCompletedContainerIds(
|
||||||
.getAllocatedContainers());
|
allocateResponse.getCompletedContainersStatuses());
|
||||||
|
containersForReleasedContainerIds.addAll(newlyFinished);
|
||||||
|
|
||||||
LOG.info("Number of containers received in this request: "
|
LOG.info("Number of containers received in this request: "
|
||||||
+ Integer.toString(allocateResponse.getAllocatedContainers()
|
+ Integer.toString(allocateResponse.getAllocatedContainers()
|
||||||
|
|
|
@ -19,16 +19,20 @@
|
||||||
package org.apache.hadoop.yarn.server.nodemanager.amrmproxy;
|
package org.apache.hadoop.yarn.server.nodemanager.amrmproxy;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.security.PrivilegedExceptionAction;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.concurrent.Callable;
|
import java.util.concurrent.Callable;
|
||||||
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
import java.util.concurrent.ExecutionException;
|
import java.util.concurrent.ExecutionException;
|
||||||
import java.util.concurrent.ExecutorCompletionService;
|
import java.util.concurrent.ExecutorCompletionService;
|
||||||
import java.util.concurrent.ExecutorService;
|
import java.util.concurrent.ExecutorService;
|
||||||
import java.util.concurrent.Executors;
|
import java.util.concurrent.Executors;
|
||||||
|
|
||||||
|
import org.apache.hadoop.registry.client.api.RegistryOperations;
|
||||||
|
import org.apache.hadoop.registry.client.impl.FSRegistryOperationsService;
|
||||||
import org.apache.hadoop.yarn.api.protocolrecords.AllocateRequest;
|
import org.apache.hadoop.yarn.api.protocolrecords.AllocateRequest;
|
||||||
import org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse;
|
import org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse;
|
||||||
import org.apache.hadoop.yarn.api.protocolrecords.FinishApplicationMasterRequest;
|
import org.apache.hadoop.yarn.api.protocolrecords.FinishApplicationMasterRequest;
|
||||||
|
@ -59,6 +63,10 @@ import org.apache.hadoop.yarn.server.federation.store.records.SubClusterInfo;
|
||||||
import org.apache.hadoop.yarn.server.federation.store.records.SubClusterRegisterRequest;
|
import org.apache.hadoop.yarn.server.federation.store.records.SubClusterRegisterRequest;
|
||||||
import org.apache.hadoop.yarn.server.federation.store.records.SubClusterState;
|
import org.apache.hadoop.yarn.server.federation.store.records.SubClusterState;
|
||||||
import org.apache.hadoop.yarn.server.federation.utils.FederationStateStoreFacade;
|
import org.apache.hadoop.yarn.server.federation.utils.FederationStateStoreFacade;
|
||||||
|
import org.apache.hadoop.yarn.server.nodemanager.Context;
|
||||||
|
import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext;
|
||||||
|
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMMemoryStateStoreService;
|
||||||
|
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService;
|
||||||
import org.apache.hadoop.yarn.util.Records;
|
import org.apache.hadoop.yarn.util.Records;
|
||||||
import org.junit.Assert;
|
import org.junit.Assert;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
@ -79,7 +87,10 @@ public class TestFederationInterceptor extends BaseAMRMProxyTest {
|
||||||
|
|
||||||
private TestableFederationInterceptor interceptor;
|
private TestableFederationInterceptor interceptor;
|
||||||
private MemoryFederationStateStore stateStore;
|
private MemoryFederationStateStore stateStore;
|
||||||
|
private NMStateStoreService nmStateStore;
|
||||||
|
private RegistryOperations registry;
|
||||||
|
|
||||||
|
private Context nmContext;
|
||||||
private int testAppId;
|
private int testAppId;
|
||||||
private ApplicationAttemptId attemptId;
|
private ApplicationAttemptId attemptId;
|
||||||
|
|
||||||
|
@ -93,15 +104,28 @@ public class TestFederationInterceptor extends BaseAMRMProxyTest {
|
||||||
FederationStateStoreFacade.getInstance().reinitialize(stateStore,
|
FederationStateStoreFacade.getInstance().reinitialize(stateStore,
|
||||||
getConf());
|
getConf());
|
||||||
|
|
||||||
|
nmStateStore = new NMMemoryStateStoreService();
|
||||||
|
nmStateStore.init(getConf());
|
||||||
|
nmStateStore.start();
|
||||||
|
|
||||||
|
registry = new FSRegistryOperationsService();
|
||||||
|
registry.init(getConf());
|
||||||
|
registry.start();
|
||||||
|
|
||||||
testAppId = 1;
|
testAppId = 1;
|
||||||
attemptId = getApplicationAttemptId(testAppId);
|
attemptId = getApplicationAttemptId(testAppId);
|
||||||
interceptor.init(new AMRMProxyApplicationContextImpl(null, getConf(),
|
nmContext =
|
||||||
attemptId, "test-user", null, null));
|
new NMContext(null, null, null, null, nmStateStore, false, getConf());
|
||||||
|
interceptor.init(new AMRMProxyApplicationContextImpl(nmContext, getConf(),
|
||||||
|
attemptId, "test-user", null, null, null, registry));
|
||||||
|
interceptor.cleanupRegistry();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void tearDown() {
|
public void tearDown() {
|
||||||
|
interceptor.cleanupRegistry();
|
||||||
interceptor.shutdown();
|
interceptor.shutdown();
|
||||||
|
registry.stop();
|
||||||
super.tearDown();
|
super.tearDown();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -207,18 +231,17 @@ public class TestFederationInterceptor extends BaseAMRMProxyTest {
|
||||||
AllocateResponse allocateResponse = interceptor.allocate(allocateRequest);
|
AllocateResponse allocateResponse = interceptor.allocate(allocateRequest);
|
||||||
Assert.assertNotNull(allocateResponse);
|
Assert.assertNotNull(allocateResponse);
|
||||||
|
|
||||||
// The way the mock resource manager is setup, it will return the containers
|
// The release request will be split and handled by the corresponding UAM.
|
||||||
// that were released in the allocated containers. The release request will
|
// The release containers returned by the mock resource managers will be
|
||||||
// be split and handled by the corresponding UAM. The release containers
|
// aggregated and returned back to us and we can check if total request size
|
||||||
// returned by the mock resource managers will be aggregated and returned
|
// and returned size are the same
|
||||||
// back to us and we can check if total request size and returned size are
|
List<ContainerId> containersForReleasedContainerIds =
|
||||||
// the same
|
new ArrayList<ContainerId>();
|
||||||
List<Container> containersForReleasedContainerIds =
|
List<ContainerId> newlyFinished = getCompletedContainerIds(
|
||||||
new ArrayList<Container>();
|
allocateResponse.getCompletedContainersStatuses());
|
||||||
containersForReleasedContainerIds
|
containersForReleasedContainerIds.addAll(newlyFinished);
|
||||||
.addAll(allocateResponse.getAllocatedContainers());
|
|
||||||
LOG.info("Number of containers received in the original request: "
|
LOG.info("Number of containers received in the original request: "
|
||||||
+ Integer.toString(allocateResponse.getAllocatedContainers().size()));
|
+ Integer.toString(newlyFinished.size()));
|
||||||
|
|
||||||
// Send max 10 heart beats to receive all the containers. If not, we will
|
// Send max 10 heart beats to receive all the containers. If not, we will
|
||||||
// fail the test
|
// fail the test
|
||||||
|
@ -228,11 +251,12 @@ public class TestFederationInterceptor extends BaseAMRMProxyTest {
|
||||||
allocateResponse =
|
allocateResponse =
|
||||||
interceptor.allocate(Records.newRecord(AllocateRequest.class));
|
interceptor.allocate(Records.newRecord(AllocateRequest.class));
|
||||||
Assert.assertNotNull(allocateResponse);
|
Assert.assertNotNull(allocateResponse);
|
||||||
containersForReleasedContainerIds
|
newlyFinished = getCompletedContainerIds(
|
||||||
.addAll(allocateResponse.getAllocatedContainers());
|
allocateResponse.getCompletedContainersStatuses());
|
||||||
|
containersForReleasedContainerIds.addAll(newlyFinished);
|
||||||
|
|
||||||
LOG.info("Number of containers received in this request: "
|
LOG.info("Number of containers received in this request: "
|
||||||
+ Integer.toString(allocateResponse.getAllocatedContainers().size()));
|
+ Integer.toString(newlyFinished.size()));
|
||||||
LOG.info("Total number of containers received: "
|
LOG.info("Total number of containers received: "
|
||||||
+ Integer.toString(containersForReleasedContainerIds.size()));
|
+ Integer.toString(containersForReleasedContainerIds.size()));
|
||||||
Thread.sleep(10);
|
Thread.sleep(10);
|
||||||
|
@ -547,4 +571,74 @@ public class TestFederationInterceptor extends BaseAMRMProxyTest {
|
||||||
Assert.assertEquals(1, response.getUpdatedContainers().size());
|
Assert.assertEquals(1, response.getUpdatedContainers().size());
|
||||||
Assert.assertEquals(1, response.getUpdateErrors().size());
|
Assert.assertEquals(1, response.getUpdateErrors().size());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testSecondAttempt() throws Exception {
|
||||||
|
ApplicationUserInfo userInfo = getApplicationUserInfo(testAppId);
|
||||||
|
userInfo.getUser().doAs(new PrivilegedExceptionAction<Object>() {
|
||||||
|
@Override
|
||||||
|
public Object run() throws Exception {
|
||||||
|
// Register the application
|
||||||
|
RegisterApplicationMasterRequest registerReq =
|
||||||
|
Records.newRecord(RegisterApplicationMasterRequest.class);
|
||||||
|
registerReq.setHost(Integer.toString(testAppId));
|
||||||
|
registerReq.setRpcPort(testAppId);
|
||||||
|
registerReq.setTrackingUrl("");
|
||||||
|
|
||||||
|
RegisterApplicationMasterResponse registerResponse =
|
||||||
|
interceptor.registerApplicationMaster(registerReq);
|
||||||
|
Assert.assertNotNull(registerResponse);
|
||||||
|
|
||||||
|
Assert.assertEquals(0, interceptor.getUnmanagedAMPoolSize());
|
||||||
|
|
||||||
|
// Allocate one batch of containers
|
||||||
|
registerSubCluster(SubClusterId.newInstance("SC-1"));
|
||||||
|
registerSubCluster(SubClusterId.newInstance(HOME_SC_ID));
|
||||||
|
|
||||||
|
int numberOfContainers = 3;
|
||||||
|
List<Container> containers =
|
||||||
|
getContainersAndAssert(numberOfContainers, numberOfContainers * 2);
|
||||||
|
for (Container c : containers) {
|
||||||
|
System.out.println(c.getId() + " ha");
|
||||||
|
}
|
||||||
|
Assert.assertEquals(1, interceptor.getUnmanagedAMPoolSize());
|
||||||
|
|
||||||
|
// Preserve the mock RM instances for secondaries
|
||||||
|
ConcurrentHashMap<String, MockResourceManagerFacade> secondaries =
|
||||||
|
interceptor.getSecondaryRMs();
|
||||||
|
|
||||||
|
// Increase the attemptId and create a new intercepter instance for it
|
||||||
|
attemptId = ApplicationAttemptId.newInstance(
|
||||||
|
attemptId.getApplicationId(), attemptId.getAttemptId() + 1);
|
||||||
|
|
||||||
|
interceptor = new TestableFederationInterceptor(null, secondaries);
|
||||||
|
interceptor.init(new AMRMProxyApplicationContextImpl(nmContext,
|
||||||
|
getConf(), attemptId, "test-user", null, null, null, registry));
|
||||||
|
registerResponse = interceptor.registerApplicationMaster(registerReq);
|
||||||
|
|
||||||
|
// Should re-attach secondaries and get the three running containers
|
||||||
|
Assert.assertEquals(1, interceptor.getUnmanagedAMPoolSize());
|
||||||
|
Assert.assertEquals(numberOfContainers,
|
||||||
|
registerResponse.getContainersFromPreviousAttempts().size());
|
||||||
|
|
||||||
|
// Release all containers
|
||||||
|
releaseContainersAndAssert(
|
||||||
|
registerResponse.getContainersFromPreviousAttempts());
|
||||||
|
|
||||||
|
// Finish the application
|
||||||
|
FinishApplicationMasterRequest finishReq =
|
||||||
|
Records.newRecord(FinishApplicationMasterRequest.class);
|
||||||
|
finishReq.setDiagnostics("");
|
||||||
|
finishReq.setTrackingUrl("");
|
||||||
|
finishReq.setFinalApplicationStatus(FinalApplicationStatus.SUCCEEDED);
|
||||||
|
|
||||||
|
FinishApplicationMasterResponse finshResponse =
|
||||||
|
interceptor.finishApplicationMaster(finishReq);
|
||||||
|
Assert.assertNotNull(finshResponse);
|
||||||
|
Assert.assertEquals(true, finshResponse.getIsUnregistered());
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -44,6 +44,15 @@ public class TestableFederationInterceptor extends FederationInterceptor {
|
||||||
private AtomicInteger runningIndex = new AtomicInteger(0);
|
private AtomicInteger runningIndex = new AtomicInteger(0);
|
||||||
private MockResourceManagerFacade mockRm;
|
private MockResourceManagerFacade mockRm;
|
||||||
|
|
||||||
|
/**
 * Creates an interceptor without supplying any mock resource managers.
 */
public TestableFederationInterceptor() {
  super();
}
|
||||||
|
|
||||||
|
/**
 * Creates an interceptor that uses the supplied mock resource managers.
 *
 * @param homeRM value stored as this interceptor's mock RM
 * @param secondaries map stored as this interceptor's secondary RMs
 */
public TestableFederationInterceptor(MockResourceManagerFacade homeRM,
    ConcurrentHashMap<String, MockResourceManagerFacade> secondaries) {
  super();
  this.mockRm = homeRM;
  this.secondaryResourceManagers = secondaries;
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected UnmanagedAMPoolManager createUnmanagedAMPoolManager(
|
protected UnmanagedAMPoolManager createUnmanagedAMPoolManager(
|
||||||
ExecutorService threadPool) {
|
ExecutorService threadPool) {
|
||||||
|
@ -68,7 +77,7 @@ public class TestableFederationInterceptor extends FederationInterceptor {
|
||||||
// We create one instance of the mock resource manager per sub cluster. Keep
|
// We create one instance of the mock resource manager per sub cluster. Keep
|
||||||
// track of the instances of the RMs in the map keyed by the sub cluster id
|
// track of the instances of the RMs in the map keyed by the sub cluster id
|
||||||
synchronized (this.secondaryResourceManagers) {
|
synchronized (this.secondaryResourceManagers) {
|
||||||
if (this.secondaryResourceManagers.contains(subClusterId)) {
|
if (this.secondaryResourceManagers.containsKey(subClusterId)) {
|
||||||
return (T) this.secondaryResourceManagers.get(subClusterId);
|
return (T) this.secondaryResourceManagers.get(subClusterId);
|
||||||
} else {
|
} else {
|
||||||
// The running index here is used to simulate different RM_EPOCH to
|
// The running index here is used to simulate different RM_EPOCH to
|
||||||
|
@ -91,6 +100,15 @@ public class TestableFederationInterceptor extends FederationInterceptor {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected MockResourceManagerFacade getHomeRM() {
|
||||||
|
return mockRm;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected ConcurrentHashMap<String, MockResourceManagerFacade>
|
||||||
|
getSecondaryRMs() {
|
||||||
|
return secondaryResourceManagers;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Extends the UnmanagedAMPoolManager and overrides methods to provide a
|
* Extends the UnmanagedAMPoolManager and overrides methods to provide a
|
||||||
* testable implementation of UnmanagedAMPoolManager.
|
* testable implementation of UnmanagedAMPoolManager.
|
||||||
|
@ -104,9 +122,9 @@ public class TestableFederationInterceptor extends FederationInterceptor {
|
||||||
@Override
|
@Override
|
||||||
public UnmanagedApplicationManager createUAM(Configuration conf,
|
public UnmanagedApplicationManager createUAM(Configuration conf,
|
||||||
ApplicationId appId, String queueName, String submitter,
|
ApplicationId appId, String queueName, String submitter,
|
||||||
String appNameSuffix) {
|
String appNameSuffix, boolean keepContainersAcrossApplicationAttempts) {
|
||||||
return new TestableUnmanagedApplicationManager(conf, appId, queueName,
|
return new TestableUnmanagedApplicationManager(conf, appId, queueName,
|
||||||
submitter, appNameSuffix);
|
submitter, appNameSuffix, keepContainersAcrossApplicationAttempts);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -119,8 +137,9 @@ public class TestableFederationInterceptor extends FederationInterceptor {
|
||||||
|
|
||||||
public TestableUnmanagedApplicationManager(Configuration conf,
|
public TestableUnmanagedApplicationManager(Configuration conf,
|
||||||
ApplicationId appId, String queueName, String submitter,
|
ApplicationId appId, String queueName, String submitter,
|
||||||
String appNameSuffix) {
|
String appNameSuffix, boolean keepContainersAcrossApplicationAttempts) {
|
||||||
super(conf, appId, queueName, submitter, appNameSuffix);
|
super(conf, appId, queueName, submitter, appNameSuffix,
|
||||||
|
keepContainersAcrossApplicationAttempts);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -38,6 +38,7 @@ import org.apache.hadoop.fs.Path;
|
||||||
import org.apache.hadoop.ha.HAServiceProtocol;
|
import org.apache.hadoop.ha.HAServiceProtocol;
|
||||||
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
|
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
|
||||||
import org.apache.hadoop.net.NetUtils;
|
import org.apache.hadoop.net.NetUtils;
|
||||||
|
import org.apache.hadoop.security.Credentials;
|
||||||
import org.apache.hadoop.security.token.Token;
|
import org.apache.hadoop.security.token.Token;
|
||||||
import org.apache.hadoop.net.ServerSocketUtil;
|
import org.apache.hadoop.net.ServerSocketUtil;
|
||||||
import org.apache.hadoop.service.AbstractService;
|
import org.apache.hadoop.service.AbstractService;
|
||||||
|
@ -951,9 +952,10 @@ public class MiniYARNCluster extends CompositeService {
|
||||||
protected void initializePipeline(ApplicationAttemptId applicationAttemptId,
|
protected void initializePipeline(ApplicationAttemptId applicationAttemptId,
|
||||||
String user, Token<AMRMTokenIdentifier> amrmToken,
|
String user, Token<AMRMTokenIdentifier> amrmToken,
|
||||||
Token<AMRMTokenIdentifier> localToken,
|
Token<AMRMTokenIdentifier> localToken,
|
||||||
Map<String, byte[]> recoveredDataMap, boolean isRecovery) {
|
Map<String, byte[]> recoveredDataMap, boolean isRecovery,
|
||||||
|
Credentials credentials) {
|
||||||
super.initializePipeline(applicationAttemptId, user, amrmToken,
|
super.initializePipeline(applicationAttemptId, user, amrmToken,
|
||||||
localToken, recoveredDataMap, isRecovery);
|
localToken, recoveredDataMap, isRecovery, credentials);
|
||||||
RequestInterceptor rt = getPipelines()
|
RequestInterceptor rt = getPipelines()
|
||||||
.get(applicationAttemptId.getApplicationId()).getRootInterceptor();
|
.get(applicationAttemptId.getApplicationId()).getRootInterceptor();
|
||||||
// The DefaultRequestInterceptor will generally be the last
|
// The DefaultRequestInterceptor will generally be the last
|
||||||
|
|
|
@ -141,6 +141,8 @@ The figure shows a sequence diagram for the following job execution flow:
|
||||||
b. The security tokens are also modified by the NM when launching the AM, so that the AM can only talk with the AMRMProxy. Any future communication from AM to the YARN RM is mediated by the AMRMProxy.
|
b. The security tokens are also modified by the NM when launching the AM, so that the AM can only talk with the AMRMProxy. Any future communication from AM to the YARN RM is mediated by the AMRMProxy.
|
||||||
7. The AM will then request containers using the locality information exposed by HDFS.
|
7. The AM will then request containers using the locality information exposed by HDFS.
|
||||||
8. Based on a policy the AMRMProxy can impersonate the AM on other sub-clusters, by submitting an Unmanaged AM, and by forwarding the AM heartbeats to relevant sub-clusters.
|
8. Based on a policy the AMRMProxy can impersonate the AM on other sub-clusters, by submitting an Unmanaged AM, and by forwarding the AM heartbeats to relevant sub-clusters.
|
||||||
|
a. Federation supports multiple application attempts with AMRMProxy HA. AM containers will have a different attempt id in the home sub-cluster, but the same Unmanaged AM in the secondaries will be used across attempts.
|
||||||
|
b. When AMRMProxy HA is enabled, the UAM token will be stored in the YARN Registry. In the registerApplicationMaster call of each application attempt, AMRMProxy will fetch existing UAM tokens from the registry (if any) and re-attach to the existing UAMs.
|
||||||
9. The AMRMProxy will use both locality information and a pluggable policy configured in the state-store to decide whether to forward the resource requests received by the AM to the Home RM or to one (or more) Secondary RMs. In Figure 1, we show the case in which the AMRMProxy decides to forward the request to the secondary RM.
|
9. The AMRMProxy will use both locality information and a pluggable policy configured in the state-store to decide whether to forward the resource requests received by the AM to the Home RM or to one (or more) Secondary RMs. In Figure 1, we show the case in which the AMRMProxy decides to forward the request to the secondary RM.
|
||||||
10. The secondary RM will provide the AMRMProxy with valid container tokens to start a new container on some node in its sub-cluster. This mechanism ensures that each sub-cluster uses its own security tokens and avoids the need for a cluster wide shared secret to create tokens.
|
10. The secondary RM will provide the AMRMProxy with valid container tokens to start a new container on some node in its sub-cluster. This mechanism ensures that each sub-cluster uses its own security tokens and avoids the need for a cluster wide shared secret to create tokens.
|
||||||
11. The AMRMProxy forwards the allocation response back to the AM.
|
11. The AMRMProxy forwards the allocation response back to the AM.
|
||||||
|
@ -262,16 +264,17 @@ These are extra configurations that should appear in the **conf/yarn-site.xml**
|
||||||
|
|
||||||
| Property | Example | Description |
|
| Property | Example | Description |
|
||||||
|:---- |:---- |:---- |
|
|:---- |:---- |:---- |
|
||||||
| `yarn.nodemanager.amrmproxy.enabled` | `true` | Whether or not the AMRMProxy is enabled.
|
| `yarn.nodemanager.amrmproxy.enabled` | `true` | Whether or not the AMRMProxy is enabled. |
|
||||||
|`yarn.nodemanager.amrmproxy.interceptor-class.pipeline` | `org.apache.hadoop.yarn.server.nodemanager.amrmproxy.FederationInterceptor` | A comma-separated list of interceptors to be run at the amrmproxy. For federation the last step in the pipeline should be the FederationInterceptor.
|
| `yarn.nodemanager.amrmproxy.interceptor-class.pipeline` | `org.apache.hadoop.yarn.server.nodemanager.amrmproxy.FederationInterceptor` | A comma-separated list of interceptors to be run at the amrmproxy. For federation the last step in the pipeline should be the FederationInterceptor. |
|
||||||
| `yarn.client.failover-proxy-provider` | `org.apache.hadoop.yarn.server.federation.failover.FederationRMFailoverProxyProvider` | The class used to connect to the RMs by looking up the membership information in federation state-store. This must be set if federation is enabled, even if RM HA is not enabled.|
|
| `yarn.client.failover-proxy-provider` | `org.apache.hadoop.yarn.server.federation.failover.FederationRMFailoverProxyProvider` | The class used to connect to the RMs by looking up the membership information in federation state-store. This must be set if federation is enabled, even if RM HA is not enabled.|
|
||||||
|
|
||||||
Optional:
|
Optional:
|
||||||
|
|
||||||
| Property | Example | Description |
|
| Property | Example | Description |
|
||||||
|:---- |:---- |:---- |
|
|:---- |:---- |:---- |
|
||||||
|`yarn.federation.statestore.max-connections` | `1` | The maximum number of parallel connections from each AMRMProxy to the state-store. This value is typically lower than the router one, since we have many AMRMProxy that could burn-through many DB connections quickly. |
|
| `yarn.nodemanager.amrmproxy.ha.enable` | `true` | Whether or not the AMRMProxy HA is enabled for multiple application attempt support. |
|
||||||
|`yarn.federation.cache-ttl.secs` | `300` | The time to live for the AMRMProxy cache. Typically larger than at the router, as the number of AMRMProxy is large, and we want to limit the load to the centralized state-store. |
|
| `yarn.federation.statestore.max-connections` | `1` | The maximum number of parallel connections from each AMRMProxy to the state-store. This value is typically lower than the router one, since we have many AMRMProxy that could burn-through many DB connections quickly. |
|
||||||
|
| `yarn.federation.cache-ttl.secs` | `300` | The time to live for the AMRMProxy cache. Typically larger than at the router, as the number of AMRMProxy is large, and we want to limit the load to the centralized state-store. |
|
||||||
|
|
||||||
Running a Sample Job
|
Running a Sample Job
|
||||||
--------------------
|
--------------------
|
||||||
|
|
Loading…
Reference in New Issue